Initial Version of HEVC decoder
Compliant with reference software HM11.0 onwards
Bug: 14571712
Change-Id: I8af25c1221cc6ab70440141c4d9b48c1ac69696a
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..4668c52
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,6 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+# decoder
+include $(LOCAL_PATH)/decoder.mk
+
diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_APACHE2
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..e960962
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,14 @@
+Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at:
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
diff --git a/common/arm/ihevc_deblk_chroma_horz.s b/common/arm/ihevc_deblk_chroma_horz.s
new file mode 100644
index 0000000..34422ff
--- /dev/null
+++ b/common/arm/ihevc_deblk_chroma_horz.s
@@ -0,0 +1,148 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@* ihevc_deblk_chroma_horz.s
+@*
+@* @brief
+@* contains function definitions for horizontal deblocking of chroma
+@* block edges. functions are coded in neon assembly and can be
+@* compiled using rvct
+@*
+@* @author
+@* anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
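For reference, a scalar sketch of the per-sample chroma edge filter that the
routine below vectorizes (helper and variable names here are assumed, not
part of the patch):

    /* HEVC chroma deblocking core: one filtered sample pair per call */
    static unsigned char clip_u8(int x)
    {
        return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    static void deblk_chroma_sample(unsigned char *p1, unsigned char *p0,
                                    unsigned char *q0, unsigned char *q1,
                                    int tc)
    {
        /* delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3)) */
        int delta = ((((*q0 - *p0) << 2) + *p1 - *q1 + 4) >> 3);
        if (delta < -tc) delta = -tc;
        if (delta >  tc) delta =  tc;
        *p0 = clip_u8(*p0 + delta);
        *q0 = clip_u8(*q0 - delta);
    }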
+.text
+.align 4
+
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_horz_a9q
+
+gai4_ihevc_qp_table_addr:
+.long gai4_ihevc_qp_table - ulbl1 - 8
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table - ulbl2 - 8
+
+.type ihevc_deblk_chroma_horz_a9q, %function
+
+ihevc_deblk_chroma_horz_a9q:
+ push {r4-r12,lr}
+ sub r12,r0,r1
+ vld1.8 {d0},[r0]
+ sub r5,r12,r1
+ add r6,r0,r1
+ add r1,r2,r3
+ vmovl.u8 q0,d0
+ ldr r10,[sp,#0x28]
+ vld1.8 {d2},[r12]
+ add r2,r1,#1
+ ldr r4,[sp,#0x30]
+ vld1.8 {d4},[r5]
+ ldr r8,[sp,#0x34]
+ vld1.8 {d16},[r6]
+ ldr r9,[sp,#0x38]
+ adds r1,r10,r2,asr #1
+ vmovl.u8 q1,d2
+ ldr r7,[sp,#0x2c]
+ ldr r3,gai4_ihevc_qp_table_addr
+ulbl1:
+ add r3, r3, pc
+ bmi l1.3312
+ cmp r1,#0x39
+ ldrle r1,[r3,r1,lsl #2]
+ subgt r1,r1,#6
+l1.3312:
+ adds r2,r7,r2,asr #1
+ vmovl.u8 q2,d4
+ bmi l1.3332
+ cmp r2,#0x39
+ ldrle r2,[r3,r2,lsl #2]
+ subgt r2,r2,#6
+l1.3332:
+ add r1,r1,r4,lsl #1
+ vsub.i16 q3,q0,q1
+ add r3,r1,#2
+ cmp r3,#0x35
+ movgt r1,#0x35
+ vshl.i16 q3,q3,#2
+ vmovl.u8 q8,d16
+ bgt l1.3368
+ adds r3,r1,#2
+ addpl r1,r1,#2
+ movmi r1,#0
+l1.3368:
+ ldr r3,gai4_ihevc_tc_table_addr
+ulbl2:
+ add r3, r3, pc
+ vadd.i16 q2,q3,q2
+ add r2,r2,r4,lsl #1
+ vsub.i16 q3,q2,q8
+ add r4,r2,#2
+ ldr r1,[r3,r1,lsl #2]
+ cmp r4,#0x35
+ movgt r2,#0x35
+ bgt l1.3412
+ adds r4,r2,#2
+ addpl r2,r2,#2
+ movmi r2,#0
+l1.3412:
+
+
+ ldr r2,[r3,r2,lsl #2]
+ cmp r8,#0
+ vdup.16 q8,r2
+ vdup.16 q2,r1
+ rsb r1,r1,#0
+ vrshr.s16 q3,q3,#3
+ vdup.16 q9,r1
+ rsb r1,r2,#0
+ vzip.16 q2,q8
+ vdup.16 q10,r1
+
+ vzip.16 q9,q10
+
+ vmin.s16 q8,q3,q2
+ vmax.s16 q2,q9,q8
+ vadd.i16 q1,q1,q2
+ vsub.i16 q0,q0,q2
+ vqmovun.s16 d2,q1
+ vqmovun.s16 d0,q0
+ beq l1.3528
+ vst1.8 {d2},[r12]
+l1.3528:
+ cmp r9,#0
+ beq l1.3540
+ vst1.8 {d0},[r0]
+l1.3540:
+ pop {r4-r12,pc}
+
+
diff --git a/common/arm/ihevc_deblk_chroma_vert.s b/common/arm/ihevc_deblk_chroma_vert.s
new file mode 100644
index 0000000..4cb305f
--- /dev/null
+++ b/common/arm/ihevc_deblk_chroma_vert.s
@@ -0,0 +1,163 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@* ihevc_deblk_chroma_vert.s
+@*
+@* @brief
+@* contains function definitions for vertical deblocking of chroma
+@* block edges. functions are coded in neon assembly and can be
+@* compiled using rvct
+@*
+@* @author
+@* anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
+.text
+.align 4
+
+
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_vert_a9q
+
+gai4_ihevc_qp_table_addr:
+.long gai4_ihevc_qp_table - ulbl1 - 8
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table - ulbl2 - 8
+
+.type ihevc_deblk_chroma_vert_a9q, %function
+
+ihevc_deblk_chroma_vert_a9q:
+ push {r4-r12,lr}
+ sub r8,r0,#4
+ add r2,r2,r3
+ vld1.8 {d5},[r8],r1
+ add r2,r2,#1
+ vld1.8 {d17},[r8],r1
+ ldr r7,[sp,#0x28]
+ vld1.8 {d16},[r8],r1
+ ldr r4,[sp,#0x38]
+ vld1.8 {d4},[r8]
+ ldr r5,[sp,#0x30]
+ vtrn.8 d5,d17
+ adds r3,r7,r2,asr #1
+ vtrn.8 d16,d4
+ ldr r7,gai4_ihevc_qp_table_addr
+ulbl1:
+ add r7,r7,pc
+ ldr r12,[sp,#0x34]
+ ldr r6,[sp,#0x2c]
+ bmi l1.2944
+ cmp r3,#0x39
+ ldrle r3,[r7,r3,lsl #2]
+ subgt r3,r3,#6
+l1.2944:
+ vtrn.16 d5,d16
+ adds r2,r6,r2,asr #1
+ vtrn.16 d17,d4
+ bmi l1.2964
+ cmp r2,#0x39
+ ldrle r2,[r7,r2,lsl #2]
+ subgt r2,r2,#6
+l1.2964:
+ vtrn.32 d5,d17
+ add r3,r3,r5,lsl #1
+ vtrn.32 d16,d4
+ add r6,r3,#2
+ vmovl.u8 q9,d17
+ cmp r6,#0x35
+ movgt r3,#0x35
+ bgt l1.2996
+ adds r6,r3,#2
+ addpl r3,r3,#2
+ movmi r3,#0
+l1.2996:
+ vsubl.u8 q0,d17,d16
+ ldr r6,gai4_ihevc_tc_table_addr
+ulbl2:
+ add r6,r6,pc
+ vshl.i16 q0,q0,#2
+ add r2,r2,r5,lsl #1
+ add r5,r2,#2
+ vaddw.u8 q0,q0,d5
+ cmp r5,#0x35
+ ldr r3,[r6,r3,lsl #2]
+ vsubw.u8 q2,q0,d4
+ movgt r2,#0x35
+ bgt l1.3036
+ adds r5,r2,#2
+ addpl r2,r2,#2
+ movmi r2,#0
+l1.3036:
+
+
+ vrshr.s16 q3,q2,#3
+ vdup.16 d2,r3
+ ldr r2,[r6,r2,lsl #2]
+ rsb r3,r3,#0
+ cmp r12,#0
+ vdup.16 d3,r2
+ rsb r2,r2,#0
+ vdup.16 d30,r3
+ vdup.16 d31,r2
+
+
+ vmin.s16 q2,q3,q1
+ vmax.s16 q1,q15,q2
+
+ vmovl.u8 q3,d16
+
+ vadd.i16 q0,q3,q1
+ vsub.i16 q1,q9,q1
+ vqmovun.s16 d0,q0
+ sub r2,r0,#2
+ vqmovun.s16 d1,q1
+ vtrn.32 d0,d1
+ vtrn.8 d0,d1
+ beq l1.3204
+
+ vst1.16 {d0[0]},[r2],r1
+ vst1.16 {d1[0]},[r2],r1
+ vst1.16 {d0[1]},[r2],r1
+ vst1.16 {d1[1]},[r2]
+l1.3204:
+ cmp r4,#0
+ beq l1.3228
+ vst1.16 {d0[2]},[r0],r1
+ vst1.16 {d1[2]},[r0],r1
+ vst1.16 {d0[3]},[r0],r1
+ vst1.16 {d1[3]},[r0]
+l1.3228:
+ pop {r4-r12,pc}
+
+
+
diff --git a/common/arm/ihevc_deblk_luma_horz.s b/common/arm/ihevc_deblk_luma_horz.s
new file mode 100644
index 0000000..b12ceb9
--- /dev/null
+++ b/common/arm/ihevc_deblk_luma_horz.s
@@ -0,0 +1,543 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@* ihevc_deblk_luma_horz.s
+@*
+@* @brief
+@* contains function definitions for horizontal deblocking of luma
+@* block edges. functions are coded in neon assembly and can be
+@* compiled using rvct
+@*
+@* @author
+@* anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
+.text
+.align 4
+
+
+
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+.globl ihevc_deblk_luma_horz_a9q
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table - ulbl1 - 8
+
+gai4_ihevc_beta_table_addr:
+.long gai4_ihevc_beta_table - ulbl2 - 8
+
+.type ihevc_deblk_luma_horz_a9q, %function
+
+ihevc_deblk_luma_horz_a9q:
+ stmfd sp!, {r3-r12,lr}
+ ldr r4,[sp,#0x2c]
+ ldr r5,[sp,#0x30]
+
+ add r3,r3,r4
+ add r3,r3,#1
+ ldr r6, [sp,#0x34]
+ asr r3,r3,#1
+ add r7,r3,r5,lsl #1
+ add r3,r3,r6,lsl #1
+ cmp r7,#0x33
+ movgt r7,#0x33
+ bgt l1.1532
+ cmp r7,#0x0
+ movlt r7,#0x0 @ r7 has the beta_index value
+l1.1532:
+ @ bic r2,r2,#1
+ asr r2,r2,#1
+
+ add r3,r3,r2,lsl #1
+ cmp r3,#0x35
+ movgt r3,#0x35
+ bgt l1.1564
+ cmp r3,#0x0
+ movlt r3,#0x0 @ r3 has the tc_index value
+
+ @ qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
+ @ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
+ @ tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
+
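A scalar sketch of this index derivation (table sizes inferred from the clip
ranges; parameter names taken from the comments above, the rest assumed):

    static int clip3(int x, int lo, int hi)
    {
        return x < lo ? lo : (x > hi ? hi : x);
    }

    extern const int gai4_ihevc_beta_table[52]; /* indexed by 0..51 */
    extern const int gai4_ihevc_tc_table[54];   /* indexed by 0..53 */

    static void deblk_luma_params(int quant_param_p, int quant_param_q,
                                  int bs, int beta_offset_div2,
                                  int tc_offset_div2, int *beta, int *tc)
    {
        int qp_luma   = (quant_param_p + quant_param_q + 1) >> 1;
        int beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51);
        int tc_indx   = clip3(qp_luma + 2 * (bs >> 1)
                                      + (tc_offset_div2 << 1), 0, 53);
        *beta = gai4_ihevc_beta_table[beta_indx];
        *tc   = gai4_ihevc_tc_table[tc_indx];
    }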
+l1.1564:
+ ldr r2,gai4_ihevc_beta_table_addr
+ulbl2:
+ add r2,r2,pc
+ ldr r4,gai4_ihevc_tc_table_addr
+ulbl1:
+ add r4,r4,pc
+
+ ldr r5,[r2,r7,lsl #2] @ beta
+ ldr r6,[r4,r3,lsl #2] @ tc
+
+
+
+ cmp r6,#0
+ beq l1.2404
+ vmov.i16 d0,#0x2
+ lsl r7,r6,#1
+ add r14,r1,r1,lsl #1
+ ldr r8,[r0,-r14] @ -3 value
+ vdup.8 d1,r7
+ ldr r10,[r0,-r1,lsl #1] @-2 value
+ vdup.32 d23,r8 @ -3 value
+ ldr r11,[r0,-r1] @-1 value
+ vdup.32 d24,r10 @ -2 value
+ and r8,#0xff
+ ldr r12,[r0,#0] @ 0 value
+ vdup.32 d25, r11 @-1 value
+ and r10,#0xff
+ ldr r9,[r0,r1] @ 1 value
+ vdup.32 d26,r12 @ 0 value
+ and r11,#0xff
+ ldr r2,[r0,r1,lsl #1] @ 2 value
+ vdup.32 d27,r9 @ 1value
+ and r12,#0xff
+ vdup.32 d28,r2 @ 2 value
+ and r9,#0xff
+ and r2,#0xff
+
+ add r12,r12,r2
+ subs r9,r12,r9,lsl #1 @ dq0 value is stored in r9
+ rsbmi r9,r9,#0
+ @dq0 = abs( pu1_src[2 * src_strd] - 2 * pu1_src[src_strd] + pu1_src[0] )@
+
+ add r8,r8,r11
+ subs r8,r8,r10,lsl #1
+ rsbmi r8,r8,#0 @ dp0 value is stored in r8
+ @ dp0 = abs( pu1_src[-3 * src_strd] - 2 * pu1_src[-2 * src_strd] + pu1_src[-src_strd] )@
+
+
+
+ add r3,r1,r1,lsl #1
+ add r14,r0,#3
+
+
+ ldrb r2,[r14,-r3] @ -3 value
+ ldrb r10,[r14,-r1,lsl #1] @ -2 value
+ ldrb r11,[r14,-r1] @ -1 value
+ ldrb r12,[r14,#0] @ 0 value
+ ldrb r3,[r14,r1] @ 1 value
+ ldrb r4,[r14,r1,lsl #1] @ 2 value
+
+
+ add r12,r12,r4
+ subs r12,r12,r3,lsl #1 @ dq3 value is stored in r12
+ rsbmi r12,r12,#0
+ @ dq3 = abs( pu1_src[2 * src_strd + 3] - 2 * pu1_src[src_strd + 3] + pu1_src[3] )@
+
+
+ add r2,r2,r11
+ subs r11,r2,r10,lsl #1
+ rsbmi r11,r11,#0 @ dp3 value is stored in r11
+ @ dp3 = abs( pu1_src[3 - 3 * src_strd] - 2 * pu1_src[3 - 2 * src_strd] + pu1_src[3 - src_strd] )@
+
+
+
+ add r3,r8,r9 @ r3 has the d0 value
+ add r4,r11,r12 @ r4 has the d3 value
+
+
+ @ d0 = dp0 + dq0@
+ @ d3 = dp3 + dq3@
+
+ add r14,r8,r11 @ r14 has the value dp
+ add r12,r12,r9 @ r12 has the value dq
+ @ dp = dp0 + dp3@
+ @ dq = dq0 + dq3@
+
+ add r11, r3, r4 @ r11 has the value d
+
+ @ d = d0 + d3@
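A scalar sketch of these activity measures; for one line i of the edge,
p0..p2 and q0..q2 are the three samples on each side (array names assumed):

    #include <stdlib.h>

    /* returns d_i; p[k] = p_k, q[k] = q_k, k increasing away from the edge */
    static int line_activity(const unsigned char p[3],
                             const unsigned char q[3],
                             int *dp_i, int *dq_i)
    {
        *dp_i = abs(p[2] - 2 * p[1] + p[0]);
        *dq_i = abs(q[2] - 2 * q[1] + q[0]);
        return *dp_i + *dq_i;
    }
    /* d = d0 + d3 gates the whole edge (filtered only if d < beta);
       dp = dp0 + dp3 and dq = dq0 + dq3 feed the dep/deq decisions below. */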
+
+
+ cmp r11,r5
+ bge l1.2404
+
+ @ if(d < beta)
+
+
+ @ registers which cannot be altered: r3, r4, r5, r6, r12, r14, r0, r1, r11
+
+ @ registers for use: r2, r7, r8, r9, r10
+
+ asr r10,r5,#2
+ vqadd.u8 d30,d26,d1
+ cmp r10,r3,lsl #1
+ vqsub.u8 d31,d26,d1
+ ble l1.1840
+ add r10,r1,r1,lsl #1
+ vaddl.u8 q3,d25,d26
+ ldr r2,[r0,-r1,lsl #2] @ has the -4 value
+ ldrb r7,[r0,-r1] @ has the -1 value
+ vdup.32 d22,r2 @ -4 value
+ vaddw.u8 q4,q3,d27
+ ldrb r3,[r0,#0] @ r4 has the 0 value
+ vqadd.u8 d16,d27,d1
+ and r2,#0xff
+ vmul.i16 q6,q4,d0[0]
+ ldr r8,[r0,r10] @ has the 3 value
+ vaddl.u8 q5,d24,d28
+ subs r2,r2,r7
+ vqsub.u8 d17,d27,d1
+ vdup.32 d29,r8 @ 3 value
+ and r8,#0xff
+ vadd.i16 q6,q6,q5
+ rsbmi r2,r2,#0
+ vrshrn.i16 d20,q6,#3
+ subs r8,r8,r3
+ rsbmi r8,r8,#0
+ vmin.u8 d18,d20,d30
+ add r8,r8,r2
+
+ cmp r8,r5,asr #3
+ bge l1.1840
+ vaddw.u8 q7,q4,d28
+ subs r7,r3,r7
+ vmax.u8 d4,d18,d31
+ rsbmi r7,r7,#0
+ vqadd.u8 d30,d28,d1
+ mov r10,#5
+ vrshrn.i16 d21,q7,#2
+ mul r10,r10,r6
+ vqsub.u8 d31,d28,d1
+ add r10,#1
+ cmp r7,r10,asr #1
+ vmin.u8 d18,d21,d16
+ bge l1.1840
+
+
+ @ if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3 * src_strd] - pu1_src[0]) + abs(pu1_src[-src_strd] - pu1_src[-4 * src_strd]) < (beta >> 3) )
+ @ && abs(pu1_src[0] - pu1_src[-src_strd]) < ( (5 * tc + 1) >> 1 ) )
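The same test as a scalar sketch for one line (sample names assumed; p3/q3
are the outermost samples, p0/q0 the ones adjacent to the edge):

    #include <stdlib.h>

    static int is_strong_line(int d_i, int beta, int tc,
                              int p3, int p0, int q0, int q3)
    {
        return (2 * d_i < (beta >> 2)) &&
               (abs(q3 - q0) + abs(p0 - p3) < (beta >> 3)) &&
               (abs(q0 - p0) < ((5 * tc + 1) >> 1));
    }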
+
+ vmax.u8 d5,d18,d17
+ asr r10,r5,#2
+ vaddl.u8 q8,d29,d28
+ cmp r10,r4,lsl #1
+ ble l1.1840
+
+ add r10,r1,r1,lsl #1
+ vmul.i16 q8,q8,d0[0]
+ add r4,r0,#3
+
+
+ ldrb r2,[r4,-r1,lsl #2]
+ vadd.i16 q8,q8,q7
+ ldrb r7,[r4,-r1]
+ vrshrn.i16 d19,q8,#3
+ ldrb r3,[r4,#0]
+ ldrb r8,[r4,r10]
+ @ ubfx r7,r2,#24,#8 @ has the -1 value
+ @ and r2,#0xff @ has the -4 value
+ @ ubfx r8,r3,#24,#8 @ has the 3 value
+ @ and r3,#0xff @ r4 has the 0 value
+
+
+
+ subs r8,r8,r3
+ vmin.u8 d18,d19,d30
+ rsbmi r8,r8,#0
+ vaddl.u8 q3,d25,d24
+ subs r2,r2,r7
+ vmax.u8 d3,d18,d31
+ rsbmi r2,r2,#0
+ vaddw.u8 q4,q3,d26
+ add r8,r8,r2
+ vqadd.u8 d30,d25,d1
+ cmp r8,r5,asr #3
+ vqsub.u8 d31,d25,d1
+ bge l1.1840
+ vmul.i16 q6,q4,d0[0]
+ subs r7,r3,r7
+ vqadd.u8 d16,d24,d1
+ rsbmi r7,r7,#0
+ vaddl.u8 q5,d23,d27
+ mov r10,#5
+ vqsub.u8 d17,d24,d1
+ mul r10,r10,r6
+ vadd.i16 q6,q6,q5
+ add r10,#1
+ vrshrn.i16 d20,q6,#3
+ cmp r7,r10,asr #1
+ vaddw.u8 q7,q4,d23
+ bge l1.1840
+ vmin.u8 d18,d20,d30
+ mov r2,#2
+ vqadd.u8 d30,d23,d1
+ ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ vmax.u8 d2,d18,d31
+ ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ vrshrn.i16 d21,q7,#2
+ b end_dep_deq_decision_horz
+ @ r2 has the value of de
+ @ r6 has the value of tc
+ @ r5 has the value of beta
+ @ r14 has the value of dp
+ @ r12 has the value of dq
+ @ r0 has the value of source address
+ @ r1 has the src stride
+
+l1.1840:
+ mov r2,#1
+
+ mov r11,r5
+ ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+
+ cmp r6,#1
+ moveq r9,#0
+ moveq r10,#0
+ beq end_dep_deq_decision_horz
+
+ and r7,r4,r5
+ cmp r7,#1
+ beq both_flags_set_horz
+ cmp r4,#0
+ beq set_flag_dep_zero_horz
+
+
+ add r8,r11,r11,asr #1
+ mov r10,#0
+ asr r8,#3
+ cmp r8,r14
+ movgt r9,#1
+ movle r9,#0
+ b end_dep_deq_decision_horz
+set_flag_dep_zero_horz:
+
+ add r8,r11,r11,asr #1
+ mov r9,#0
+ asr r8,#3
+ cmp r8,r12
+ movgt r10,#1
+ movle r10,#0
+ b end_dep_deq_decision_horz
+
+both_flags_set_horz:
+ add r8,r11,r11,asr #1
+ asr r8,#3
+ cmp r8,r14
+ movgt r9,#1
+ movle r9,#0
+ cmp r8,r12
+ movgt r10,#1
+ movle r10,#0
+end_dep_deq_decision_horz:
+
+ @r0=source address
+ @r1=stride
+ @ r2 =de
+ @ r4=flag p
+ @r5= flag q
+ @r6 =tc
+ @ r9 =dep
+ @ r10=deq
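A sketch of the dep/deq derivation above: a side's second sample is only
filtered when that side's activity is below a beta-based threshold (names
assumed):

    static void side_decisions(int beta, int dp, int dq, int *dep, int *deq)
    {
        int thresh = (beta + (beta >> 1)) >> 3;
        *dep = (dp < thresh); /* r9 above  */
        *deq = (dq < thresh); /* r10 above */
    }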
+
+
+
+ @ add r14,r1,r1,lsl #1
+ @ lsl r7,r6,#1
+ @ vdup.8 d1,r7
+ @ vmov.i16 d0,#0x2
+ vmin.u8 d18,d21,d16
+ cmp r2,#1
+ vqsub.u8 d31,d23,d1
+ beq l1.2408
+ vaddl.u8 q4,d23,d22
+ cmp r5,#1
+
+ bne strong_filtering_p
+
+strong_filtering_q:
+ mov r12,r0
+ vst1.32 d4[0],[r12],r1
+ vst1.32 d5[0],[r12],r1
+ vst1.32 d3[0],[r12]
+ cmp r4,#1
+ bne l1.2404
+strong_filtering_p:
+ vmax.u8 d5,d18,d17
+ mov r12,r0
+ vmul.i16 q4,q4,d0[0]
+ rsb r11,r1,#0
+ vadd.i16 q8,q4,q7
+ add r12,r12,r11
+ vrshrn.i16 d19,q8,#3
+ vst1.32 d2[0],[r12],r11
+ vmin.u8 d18,d19,d30
+ vst1.32 d5[0],[r12],r11
+ vmax.u8 d3,d18,d31
+ vst1.32 d3[0],[r12]
+
+l1.2404:
+ ldmfd sp!, {r3-r12,pc}
+
+ @ r4=flag p
+ @r5= flag q
+ @r6 =tc
+ @ r9 =dep
+ @ r10=deq
+
+
+ @ d22 -4 value
+
+ @d23 @ -3 value
+
+ @ vdup.32 d24,r11 @ -2 value
+
+ @ vdup.32 d25, r11 @-1 value
+
+ @ vdup.32 d26,r11 @ 0 value
+
+ @ vdup.32 d27,r11 @ 1value
+
+ @ vdup.32 d28,r11 @ 2 value
+
+ @ vdup.32 d29,r11 @ 3 value
+
+l1.2408:
+
+ vmov.i16 d0,#0x9
+
+ vsubl.u8 q5,d26,d25
+
+ vmul.i16 q5,q5,d0[0]
+
+ vmov.i16 d0,#0x3
+
+ vsubl.u8 q6,d27,d24
+ vmul.i16 q6,q6,d0[0]
+
+
+ vdup.8 d30,r6 @ duplicating the +tc value
+
+ rsb r12,r6,#0
+ vdup.8 d31,r12 @ duplicating the -tc value
+
+
+
+ vsub.i16 q5,q5,q6
+
+
+
+ vrshr.s16 q5,q5,#4
+ @ delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
+
+ vabs.s16 q4,q5
+ vmovn.i16 d9,q4
+ @ storing the absolute values of delta in d9
+
+ vqmovn.s16 d10,q5
+ @ storing the clipped values of delta in d16
+
+
+ vmin.s8 d11,d10,d30
+ vmax.s8 d8,d31,d11 @ d8 has the value delta = clip3(delta, -tc, tc)@
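A scalar sketch of the weak filter these vector ops implement; the
abs(delta) < 10 * tc gate is applied further down through the vcge/vbsl
selects (names assumed):

    #include <stdlib.h>

    static int clip_255(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

    static void weak_filter_line(int p1, unsigned char *p0,
                                 unsigned char *q0, int q1, int tc)
    {
        int delta = (9 * (*q0 - *p0) - 3 * (q1 - p1) + 8) >> 4;
        if (abs(delta) >= 10 * tc)
            return;                              /* leave line unfiltered */
        delta = delta < -tc ? -tc : (delta > tc ? tc : delta);
        *p0 = (unsigned char)clip_255(*p0 + delta);
        *q0 = (unsigned char)clip_255(*q0 - delta);
    }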
+
+
+ vmovl.u8 q3,d25
+
+ vaddw.s8 q2,q3,d8
+
+ vqmovun.s16 d12,q2
+ vmovl.u8 q3,d26
+ vsubw.s8 q2,q3,d8
+ vqmovun.s16 d13,q2
+
+
+ mov r11,#0xa
+ mul r12,r11,r6
+ vdup.8 d2,r12 @ d2 has the 10*tc value
+ vmov d18,d24
+ vdup.8 d0,r6
+ vshr.s8 d0,#1
+ vneg.s8 d1,d0
+
+ cmp r4,#1
+ bne l1.2724
+ cmp r9,#1
+ bne l1.2700
+
+ @ d12 and d13 have the value temp_p0 and temp_q0
+ vaddl.u8 q7,d23,d25
+ vrshrn.u16 d14,q7,#1
+ vsubl.u8 q7,d14,d24
+ vaddw.s8 q7,q7,d8
+ vqshrn.s16 d14,q7,#1
+ vmin.s8 d15,d14,d0
+ vmax.s8 d14,d1,d15
+
+ @ d14 has the delta p value
+ vmovl.u8 q8,d24
+ vaddw.s8 q8,q8,d14
+ vqmovun.s16 d14,q8
+
+ @ d14 = tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
+ vcge.u8 d18,d9,d2
+ vbsl d18,d24,d14
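A sketch of the p-side second-sample update just computed; the q-side mirror
(after l1.2724) is identical except that delta enters with the opposite sign
(names assumed):

    static int tmp_p1_update(int p2, int p1, int p0, int delta, int tc)
    {
        int half_tc = tc >> 1;
        int delta_p = ((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1);
        if (delta_p < -half_tc) delta_p = -half_tc;
        if (delta_p >  half_tc) delta_p =  half_tc;
        p1 += delta_p;
        return p1 < 0 ? 0 : (p1 > 255 ? 255 : p1);   /* tmp_p1 */
    }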
+
+l1.2700:
+ mov r12,r0
+ rsb r11,r1,#0
+ add r12,r11
+ vcge.u8 d19,d9,d2
+ vbsl d19,d25,d12
+ vst1.32 {d19[0]},[r12],r11
+ vst1.32 {d18[0]},[r12]
+l1.2724:
+ cmp r5,#1
+ bne l1.2404
+ cmp r10,#1
+ vmov d18, d27
+ bne l1.2852
+
+ vaddl.u8 q7,d26,d28
+ vrshrn.u16 d14,q7,#1
+ vsubl.u8 q7,d14,d27
+ vsubw.s8 q7,q7,d8
+ vqshrn.s16 d14,q7,#1
+ vmin.s8 d15,d14,d0
+ vmax.s8 d14,d1,d15
+@ d14 has the delta q value
+ vmovl.u8 q8,d27
+ vaddw.s8 q8,q8,d14
+ vqmovun.s16 d14,q8
+ vcge.u8 d18,d9,d2
+ vbsl d18,d27,d14
+l1.2852:
+ mov r12,r0
+ vcge.u8 d19,d9,d2
+ vbsl d19,d26,d13
+ vst1.32 {d19[0]},[r12],r1
+ vst1.32 {d18[0]},[r12]
+ ldmfd sp!, {r3-r12,r15}
+
+
+
diff --git a/common/arm/ihevc_deblk_luma_vert.s b/common/arm/ihevc_deblk_luma_vert.s
new file mode 100644
index 0000000..ee247cc
--- /dev/null
+++ b/common/arm/ihevc_deblk_luma_vert.s
@@ -0,0 +1,593 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/*******************************************************************************
+@* @file
+@* ihevc_deblk_luma_vert.s
+@*
+@* @brief
+@* contains function definitions for vertical deblocking of luma
+@* block edges. functions are coded in neon assembly and can be
+@* compiled using rvct
+@*
+@* @author
+@* anand s
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
+.text
+.align 4
+
+
+
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+
+.globl ihevc_deblk_luma_vert_a9q
+
+gai4_ihevc_tc_table_addr:
+.long gai4_ihevc_tc_table - ulbl1 - 8
+
+gai4_ihevc_beta_table_addr:
+.long gai4_ihevc_beta_table - ulbl2 - 8
+
+.type ihevc_deblk_luma_vert_a9q, %function
+
+ihevc_deblk_luma_vert_a9q:
+
+ push {r3-r12,lr}
+ ldr r4,[sp,#0x2c]
+ ldr r5,[sp,#0x30]
+
+ add r3,r3,r4
+ add r3,r3,#1
+ ldr r6, [sp,#0x34]
+ asr r3,r3,#1
+ add r7,r3,r5,lsl #1
+ add r3,r3,r6,lsl #1
+ cmp r7,#0x33
+ movgt r7,#0x33
+ bgt l1.56
+ cmp r7,#0x0
+ movlt r7,#0x0 @ r7 has the beta_index value
+l1.56:
+
+@ bic r2,r2,#1
+ asr r2,r2,#1
+
+ add r3,r3,r2,lsl #1
+ cmp r3,#0x35
+ movgt r3,#0x35
+ bgt l1.88
+ cmp r3,#0x0
+ movlt r3,#0x0 @ r3 has the tc_index value
+
+@ qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
+@ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
+@ tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
+
+l1.88:
+ ldr r2,gai4_ihevc_beta_table_addr
+ulbl2:
+ add r2,r2,pc
+ vmov.i8 d18,#0x2
+ ldr r4,gai4_ihevc_tc_table_addr
+ulbl1:
+ add r4,r4,pc
+
+ ldr r5,[r2,r7,lsl #2] @ beta
+ vmov.i16 q8,#0x2
+ ldr r6,[r4,r3,lsl #2] @ tc
+ lsl r8,r6,#1
+ cmp r6,#0
+ vdup.8 d19,r8
+ sub r7,r0,#4
+ vmov.i8 d23,#0x3
+ beq l1.964
+
+
+ vld1.8 {d24},[r7],r1
+ ldrb r8,[r0,#-3] @ -3 value
+ vld1.8 {d1},[r7],r1
+ ldrb r10,[r0,#-2] @-2 value
+ vld1.8 {d2},[r7],r1
+ ldrb r11,[r0,#-1] @-1 value
+ vld1.8 {d0},[r7]
+ ldrb r12,[r0,#0] @ 0 value
+ ldrb r9,[r0,#1] @ 1 value
+ vtrn.8 d24,d1
+ ldrb r2,[r0,#2] @ 2 value
+ vtrn.8 d2,d0
+ add r12,r12,r2
+ subs r9,r12,r9,lsl #1 @ dq0 value is stored in r9
+ rsbmi r9,r9,#0
+@dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
+ vtrn.16 d24,d2
+ add r8,r8,r11
+ vtrn.16 d1,d0
+ subs r8,r8,r10,lsl #1
+ rsbmi r8,r8,#0 @ dp0 value is stored in r8
+@ dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
+
+
+
+ add r14,r1,r1,lsl #1
+ add r14,r0,r14
+
+ vdup.32 d4,d24[1]
+ ldrb r2,[r14,#-3] @ -3 value
+ vdup.32 d7,d2[1]
+ ldrb r10,[r14,#-2] @ -2 value
+ vdup.32 d3,d2[0]
+ ldrb r11,[r14,#-1] @ -1 value
+ vdup.32 d5,d1[1]
+ ldrb r12,[r14,#0] @ 0 value
+ vdup.32 d6,d1[0]
+ ldrb r3,[r14,#1] @ 1 value
+ vdup.32 d2,d0[0]
+ ldrb r4,[r14,#2] @ 2 value
+
+
+ add r12,r12,r4
+ subs r12,r12,r3,lsl #1 @ dq3 value is stored in r12
+ rsbmi r12,r12,#0
+@ dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
+
+
+ add r2,r2,r11
+ subs r11,r2,r10,lsl #1
+ rsbmi r11,r11,#0 @ dp3 value is stored in r11
+@ dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )@
+
+
+
+ add r3,r8,r9 @ r3 has the d0 value
+ add r4,r11,r12 @ r4 has the d3 value
+
+
+@ d0 = dp0 + dq0@
+@ d3 = dp3 + dq3@
+
+ add r14,r8,r11 @ r14 has the value dp
+ add r12,r12,r9 @ r12 has the value dq
+@ dp = dp0 + dp3@
+@ dq = dq0 + dq3@
+
+ add r11, r3, r4 @ r11 has the value d
+
+@ d = d0 + d3@
+
+
+ cmp r11,r5
+ vdup.32 d22,d0[1]
+ bge l1.964
+
+@ if(d < beta)
+
+
+ @ registers which cannot be altered: r3, r4, r5, r6, r12, r14, r0, r1, r11
+
+ @ registers for use: r2, r7, r8, r9, r10
+ vqsub.u8 d30,d7,d19
+ asr r10,r5,#2
+ vqadd.u8 d31,d7,d19
+ cmp r10,r3,lsl #1
+ vaddl.u8 q0,d5,d4
+ ble l1.336
+
+ ldrb r2,[r0,#-4]
+ vaddw.u8 q0,q0,d2
+ ldrb r7,[r0,#-1]
+ vmull.u8 q10,d7,d23
+ ldrb r3,[r0,#0]
+ vmlal.u8 q10,d22,d18
+ ldrb r8,[r0,#3]
+@ ubfx r7,r2,#24,#8 @ has the -1 value
+@ and r2,#0xff @ has the -4 value
+@ ubfx r8,r3,#24,#8 @ has the 3 value
+@ and r3,#0xff @ r4 has the 0 value
+
+ vadd.i16 q10,q10,q0
+ subs r8,r8,r3
+ vrshrn.i16 d22,q10,#3
+ rsbmi r8,r8,#0
+ subs r2,r2,r7
+ vmin.u8 d21,d22,d31
+ rsbmi r2,r2,#0
+ vmax.u8 d22,d21,d30
+ add r8,r8,r2
+ vaddl.u8 q10,d7,d3
+ cmp r8,r5,asr #3
+ vmla.i16 q10,q0,q8
+ bge l1.336
+ vaddw.u8 q0,q0,d7
+ subs r7,r3,r7
+ vrshrn.i16 d20,q10,#3
+ rsbmi r7,r7,#0
+ vrshrn.i16 d0,q0,#2
+ mov r10,#5
+ vqadd.u8 d30,d5,d19
+ mul r10,r10,r6
+ vqsub.u8 d31,d5,d19
+ add r10,#1
+ cmp r7,r10,asr #1
+ bge l1.336
+
+
+@ if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+@ && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
+
+
+ asr r10,r5,#2
+ vqsub.u8 d25,d4,d19
+ cmp r10,r4,lsl #1
+ vqadd.u8 d21,d4,d19
+ ble l1.336
+ vmin.u8 d26,d20,d21
+ add r4,r1,r1,lsl #1
+ add r4,r4,r0
+ vmax.u8 d20,d26,d25
+ ldrb r2,[r4,#-4]
+ vmin.u8 d19,d0,d30
+ ldrb r7,[r4,#-1]
+ vmax.u8 d21,d19,d31
+ ldrb r3,[r4,#0]
+ lsl r10,r6,#1
+ ldrb r8,[r4,#3]
+@ ubfx r7,r2,#24,#8 @ has the -1 value
+@ and r2,#0xff @ has the -4 value
+@ ubfx r8,r3,#24,#8 @ has the 3 value
+@ and r3,#0xff @ r4 has the 0 value
+ vaddl.u8 q0,d2,d3
+ vdup.8 d19,r10
+ subs r8,r8,r3
+ vaddw.u8 q0,q0,d4
+ rsbmi r8,r8,#0
+ vqadd.u8 d30,d2,d19
+ subs r2,r2,r7
+ vqsub.u8 d31,d2,d19
+ rsbmi r2,r2,#0
+ vaddl.u8 q13,d5,d6
+ add r8,r8,r2
+ vmla.i16 q13,q0,q8
+ cmp r8,r5,asr #3
+ bge l1.336
+ vrshrn.i16 d26,q13,#3
+ subs r7,r3,r7
+ vqadd.u8 d27,d3,d19
+ rsbmi r7,r7,#0
+ vqsub.u8 d28,d3,d19
+ mov r10,#5
+ vmin.u8 d16,d26,d30
+ mul r10,r10,r6
+ add r10,#1
+ cmp r7,r10,asr #1
+ vmax.u8 d26,d16,d31
+ bge l1.336
+ vqadd.u8 d30,d6,d19
+
+ mov r2,#2
+ ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ vqsub.u8 d31,d6,d19
+ ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+ b end_dep_deq_decision
+@ r2 has the value of de
+@ r6 has the value of tc
+@ r5 has the value of beta
+@ r14 has the value of dp
+@ r12 has the value of dq
+@ r0 has the value of source address
+@ r1 has the src stride
+
+l1.336:
+ mov r2,#1
+l1.424:
+ mov r11,r5
+ ldr r4,[sp,#0x38] @ loading the filter_flag_p
+ ldr r5,[sp,#0x3c] @ loading the filter_flag_q
+
+ cmp r6,#1
+ moveq r9,#0
+ moveq r10,#0
+ beq end_dep_deq_decision
+
+ and r7,r4,r5
+
+ cmp r7,#1
+ beq both_flags_set
+ cmp r4,#0
+ beq set_flag_dep_zero
+
+
+ add r8,r11,r11,asr #1
+ mov r10,#0
+ asr r8,#3
+ cmp r8,r14
+ movgt r9,#1
+ movle r9,#0
+ b end_dep_deq_decision
+set_flag_dep_zero:
+
+ add r8,r11,r11,asr #1
+ mov r9,#0
+ asr r8,#3
+ cmp r8,r12
+ movgt r10,#1
+ movle r10,#0
+ b end_dep_deq_decision
+
+both_flags_set:
+ add r8,r11,r11,asr #1
+ asr r8,#3
+ cmp r8,r14
+ movgt r9,#1
+ movle r9,#0
+ cmp r8,r12
+ movgt r10,#1
+ movle r10,#0
+end_dep_deq_decision:
+
+@r0=source address
+@r1=stride
+@ r2 =de
+@ r4=flag p
+@r5= flag q
+@r6 =tc
+@ r9 =dep
+@ r10=deq
+@ b l1.964
+
+
+ cmp r2,#2
+@ r4 has the value of de
+ bne l1.968
+
+ cmp r5,#0
+ beq l1.780
+@ r5 has the flag of q
+
+ add r3,r0,#2
+ vst1.8 {d22[0]},[r3],r1
+
+ vst1.8 {d22[1]},[r3],r1
+
+ vst1.8 {d22[2]},[r3],r1
+
+ vst1.8 {d22[3]},[r3]
+ add r3,r0,r1
+ vtrn.8 d20,d21
+
+ vst1.16 {d20[0]},[r0]
+ vst1.16 {d21[0]},[r3],r1
+ vst1.16 {d20[1]},[r3],r1
+ vst1.16 {d21[1]},[r3]
+
+
+l1.780:
+ cmp r4,#0
+ beq l1.964
+ @ r5 has the flag p
+
+
+ vdup.32 d7,d24[0]
+ sub r3,r0,#1
+ vaddw.u8 q8,q0,d6
+ add r7,r3,r1
+ vrshrn.i16 d2,q8,#2
+ vst1.8 {d26[0]},[r3]
+ sub r0,r0,#3
+ vmin.u8 d16,d2,d27
+ vst1.8 {d26[1]},[r7],r1
+ vmull.u8 q1,d6,d23
+ vmlal.u8 q1,d7,d18
+ vst1.8 {d26[2]},[r7],r1
+ vmax.u8 d5,d16,d28
+ vst1.8 {d26[3]},[r7]
+ vadd.i16 q0,q1,q0
+ vrshrn.i16 d0,q0,#3
+
+
+ vmin.u8 d1,d0,d30
+ vmax.u8 d0,d1,d31
+
+ vtrn.8 d0,d5
+ vst1.16 {d0[0]},[r0],r1
+ vst1.16 {d5[0]},[r0],r1
+ vst1.16 {d0[1]},[r0],r1
+ vst1.16 {d5[1]},[r0]
+l1.964:
+ pop {r3-r12,pc}
+l1.968:
+
+
+ vmov.i16 q0,#0x9
+ rsb r11,r6,#0
+ cmp r4,#0
+ @ checks for the flag p
+ vmov.i16 q8,#0x3
+ vmov.i8 d24,#0x1
+
+
+ vdup.8 d30,r11
+ and r11,r6,#0xff
+ vdup.8 d31,r11
+
+ vsubl.u8 q9,d4,d2
+ vmul.i16 q9,q9,q0
+ vsubl.u8 q0,d5,d3
+
+
+
+ vmul.i16 q8,q0,q8
+ vsub.i16 q8,q9,q8
+ vrshr.s16 q8,q8,#4
+@ delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
+
+ vabs.s16 q0,q8
+ vmovn.i16 d0,q0
+ @ storing the absolute values of delta in d0
+
+ vqmovn.s16 d16,q8
+ @ storing the clipped values of delta in d16
+
+ vmov.i8 d1,#0xa
+ vdup.8 d21,r11
+ vmul.i8 d1,d1,d21
+ @ d1 stores the value (10 * tc)
+
+@if(abs(delta) < 10 * tc)
+
+ vmin.s8 d18,d16,d31
+ vmax.s8 d20,d18,d30
+
+@ delta = clip3(delta, -tc, tc)@
+ vmovl.s8 q8,d20
+ vmovl.u8 q9,d2
+ vadd.i16 q9,q9,q8
+
+ vqmovun.s16 d22,q9
+ vmovl.u8 q9,d4
+ vsub.i16 q8,q9,q8
+ vqmovun.s16 d23,q8
+@ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
+@ tmp_q0 = clip_u8(pu1_src[0] - delta)@
+ beq l1.1272
+
+
+
+ cmp r9,#1
+ bne l1.1212
+@ checks for the flag dep
+
+ asr r3,r6,#1
+
+
+ vaddl.u8 q8,d6,d2
+ vaddw.u8 q8,q8,d24
+ vdup.8 d18,r3
+ rsb r3,r3,#0
+ vdup.8 d19,r3
+ vshr.u16 q8,q8,#1
+ vmovn.i16 d16,q8
+
+ vsubl.u8 q8,d16,d3
+ vaddw.s8 q8,q8,d20
+ vshr.s16 q8,q8,#1
+ vqmovn.s16 d16,q8
+
+ vmin.s8 d17,d16,d18
+ vmax.s8 d16,d19,d17
+
+
+
+
+ vmovl.u8 q9,d3
+ vmovl.s8 q8,d16
+ vadd.i16 q8,q9,q8
+
+ vqmovun.s16 d16,q8
+ vmov d30,d3
+ vcge.u8 d3,d0,d1
+
+
+ vbsl d3,d30,d16
+l1.1212:
+ vdup.8 d16,r11
+ sub r12,r0,#3
+ sub r3,r0,#1
+@ vmul.i8 d16,d16,d1
+ vtrn.8 d6,d3
+ vst1.16 {d6[0]},[r12],r1
+ vcge.u8 d16,d0,d1
+ vst1.16 {d3[0]},[r12],r1
+ vbsl d16,d2,d22
+ vst1.8 {d16[0]},[r3],r1
+ vst1.8 {d16[1]},[r3],r1
+ vst1.16 {d6[1]},[r12],r1
+ vst1.8 {d16[2]},[r3],r1
+ vst1.16 {d3[1]},[r12]
+ vst1.8 {d16[3]},[r3]
+l1.1272:
+ @ ldr r3,[sp,#0x38]
+ cmp r5,#0
+ beq l1.964
+ @ checks for the flag q
+ cmp r10,#1
+ bne l1.1412
+ @ checks for the flag deq
+ vmov d2,d7
+ asr r3,r6,#1
+
+ vdup.8 d6,r3
+ rsb r3,r3,#0
+ vdup.8 d16,r3
+ vaddl.u8 q1,d2,d4
+ vaddw.u8 q1,q1,d24
+ vshr.u16 q1,q1,#1
+ vmovn.i16 d2,q1
+
+ vsubl.u8 q1,d2,d5
+ vsubw.s8 q1,q1,d20
+ vshr.s16 q1,q1,#1
+ vqmovn.s16 d3,q1
+
+ vmin.s8 d2,d3,d6
+ vmax.s8 d3,d16,d2
+ @ vdup.8 d6,r2
+ @ vmul.i8 d6,d6,d1
+
+
+
+ vmovl.u8 q8,d5
+ vmovl.s8 q1,d3
+ vadd.i16 q1,q8,q1
+ vqmovun.s16 d3,q1
+ vmov d30,d5
+ vcge.u8 d5,d0,d1
+
+
+ vbsl d5,d30,d3
+l1.1412:
+ @ vdup.8 d2,r2
+ add r3,r0,#2
+ add r11,r3,r1
+ @ vmul.i8 d1,d2,d1
+ vst1.8 {d7[0]},[r3]
+ vst1.8 {d7[1]},[r11],r1
+ vst1.8 {d7[2]},[r11],r1
+ vcge.u8 d0,d0,d1
+ vst1.8 {d7[3]},[r11]
+ vbsl d0,d4,d23
+ vtrn.8 d0,d5
+ vst1.16 {d0[0]},[r0],r1
+ vst1.16 {d5[0]},[r0],r1
+ vst1.16 {d0[1]},[r0],r1
+ vst1.16 {d5[1]},[r0]
+ pop {r3-r12,pc}
+
+
+
diff --git a/common/arm/ihevc_func_selector.h b/common/arm/ihevc_func_selector.h
new file mode 100644
index 0000000..8188178
--- /dev/null
+++ b/common/arm/ihevc_func_selector.h
@@ -0,0 +1,227 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_func_selector.h
+*
+* @brief
+* For each function, decide whether to use the C implementation, NEON
+* intrinsics, Cortex-A8 intrinsics, NEON assembly, or Cortex-A8 assembly
+*
+* @author
+* Harish
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef __IHEVC_FUNC_SELECTOR_H__
+#define __IHEVC_FUNC_SELECTOR_H__
+
+#include "ihevc_func_types.h"
+
+#define INTER_PRED_LUMA_COPY C
+#define INTER_PRED_LUMA_HORZ C
+#define INTER_PRED_LUMA_VERT C
+#define INTER_PRED_LUMA_COPY_W16OUT C
+#define INTER_PRED_LUMA_HORZ_W16OUT C
+
+#define INTER_PRED_LUMA_VERT_W16OUT C
+#define INTER_PRED_LUMA_VERT_W16INP C
+#define INTER_PRED_LUMA_VERT_W16INP_W16OUT C
+
+#define INTER_PRED_CHROMA_COPY C
+#define INTER_PRED_CHROMA_HORZ C
+#define INTER_PRED_CHROMA_VERT C
+#define INTER_PRED_CHROMA_COPY_W16OUT C
+#define INTER_PRED_CHROMA_HORZ_W16OUT C
+#define INTER_PRED_CHROMA_VERT_W16OUT C
+#define INTER_PRED_CHROMA_VERT_W16INP C
+#define INTER_PRED_CHROMA_VERT_W16INP_W16OUT C
+
+#define WEIGHTED_PRED_UNI C
+#define WEIGHTED_PRED_BI C
+#define WEIGHTED_PRED_BI_DEFAULT C
+#define WEIGHTED_PRED_CHROMA_UNI C
+#define WEIGHTED_PRED_CHROMA_BI C
+#define WEIGHTED_PRED_CHROMA_BI_DEFAULT C
+
+#define PAD_VERT C
+#define PAD_HORZ C
+#define PAD_LEFT_LUMA C
+#define PAD_LEFT_CHROMA C
+#define PAD_RIGHT_LUMA C
+#define PAD_RIGHT_CHROMA C
+
+#define DEBLOCKING_ASM C
+#define DEBLK_LUMA_HORZ C
+#define DEBLK_LUMA_VERT C
+#define DEBLK_CHROMA_HORZ C
+#define DEBLK_CHROMA_VERT C
+
+#define SAO_BAND_OFFSET_LUMA C
+#define SAO_BAND_OFFSET_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS0_LUMA C
+#define SAO_EDGE_OFFSET_CLASS1_LUMA C
+#define SAO_EDGE_OFFSET_CLASS2_LUMA C
+#define SAO_EDGE_OFFSET_CLASS3_LUMA C
+#define SAO_EDGE_OFFSET_CLASS0_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS1_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS2_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS3_CHROMA C
+
+#define INTRA_PRED_LUMA_REF_SUBSTITUTION C
+#define INTRA_PRED_REF_FILTERING C
+#define INTRA_PRED_LUMA_PLANAR C
+#define INTRA_PRED_LUMA_DC C
+#define INTRA_PRED_LUMA_HORZ C
+#define INTRA_PRED_LUMA_VER C
+#define INTRA_PRED_LUMA_MODE_2 C
+#define INTRA_PRED_LUMA_MODE_18_34 C
+#define INTRA_PRED_LUMA_MODE_3_T0_9 C
+#define INTRA_PRED_LUMA_MODE_11_T0_17 C
+#define INTRA_PRED_LUMA_MODE_19_T0_25 C
+#define INTRA_PRED_LUMA_MODE_27_T0_33 C
+
+#define INTRA_PRED_CHROMA_PLANAR C
+#define INTRA_PRED_CHROMA_DC C
+#define INTRA_PRED_CHROMA_HOR C
+#define INTRA_PRED_CHROMA_VER C
+#define INTRA_PRED_CHROMA_MODE_2 C
+#define INTRA_PRED_CHROMA_18_34 C
+#define INTRA_PRED_CHROMA_3_T0_9 C
+#define INTRA_PRED_CHROMA_11_T0_17 C
+#define INTRA_PRED_CHROMA_19_T0_25 C
+#define INTRA_PRED_CHROMA_27_T0_33 C
+#define INTRA_PRED_CHROMA_REF_SUBSTITUTION C
+
+/* Forward transform functions */
+/* Luma */
+#define RESI_TRANS_QUANT_4X4_TTYPE1 C
+#define RESI_TRANS_QUANT_4X4 C
+#define RESI_TRANS_QUANT_8X8 C
+#define RESI_TRANS_QUANT_16X16 C
+#define RESI_TRANS_QUANT_32X32 C
+
+#define RESI_QUANT_4X4_TTYPE1 C
+#define RESI_QUANT_4X4 C
+#define RESI_QUANT_8X8 C
+#define RESI_QUANT_16X16 C
+#define RESI_QUANT_32X32 C
+
+#define RESI_TRANS_4X4_TTYPE1 C
+#define RESI_TRANS_4X4 C
+#define RESI_TRANS_8X8 C
+#define RESI_TRANS_16X16 C
+#define RESI_TRANS_32X32 C
+
+#define RESI_4X4_TTYPE1 C
+#define RESI_4X4 C
+#define RESI_8X8 C
+#define RESI_16X16 C
+#define RESI_32X32 C
+
+#define TRANS_4X4_TTYPE1 C
+#define TRANS_4X4 C
+#define TRANS_8X8 C
+#define TRANS_16X16 C
+#define TRANS_32X32 C
+
+#define QUANT_4X4_TTYPE1 C
+#define QUANT_4X4 C
+#define QUANT_8X8 C
+#define QUANT_16X16 C
+#define QUANT_32X32 C
+
+/* Chroma interleaved*/
+#define CHROMA_RESI_TRANS_QUANT_4X4 C
+#define CHROMA_RESI_TRANS_QUANT_8X8 C
+#define CHROMA_RESI_TRANS_QUANT_16X16 C
+
+#define CHROMA_RESI_QUANT_4X4 C
+#define CHROMA_RESI_QUANT_8X8 C
+#define CHROMA_RESI_QUANT_16X16 C
+
+#define CHROMA_RESI_TRANS_4X4 C
+#define CHROMA_RESI_TRANS_8X8 C
+#define CHROMA_RESI_TRANS_16X16 C
+
+#define CHROMA_RESI_4X4 C
+#define CHROMA_RESI_8X8 C
+#define CHROMA_RESI_16X16 C
+
+/* Inverse transform functions */
+/* Luma */
+#define IQUANT_ITRANS_RECON_4X4_TTYPE1 C
+#define IQUANT_ITRANS_RECON_4X4 C
+#define IQUANT_ITRANS_RECON_8X8 C
+#define IQUANT_ITRANS_RECON_16X16 C
+#define IQUANT_ITRANS_RECON_32X32 C
+
+#define IQUANT_RECON_4X4_TTYPE1 C
+#define IQUANT_RECON_4X4 C
+#define IQUANT_RECON_8X8 C
+#define IQUANT_RECON_16X16 C
+#define IQUANT_RECON_32X32 C
+
+#define ITRANS_RECON_4X4_TTYPE1 C
+#define ITRANS_RECON_4X4 C
+#define ITRANS_RECON_8X8 C
+#define ITRANS_RECON_16X16 C
+#define ITRANS_RECON_32X32 C
+
+#define RECON_4X4_TTYPE1 C
+#define RECON_4X4 C
+#define RECON_8X8 C
+#define RECON_16X16 C
+#define RECON_32X32 C
+
+#define ITRANS_4X4_TTYPE1 C
+#define ITRANS_4X4 C
+#define ITRANS_8X8 C
+#define ITRANS_16X16 C
+#define ITRANS_32X32 C
+
+/* Chroma interleaved */
+#define CHROMA_IQUANT_ITRANS_RECON_4X4 C
+#define CHROMA_IQUANT_ITRANS_RECON_8X8 C
+#define CHROMA_IQUANT_ITRANS_RECON_16X16 C
+
+#define CHROMA_IQUANT_RECON_4X4 C
+#define CHROMA_IQUANT_RECON_8X8 C
+#define CHROMA_IQUANT_RECON_16X16 C
+
+#define CHROMA_ITRANS_RECON_4X4 C
+#define CHROMA_ITRANS_RECON_8X8 C
+#define CHROMA_ITRANS_RECON_16X16 C
+
+#define CHROMA_RECON_4X4 C
+#define CHROMA_RECON_8X8 C
+#define CHROMA_RECON_16X16 C
+
+#define IHEVC_MEMCPY C
+#define IHEVC_MEMSET C
+#define IHEVC_MEMSET_16BIT C
+#define IHEVC_MEMCPY_MUL_8 C
+#define IHEVC_MEMSET_MUL_8 C
+#define IHEVC_MEMSET_16BIT_MUL_8 C
+
+#endif /* __IHEVC_FUNC_SELECTOR_H__ */
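A purely hypothetical illustration of how selector macros in this style are
typically consumed; the token values and the dispatch pattern below are
assumptions, not part of this patch:

    /* hypothetical tokens; the real ones come from ihevc_func_types.h */
    #define C    0
    #define A9Q  1

    #define DEBLK_LUMA_VERT A9Q      /* the header above selects C */

    #if DEBLK_LUMA_VERT == A9Q
    void ihevc_deblk_luma_vert_a9q();   /* NEON version from this patch */
    #define deblk_luma_vert ihevc_deblk_luma_vert_a9q
    #else
    void ihevc_deblk_luma_vert();       /* C reference version */
    #define deblk_luma_vert ihevc_deblk_luma_vert
    #endif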
diff --git a/common/arm/ihevc_inter_pred_chroma_copy.s b/common/arm/ihevc_inter_pred_chroma_copy.s
new file mode 100644
index 0000000..0da34cc
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_copy.s
@@ -0,0 +1,270 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_inter_pred_chroma_copy.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma interprediction filter for copy
+@*
+@* @par description:
+@* copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the location pointed by 'dst'
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => *pi1_coeff
+@ r5 => ht
+@ r6 => wd
+
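A scalar equivalent of this kernel (a sketch; chroma is interleaved CbCr,
hence the lsl #1 on wd below):

    void chroma_copy_c(const unsigned char *pu1_src, unsigned char *pu1_dst,
                       int src_strd, int dst_strd, int ht, int wd)
    {
        int row, col;
        for (row = 0; row < ht; row++)
            for (col = 0; col < 2 * wd; col++)
                pu1_dst[row * dst_strd + col] = pu1_src[row * src_strd + col];
    }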
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_copy_a9q
+
+.type ihevc_inter_pred_chroma_copy_a9q, %function
+
+ihevc_inter_pred_chroma_copy_a9q:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r12,[sp,#48] @loads wd
+ lsl r12,r12,#1 @2*wd: interleaved chroma width in bytes
+ ldr r7,[sp,#44] @loads ht
+ cmp r7,#0 @checks ht == 0
+ ble end_loops
+ and r8,r7,#3 @ht & 3: rows left over after multiples of 4
+ sub r7,r7,r8 @round ht down to a multiple of 4
+ tst r12,#15 @checks if 2*wd is a multiple of 16
+ beq core_loop_wd_16
+ tst r12,#7 @checks if 2*wd is a multiple of 8
+ beq core_loop_wd_8
+
+ sub r11,r12,#4
+ cmp r7,#0
+ beq outer_loop_wd_4_ht_2
+
+outer_loop_wd_4:
+ subs r4,r12,#0 @checks wd == 0
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ subs r4,r4,#4 @(wd -4)
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r1,r1,#4 @pu1_dst += 4
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7,r7,#4 @ht - 4
+ sub r0,r5,r11 @pu1_src = pu1_src_tmp
+ sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_4
+ cmp r8,#0
+ bgt outer_loop_wd_4_ht_2
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+outer_loop_wd_4_ht_2:
+ subs r4,r12,#0 @checks wd == 0
+ ble end_loops
+
+inner_loop_wd_4_ht_2:
+ vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ subs r4,r4,#4 @(wd -4)
+ add r1,r1,#4 @pu1_dst += 4
+ bgt inner_loop_wd_4_ht_2
+ b end_loops
+
+core_loop_wd_8:
+ sub r11,r12,#8
+ cmp r7,#0
+ beq outer_loop_wd_8_ht_2
+
+outer_loop_wd_8:
+ subs r4,r12,#0 @checks wd
+ ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4,r4,#8 @wd - 8(loop condition)
+ vld1.8 {d2},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d3},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+ subs r7,r7,#4 @ht -= 4
+ sub r0,r5,r11 @pu1_src = pu1_src_tmp
+ sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_8
+ cmp r8,#0
+ bgt outer_loop_wd_8_ht_2
+ b end_loops
+
+outer_loop_wd_8_ht_2:
+ subs r4,r12,#0 @checks wd
+ ble end_loops
+
+inner_loop_wd_8_ht_2:
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ @subs r4,r4,#8 @wd - 8(loop condition)
+ @bgt inner_loop_wd_8_ht_2
+ b end_loops
+
+core_loop_wd_16:
+ sub r11,r12,#16
+ cmp r7,#0
+ beq outer_loop_wd_16_ht_2
+
+outer_loop_wd_16:
+ subs r4,r12,#0 @checks wd
+ ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4,r4,#16 @wd - 16(loop condition)
+ vld1.8 {q2},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q3},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+ subs r7,r7,#4 @ht -= 4
+ sub r0,r5,r11 @pu1_src = pu1_src_tmp
+ sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_16
+ cmp r8,#0
+ bgt outer_loop_wd_16_ht_2
+ b end_loops
+
+outer_loop_wd_16_ht_2:
+ subs r4,r12,#0 @checks wd
+ ble end_loops
+
+inner_loop_wd_16_ht_2:
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ @subs r4,r4,#16 @wd - 16(loop condition)
+ @bgt inner_loop_wd_16_ht_2
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
new file mode 100644
index 0000000..a927fa7
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s
@@ -0,0 +1,325 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_inter_pred_chroma_copy_w16out.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma interprediction filter for copy
+@*
+@* @par description:
+@* copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the 16-bit buffer pointed by 'dst', upshifting each sample by 6
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@* word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 => src_strd
+@r3 => dst_strd
+@r4 => *pi1_coeff
+@r5 => ht
+@r6 => wd
+
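A scalar equivalent (a sketch): the same copy, but widened to 16 bits and
pre-scaled by 64 (the vshl #6 below) for the intermediate inter-prediction
format; dst_strd here counts 16-bit elements:

    void chroma_copy_w16out_c(const unsigned char *pu1_src, short *pi2_dst,
                              int src_strd, int dst_strd, int ht, int wd)
    {
        int row, col;
        for (row = 0; row < ht; row++)
            for (col = 0; col < 2 * wd; col++)
                pi2_dst[row * dst_strd + col] =
                    (short)(pu1_src[row * src_strd + col] << 6);
    }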
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_copy_w16out_a9q
+
+.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_copy_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r12,[sp,#48] @loads wd
+ lsl r12,r12,#1 @2*wd
+ ldr r7,[sp,#44] @loads ht
+ cmp r7,#0 @ht condition(ht == 0)
+ ble end_loops @loop
+ and r8,r7,#3 @ht & 3: rows left over after multiples of 4
+ sub r9,r7,r8 @round ht down to a multiple of 4
+ and r11,r7,#6
+ cmp r11,#6
+ beq loop_ht_6
+ tst r12,#7 @checks if 2*wd is a multiple of 8
+ beq core_loop_wd_8
+
+loop_ht_6:
+ sub r11,r12,#4
+ lsls r6,r3,#1
+ cmp r9,#0
+ beq outer_loop_wd_4_ht_2
+
+outer_loop_wd_4:
+ subs r4,r12,#0 @wd conditional subtract
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.8 {d0},[r0] @vld1_u8(pu1_src_tmp)
+ add r5,r0,r2 @pu1_src +src_strd
+ vmovl.u8 q0,d0 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ add r10,r1,r6
+ subs r4,r4,#4 @wd - 4
+ vshl.i64 q0,q0,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d22},[r5],r2 @vld1_u8(pu1_src_tmp)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.64 {d0},[r1] @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add r1,r1,#8
+ vmovl.u8 q11,d22 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vld1.8 {d24},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i64 q11,q11,#6 @vshlq_n_s64(temp, 6)
+ vmovl.u8 q12,d24 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vst1.64 {d22},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ vshl.i64 q12,q12,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d26},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.64 {d24},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ vmovl.u8 q13,d26 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vshl.i64 q13,q13,#6 @vshlq_n_s64(temp, 6)
+ vst1.64 {d26},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r9,r9,#4 @ht - 4
+ sub r0,r5,r11
+ sub r1,r10,r11,lsl #1
+ bgt outer_loop_wd_4
+ cmp r8,#0
+ bgt outer_loop_wd_4_ht_2
+
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+outer_loop_wd_4_ht_2:
+ subs r4,r12,#0 @wd conditional subtract
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4_ht_2:
+ vld1.8 {d0},[r0] @vld1_u8(pu1_src_tmp)
+ add r5,r0,r2 @pu1_src +src_strd
+ vmovl.u8 q0,d0 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ add r10,r1,r6
+ subs r4,r4,#4 @wd - 4
+ vshl.i64 q0,q0,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d22},[r5],r2 @vld1_u8(pu1_src_tmp)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.64 {d0},[r1] @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add r1,r1,#8
+ vmovl.u8 q11,d22 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vld1.8 {d24},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i64 q11,q11,#6 @vshlq_n_s64(temp, 6)
+ vmovl.u8 q12,d24 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vst1.64 {d22},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4_ht_2
+ b end_loops
+
+
+core_loop_wd_8:
+ @sub r11,r12,#8
+ lsls r5,r3,#1
+ rsb r11,r12,r3, lsl #2 @ r11 = (dst_strd * 4) - (2 * wd)
+ rsb r8,r12,r2,lsl #2 @ r8 = (src_strd * 4) - (2 * wd)
+ mov r4,r12, lsr #3 @ divide by 8
+ mov r7,r9
+ mul r7, r4
+ sub r4,r12,#0 @wd conditional check
+ sub r7,r7,#4 @subtract one for epilog
+ cmp r9,#0
+ beq core_loop_wd_8_ht_2
+
+prolog:
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ add r10,r1,r5
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ subs r4,r4,#8 @wd decrements by 8
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+ addle r0,r0,r8
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+ addle r1,r1,r11,lsl #1
+ suble r4,r12,#0 @wd conditional check
+
+ subs r7,r7,#4 @ht - 4
+
+ blt epilog_end @jumps to epilog_end
+ beq epilog @jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ subs r4,r4,#8 @wd decrements by 8
+ addle r0,r0,r8
+
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+ add r10,r1,r5
+
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+
+ addle r1,r1,r11,lsl #1
+ suble r4,r12,#0 @wd conditional check
+
+ subs r7,r7,#4 @ht - 4
+ bgt outer_loop_wd_8
+
+epilog:
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ @add r6,r0,r2 @pu1_src_tmp += src_strd
+
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+ add r10,r1,r5
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ b end_loops
+
+core_loop_wd_8_ht_2:
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ add r10,r1,r5
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ subs r12,r12,#8 @wd decrements by 8
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ bgt core_loop_wd_8_ht_2
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_horz.s b/common/arm/ihevc_inter_pred_chroma_horz.s
new file mode 100644
index 0000000..fbd1be1
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_horz.s
@@ -0,0 +1,684 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_inter_pred_chroma_horz.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded in neon assembly and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs / akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma interprediction filter for horizontal input
+@*
+@* @par description:
+@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@* to the elements pointed to by 'pu1_src' and writes the result to the
+@* location pointed to by 'pu1_dst'. the output is downshifted by 6 and
+@* clipped to 8 bits.
+@* assumptions: width is a multiple of 2, 4 or 8; if width is 2, height
+@* must be a multiple of 2. widths of 4 and 8 take further-optimized paths.
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pu1_dst
+@r2 => src_strd
+@r3 => dst_strd
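+
+@a scalar sketch of the filtering below (illustrative only, not part of
+@the reference code; clip_u8() is a hypothetical helper that clamps to
+@[0, 255]). chroma is interleaved, so each row holds 2*wd samples and
+@the four taps step by 2 bytes:
+@
+@    word32 row, col, k, sum;
+@    for(row = 0; row < ht; row++)
+@        for(col = 0; col < 2 * wd; col++)
+@        {
+@            sum = 0;
+@            for(k = 0; k < 4; k++)
+@                sum += pi1_coeff[k] * pu1_src[row * src_strd + col + 2 * (k - 1)];
+@            pu1_dst[row * dst_strd + col] = clip_u8((sum + 32) >> 6);
+@        }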
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_horz_a9q
+
+.type ihevc_inter_pred_chroma_horz_a9q, %function
+
+ihevc_inter_pred_chroma_horz_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads pi1_coeff
+ ldr r7,[sp,#44] @loads ht
+ ldr r10,[sp,#48] @loads wd
+
+ vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
+ subs r14,r7,#0 @checks for ht == 0
+ vabs.s8 d2,d0 @vabs_s8(coeff)
+ mov r11,#2
+ ble end_loops
+
+ vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub r12,r0,#2 @pu1_src - 2
+ vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+    tst r10,#3                  @checks wd for a multiple of 4
+ mov r5,r10,lsl #1
+
+ vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
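+
+@the hevc chroma filter taps follow a fixed sign pattern (taps 0 and 3
+@non-positive, taps 1 and 2 non-negative), so the code works with their
+@absolute values (d24..d27 hold |tap0|..|tap3|) and re-applies the signs
+@through the choice of multiply-accumulate: vmull/vmlal for the positive
+@taps, vmlsl for the negative ones. this keeps all multiplies in the
+@u8 x u8 -> s16 form; the inline intrinsic comments below should be read
+@against this mapping.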
+
+ bne outer_loop_4
+ cmp r10,#12
+ beq skip_16
+
+ cmp r10,#8
+ bge outer_loop_16
+skip_16:
+ tst r7,#3
+
+ sub r9,r0,#2
+    beq outer_loop_ht_4         @branches when ht is a multiple of 4
+
+ b outer_loop_8
+
+
+outer_loop_16:
+ mov r10,r5 @2wd
+ mul r14,r14,r10
+
+ rsb r6,r3,#16
+
+ add r4,r12,r2
+ mov r9,#10
+ and r0, r12, #31
+ rsb r8,r5,r3,lsl #1
+ pld [r12, r2, lsl #1]
+
+
+
+
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ pld [r4, r2, lsl #1]
+ vld1.u32 {q1},[r12],r11 @vector load pu1_src
+
+ vld1.u32 {q2},[r12],r11 @vector load pu1_src
+
+ vld1.u32 {q3},[r12],r9 @vector load pu1_src
+
+
+ vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q4},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {q5},[r4],r11 @vector load pu1_src
+ vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q6},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q7},[r4],r9 @vector load pu1_src
+ vmull.u8 q14,d3,d25
+
+ vmlsl.u8 q14,d1,d24
+
+
+ vmlal.u8 q14,d5,d26
+
+ vmlsl.u8 q14,d7,d27
+
+
+ cmp r14,#32
+ beq epilog_end
+ sub r14,#64
+
+inner_loop_16:
+
+
+
+
+@ bgt l_2
+
+@ pld [r12, r2, lsl #1]
+@ pld [r4, r2, lsl #1]
+
+
+
+ subs r10,r10,#16
+
+ vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+ addeq r12,r12,r8
+ addeq r4,r12,r2
+ vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+
+
+ pld [r12, r2, lsl #2]
+ vqrshrun.s16 d30,q15,#6
+
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ vqrshrun.s16 d31,q14,#6
+
+
+ vld1.u32 {q1},[r12],r11 @vector load pu1_src
+ vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+
+
+
+ vld1.u32 {q2},[r12],r11 @vector load pu1_src
+ vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vld1.u32 {q3},[r12],r9 @vector load pu1_src
+ vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ pld [r4, r2, lsl #2]
+ vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vst1.16 {q15}, [r1],r3
+ vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {q4},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vld1.u32 {q5},[r4],r11 @vector load pu1_src
+ vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {q6},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {q7},[r4],r9 @vector load pu1_src
+ vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ cmp r10,#0
+ vqrshrun.s16 d22,q11,#6
+ vqrshrun.s16 d23,q10,#6
+
+
+
+ vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ moveq r10,r5 @2wd
+ vmull.u8 q14,d3,d25
+
+
+ vst1.16 {q11},[r1],r6 @store the result pu1_dst
+ vmlsl.u8 q14,d1,d24
+
+
+ addeq r1,r1,r8
+ vmlal.u8 q14,d5,d26
+
+ subs r14,r14,#32 @decrement the ht loop
+ vmlsl.u8 q14,d7,d27
+
+@ mov r0, r7
+
+ bgt inner_loop_16
+
+
+
+ add r14,r14,#64
+ cmp r14,#32
+ beq epilog_end
+
+epilog:
+ vqrshrun.s16 d30,q15,#6
+ vqrshrun.s16 d31,q14,#6
+
+
+
+ vst1.16 {q15}, [r1],r3
+ vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+
+
+ vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ subs r10,r10,#16 @decrement the wd loop
+ vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ addeq r12,r12,r8
+ vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ moveq r10,r5 @2wd
+
+
+ addeq r4,r12,r2
+ vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {q1},[r12],r11 @vector load pu1_src
+ vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q2},[r12],r11 @vector load pu1_src
+ vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q3},[r12],r9 @vector load pu1_src
+ vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+ vld1.u32 {q4},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {q5},[r4],r11 @vector load pu1_src
+ vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {q6},[r4],r11 @vector load pu1_src
+ vmull.u8 q14,d3,d25
+ vld1.u32 {q7},[r4],r9 @vector load pu1_src
+ vmlsl.u8 q14,d1,d24
+ vqrshrun.s16 d22,q11,#6
+ vqrshrun.s16 d23,q10,#6
+
+ vst1.16 {q11},[r1],r6 @store the result pu1_dst
+ vmlal.u8 q14,d5,d26
+
+ vmlsl.u8 q14,d7,d27
+ addeq r1,r1,r8
+
+
+
+epilog_end:
+ vqrshrun.s16 d30,q15,#6
+ vqrshrun.s16 d31,q14,#6
+
+
+ vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vqrshrun.s16 d22,q11,#6
+ vqrshrun.s16 d23,q10,#6
+
+
+ vst1.16 {q15}, [r1],r3
+
+ vst1.16 {q11},[r1] @store the result pu1_dst
+
+
+
+ b end_loops
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+outer_loop_8:
+
+
+ add r6,r1,r3 @pu1_dst + dst_strd
+ mov r7,r5
+ add r4,r12,r2 @pu1_src + src_strd
+
+
+inner_loop_8:
+ @vld1.u32 {d0,d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @vector load pu1_src
+ vld1.u32 {d3},[r12],r11 @vector load pu1_src
+
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+ vmlal.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q4,d3,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {d4},[r4],r11 @vector load pu1_src
+ vld1.u32 {d5},[r4],r11 @vector load pu1_src
+ vld1.u32 {d6},[r4],r11 @vector load pu1_src
+ vld1.u32 {d7},[r4],r11 @vector load pu1_src
+ @vld1.u32 {d12,d13},[r4],r11 @vector load pu1_src + src_strd
+ @vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
+ vmull.u8 q5,d5,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q5,d4,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ @vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+ vqrshrun.s16 d8,q4,#6 @right shift and saturating narrow result 1
+ vmlal.u8 q5,d6,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q5,d7,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.8 {d8},[r1]! @store the result pu1_dst
+
+ vqrshrun.s16 d10,q5,#6 @right shift and saturating narrow result 2
+ subs r7,r7,#8 @decrement the wd loop
+ vst1.8 {d10},[r6]! @store the result pu1_dst
+ bgt inner_loop_8
+
+ sub r12,r12,r5
+ subs r14,r14,#2 @decrement the ht loop
+ sub r1,r1,r5
+ add r12,r12,r2,lsl #1
+ add r1,r1,r3,lsl #1
+ bgt outer_loop_8
+ b end_loops
+
+@handles heights that are a multiple of 4
+outer_loop_ht_4:
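+@this path is software-pipelined: prologue_ht_4 below primes the loads
+@and multiply-accumulates for the first block of four rows, core_loop
+@overlaps the stores of one block with the loads and arithmetic of the
+@next, and epilogue drains the last block in flight.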
+
+ mov r7,r5
+
+prologue_ht_4:
+
+inner_loop_ht_4:
+
+ mov r12,r9
+ mov r4,r1
+
+ sub r8, r2, #6
+
+ vld1.u32 {d0},[r12],r11 @(1)vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @(1)vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @(1)vector load pu1_src
+ @vld1.u32 {d3},[r12],r2 @(1)vector load pu1_src
+ vld1.u32 {d3},[r12],r8 @(1)vector load pu1_src
+
+ @sub r12, r12, #6 @(1)
+
+ vld1.u32 {d4},[r12],r11 @(2)vector load pu1_src
+ vld1.u32 {d5},[r12],r11 @(2)vector load pu1_src
+ vld1.u32 {d6},[r12],r11 @(2)vector load pu1_src
+ @vld1.u32 {d7},[r12],r2 @(2)vector load pu1_src
+ vld1.u32 {d7},[r12],r8 @(2)vector load pu1_src
+
+ @sub r12, r12, #6 @(2)
+
+ vld1.u32 {d14},[r12],r11 @(3)vector load pu1_src
+ vmull.u8 q4,d1,d25 @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d15},[r12],r11 @(3)vector load pu1_src
+ vmlsl.u8 q4,d0,d24 @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d16},[r12],r11 @(3)vector load pu1_src
+ vmlal.u8 q4,d2,d26 @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ @vld1.u32 {d17},[r12],r2 @(3)vector load pu1_src
+ vld1.u32 {d17},[r12],r8 @(3)vector load pu1_src
+ vmlsl.u8 q4,d3,d27 @(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ @sub r12, r12, #6 @(3)
+ vmull.u8 q5,d5,d25 @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d18},[r12],r11 @(4)vector load pu1_src
+ vmlsl.u8 q5,d4,d24 @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d19},[r12],r11 @(4)vector load pu1_src
+ vmlal.u8 q5,d6,d26 @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d20},[r12],r11 @(4)vector load pu1_src
+ vmlsl.u8 q5,d7,d27 @(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {d21},[r12],r2 @(4)vector load pu1_src
+ vqrshrun.s16 d8,q4,#6 @(1)right shift and saturating narrow result 1
+
+ add r9,r9,#8 @(core loop)
+
+ subs r7,r7,#8 @(prologue)decrement the wd loop
+ beq epilogue
+
+core_loop:
+ mov r12,r9
+
+ vld1.u32 {d0},[r12],r11 @(1_1)vector load pu1_src
+ vmull.u8 q6,d15,d25 @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d1},[r12],r11 @(1_1)vector load pu1_src
+ vmlsl.u8 q6,d14,d24 @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d2},[r12],r11 @(1_1)vector load pu1_src
+ vmlal.u8 q6,d16,d26 @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ @vld1.u32 {d3},[r12],r2 @(1_1)vector load pu1_src
+ vld1.u32 {d3},[r12],r8 @(1_1)vector load pu1_src
+ vmlsl.u8 q6,d17,d27 @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ @sub r12, r12, #6 @(1_1)
+
+ vst1.8 {d8},[r4],r3 @(1)store the result pu1_dst
+ vqrshrun.s16 d10,q5,#6 @(2)right shift and saturating narrow result 2
+
+ vld1.u32 {d4},[r12],r11 @(2_1)vector load pu1_src
+ vmull.u8 q11,d19,d25 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d5},[r12],r11 @(2_1)vector load pu1_src
+ vmlsl.u8 q11,d18,d24 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d6},[r12],r11 @(2_1)vector load pu1_src
+ vmlal.u8 q11,d20,d26 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ @vld1.u32 {d7},[r12],r2 @(2_1)vector load pu1_src
+ vld1.u32 {d7},[r12],r8 @(2_1)vector load pu1_src
+ vmlsl.u8 q11,d21,d27 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ @sub r12, r12, #6 @(2_1)
+
+ vst1.8 {d10},[r4],r3 @(2)store the result pu1_dst
+ vqrshrun.s16 d12,q6,#6 @(3)right shift and saturating narrow result 1
+
+ vld1.u32 {d14},[r12],r11 @(3_1)vector load pu1_src
+ vmull.u8 q4,d1,d25 @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d15},[r12],r11 @(3_1)vector load pu1_src
+ vmlsl.u8 q4,d0,d24 @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d16},[r12],r11 @(3_1)vector load pu1_src
+ vmlal.u8 q4,d2,d26 @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ @vld1.u32 {d17},[r12],r2 @(3_1)vector load pu1_src
+ vld1.u32 {d17},[r12],r8 @(3_1)vector load pu1_src
+ vmlsl.u8 q4,d3,d27 @(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ @sub r12, r12, #6 @(3_1)
+
+ vst1.8 {d12},[r4],r3 @(3)store the result pu1_dst
+ vqrshrun.s16 d22,q11,#6 @(4)right shift and saturating narrow result 2
+
+ add r9,r9,#8 @(core loop)
+
+ vmull.u8 q5,d5,d25 @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {d18},[r12],r11 @(4_1)vector load pu1_src
+
+ vld1.u32 {d19},[r12],r11 @(4_1)vector load pu1_src
+ vmlsl.u8 q5,d4,d24 @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d20},[r12],r11 @(4_1)vector load pu1_src
+ vmlal.u8 q5,d6,d26 @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d21},[r12],r2 @(4_1)vector load pu1_src
+ vmlsl.u8 q5,d7,d27 @(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ add r1,r1,#8 @(core loop)
+
+ subs r7,r7,#8 @(core loop)
+
+ vst1.8 {d22}, [r4], r3 @(4)store the result pu1_dst
+ vqrshrun.s16 d8,q4,#6 @(1_1)right shift and saturating narrow result 1
+
+ mov r4, r1 @(core loop)
+
+ bgt core_loop @loopback
+
+epilogue:
+ vmull.u8 q6,d15,d25 @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vmlsl.u8 q6,d14,d24 @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vmlal.u8 q6,d16,d26 @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vmlsl.u8 q6,d17,d27 @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.8 {d8},[r4],r3 @(1)store the result pu1_dst
+ vqrshrun.s16 d10,q5,#6 @(2)right shift and saturating narrow result 2
+
+ vmull.u8 q11,d19,d25 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q11,d18,d24 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vmlal.u8 q11,d20,d26 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vmlsl.u8 q11,d21,d27 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.8 {d10},[r4],r3 @(2)store the result pu1_dst
+ vqrshrun.s16 d12,q6,#6 @(3)right shift and saturating narrow result 1
+
+ vst1.8 {d12},[r4],r3 @(3)store the result pu1_dst
+
+ add r1,r1,#8 @(core loop)
+
+ vqrshrun.s16 d22,q11,#6 @(4)right shift and saturating narrow result 2
+
+
+ vst1.8 {d22}, [r4], r3 @(4)store the result pu1_dst
+
+ sub r9,r9,r5
+ subs r14,r14,#4 @decrement the ht loop
+ sub r1,r1,r5
+ add r9,r9,r2,lsl #2
+ add r1,r1,r3,lsl #2
+ bgt outer_loop_ht_4
+ b end_loops
+
+outer_loop_4:
+ add r6,r1,r3 @pu1_dst + dst_strd
+ mov r7,r5
+ add r4,r12,r2 @pu1_src + src_strd
+
+inner_loop_4:
+ @vld1.u32 {d0,d1},[r12] @vector load pu1_src
+
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @vector load pu1_src
+ vld1.u32 {d3},[r12] @vector load pu1_src
+
+    sub     r12,r12,#2          @rewind the input pointer by 2
+ vld1.u32 {d4},[r4],r11 @vector load pu1_src
+ vld1.u32 {d5},[r4],r11 @vector load pu1_src
+ vld1.u32 {d6},[r4],r11 @vector load pu1_src
+ vld1.u32 {d7},[r4] @vector load pu1_src
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @vld1.u32 {d12,d13},[r4] @vector load pu1_src + src_strd
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+
+    sub     r4,r4,#2            @rewind the input pointer by 2
+ @vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
+ @vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+
+    vzip.32 d0,d4               @zip the 1st and 2nd iteration rows into a single register
+ vzip.32 d1,d5
+ vzip.32 d2,d6
+ vzip.32 d3,d7
+
+ vmull.u8 q4,d1,d25 @arithmetic operations for ii iteration in the same time
+ vmlsl.u8 q4,d0,d24
+ vmlal.u8 q4,d2,d26
+ vmlsl.u8 q4,d3,d27
+
+ vqrshrun.s16 d8,q4,#6 @narrow right shift and saturating the result
+ vst1.32 {d8[0]},[r1]! @store the i iteration result which is in upper part of the register
+ subs r7,r7,#4 @decrement the wd by 4
+
+ vst1.32 {d8[1]},[r6]! @store the ii iteration result which is in lower part of the register
+
+ bgt inner_loop_4
+
+ sub r12,r12,r5
+ subs r14,r14,#2 @decrement the ht by 2
+ sub r1,r1,r5
+ add r12,r12,r2,lsl #1
+ add r1,r1,r3,lsl #1
+ bgt outer_loop_4
+
+end_loops:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
new file mode 100644
index 0000000..f95937c
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s
@@ -0,0 +1,719 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_horz_w16out.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using
+@*  rvct
+@*
+@* @author
+@* yogeswaran rs / akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@*   chroma interprediction filter to store horizontal 16bit output
+@*
+@* @par description:
+@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@*    to the elements pointed to by 'pu1_src' and writes to the location
+@*    pointed to by 'pi2_dst'. no downshifting or clipping is done; the
+@*    output is used as an input for vertical filtering or weighted
+@*    prediction
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@* word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 => src_strd
+@r3 => dst_strd
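+
+@a scalar sketch of the filtering below (illustrative only, not part of
+@the reference code): the same interleaved 4-tap filter as
+@ihevc_inter_pred_chroma_horz, but the raw 16-bit accumulator is stored
+@without any rounding, shift or clip:
+@
+@    word32 row, col, k, sum;
+@    for(row = 0; row < ht; row++)
+@        for(col = 0; col < 2 * wd; col++)
+@        {
+@            sum = 0;
+@            for(k = 0; k < 4; k++)
+@                sum += pi1_coeff[k] * pu1_src[row * src_strd + col + 2 * (k - 1)];
+@            pi2_dst[row * dst_strd + col] = (word16)sum;
+@        }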
+
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_horz_w16out_a9q
+
+
+.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_horz_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads pi1_coeff
+ ldr r6,[sp,#44] @loads ht
+ ldr r10,[sp,#48] @loads wd
+
+ vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
+ subs r14,r6,#0 @checks for ht == 0
+ vabs.s8 d2,d0 @vabs_s8(coeff)
+
+@******* added
+ mov r11, #2
+@******* added ends
+
+ ble end_loops
+
+ vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub r12,r0,#2 @pu1_src - 2
+ vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+ tst r10,#3 @checks wd for multiples of 4
+ mov r5,r10,lsl #1 @2wd
+
+ vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+
+ and r7,r14,#1 @added @calculating ht_residue ht_residue = (ht & 1)
+ sub r14,r14,r7 @added @decrement height by ht_residue(residue value is calculated outside)
+
+ bne outer_loop_4 @ this branching happens when the width is 2 or 6
+
+ cmp r10,#12
+ beq skip_16
+
+ cmp r10,#8
+ bge outer_loop_16
+
+skip_16:
+ tst r6,#3
+
+@******* removal
+ @mov r11,#8
+@******* removal ends
+
+ sub r9,r0,#2
+    beq outer_loop_ht_4         @this branching happens when the height is a multiple of 4
+
+
+
+@ cmp r10,#12
+@ beq outer_loop_8
+@ cmp r10,#16
+@ bge outer_loop_16
+ b outer_loop_8
+
+
+
+outer_loop_16:
+ add r4,r12,r2
+
+
+ and r0, r12, #31
+ pld [r12, r2, lsl #1]
+
+
+
+
+
+
+
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ mov r10,r5 @2wd
+ mul r14,r14,r10
+ vld1.u32 {q1},[r12],r11 @vector load pu1_src
+ pld [r4, r2, lsl #1]
+ mov r9,#10
+ vld1.u32 {q2},[r12],r11 @vector load pu1_src
+ rsb r6,r3,#8
+ sub r8,r3,#8
+ vld1.u32 {q3},[r12],r9 @vector load pu1_src
+
+
+ vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q4},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {q5},[r4],r11 @vector load pu1_src
+ vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q6},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q7},[r4],r9 @vector load pu1_src
+ vmull.u8 q14,d3,d25
+ lsl r6,#1
+ rsb r3,r5,r3,lsl #1
+ vmlsl.u8 q14,d1,d24
+ lsl r8,#1
+ rsb r7,r5,r2,lsl #1
+ vmlal.u8 q14,d5,d26
+
+ vmlsl.u8 q14,d7,d27
+ cmp r14,#32
+ beq epilog_end
+ sub r14,#64
+
+inner_loop_16:
+
+ @ and r7, r12, #31 @decrement the wd loop
+ @ cmp r7, r0
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+
+
+ subs r10,r10,#16
+
+ vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+
+@ addeq r12,r12,r2,lsl #1
+@ subeq r12,r12,r5
+ addeq r12,r12,r7
+ addeq r4,r12,r2
+
+
+ vst1.16 {q15}, [r1]!
+ vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+
+
+
+
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+
+
+
+ vld1.u32 {q1},[r12],r11 @vector load pu1_src
+ vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vld1.u32 {q2},[r12],r11 @vector load pu1_src
+ vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vst1.16 {q14}, [r1],r8
+ vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {q3},[r12],r9 @vector load pu1_src
+ vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {q4},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vld1.u32 {q5},[r4],r11 @vector load pu1_src
+ vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {q6},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {q7},[r4],r9 @vector load pu1_src
+ vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vst1.16 {q11},[r1]! @store the result pu1_dst
+ vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ moveq r10,r5 @2wd
+ vmull.u8 q14,d3,d25
+
+
+
+ vmlsl.u8 q14,d1,d24
+ vst1.16 {q10},[r1],r6 @store the result pu1_dst
+
+
+ addeq r1,r1,r3,lsl #1
+ vmlal.u8 q14,d5,d26
+
+ subs r14,r14,#32 @decrement the ht loop
+ vmlsl.u8 q14,d7,d27
+
+
+
+@ mov r0, r7
+ bgt inner_loop_16
+
+
+
+ add r14,r14,#64
+ cmp r14,#32
+ beq epilog_end
+
+epilog:
+
+ vst1.16 {q15}, [r1]!
+ vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vst1.16 {q14}, [r1],r8
+
+
+
+ vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ subs r10,r10,#16 @decrement the wd loop
+ vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+@ addeq r12,r12,r2,lsl #1
+ addeq r12,r12,r7
+ vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ @ subeq r12,r12,r5
+ moveq r10,r5 @2wd
+ addeq r4,r12,r2
+ vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {q1},[r12],r11 @vector load pu1_src
+ vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q2},[r12],r11 @vector load pu1_src
+ vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q3},[r12],r9 @vector load pu1_src
+ vmull.u8 q15,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+
+ vld1.u32 {q4},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q15,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {q5},[r4],r11 @vector load pu1_src
+ vmlal.u8 q15,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vmlsl.u8 q15,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {q6},[r4],r11 @vector load pu1_src
+ vmull.u8 q14,d3,d25
+ vld1.u32 {q7},[r4],r9 @vector load pu1_src
+ vmlsl.u8 q14,d1,d24
+ vst1.16 {q11},[r1]! @store the result pu1_dst
+ vmlal.u8 q14,d5,d26
+ vst1.16 {q10},[r1],r6 @store the result pu1_dst
+ vmlsl.u8 q14,d7,d27
+ addeq r1,r1,r3,lsl #1
+
+
+epilog_end:
+
+ vmull.u8 q11,d10,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q11,d8,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q11,d12,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q11,d14,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vmull.u8 q10,d11,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q10,d9,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q10,d13,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q10,d15,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+ vst1.16 {q15}, [r1]!
+ vst1.16 {q14}, [r1],r8
+ vst1.16 {q11},[r1]! @store the result pu1_dst
+ vst1.16 {q10},[r1],r6 @store the result pu1_dst
+
+
+ ldr r6,[sp,#44] @loads ht
+
+ and r7,r6,#1
+
+ cmp r7,#0
+ mov r10,r5
+ addne r12,r12,r2,lsl #1
+ subne r12,r12,r5
+ addne r1,r1,r3,lsl #1
+
+
+ bgt loop_residue_4
+
+ b end_loops
+
+
+
+
+outer_loop_8:
+
+    add     r6,r1,r3,lsl #1     @pi2_dst + dst_strd
+ mov r10,r5 @2wd
+ add r4,r12,r2 @pu1_src + src_strd
+
+inner_loop_8:
+ @vld1.u32 {d0,d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @vector load pu1_src
+ vld1.u32 {d3},[r12],r11 @vector load pu1_src
+
+
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ vmull.u8 q4,d1,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+ vmlal.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q4,d3,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ @vld1.u32 {d12,d13},[r4],r11 @vector load pu1_src + src_strd
+ vld1.u32 {d4},[r4],r11 @vector load pu1_src
+ vld1.u32 {d5},[r4],r11 @vector load pu1_src
+ vld1.u32 {d6},[r4],r11 @vector load pu1_src
+ vld1.u32 {d7},[r4],r11 @vector load pu1_src
+ @vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
+ vmull.u8 q5,d5,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q5,d4,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ @vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+ vmlal.u8 q5,d6,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlsl.u8 q5,d7,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.16 {d8, d9}, [r1]!
+
+ subs r10,r10,#8 @decrement the wd loop
+ vst1.16 {d10, d11},[r6]! @store the result pu1_dst
+ bgt inner_loop_8
+
+ sub r12,r12,r5
+ subs r14,r14,#2 @decrement the ht loop
+ sub r1,r1,r5,lsl #1
+ add r12,r12,r2,lsl #1
+ add r1,r1,r3,lsl #2
+ bgt outer_loop_8
+
+ cmp r7,#0
+ mov r10,r5
+ bgt loop_residue_4
+
+ b end_loops
+
+
+
+@handles heights that are a multiple of 4
+outer_loop_ht_4:
+
+ mov r10,r5
+
+prologue_ht_4:
+ mov r8,r3,lsl #1
+
+inner_loop_ht_4:
+
+ mov r12,r9
+ mov r4,r1
+
+ sub r0, r2, #6 @ not sure if r0 needs to be preserved
+
+ vld1.u32 {d0},[r12],r11 @(1)vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @(1)vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @(1)vector load pu1_src
+ vld1.u32 {d3},[r12],r0 @(1)vector load pu1_src
+
+ vld1.u32 {d4},[r12],r11 @(2)vector load pu1_src
+ vld1.u32 {d5},[r12],r11 @(2)vector load pu1_src
+ vld1.u32 {d6},[r12],r11 @(2)vector load pu1_src
+ vld1.u32 {d7},[r12],r0 @(2)vector load pu1_src
+
+ vld1.u32 {d14},[r12],r11 @(3)vector load pu1_src
+ vmull.u8 q4,d1,d25 @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d15},[r12],r11 @(3)vector load pu1_src
+ vmlsl.u8 q4,d0,d24 @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d16},[r12],r11 @(3)vector load pu1_src
+ vmlal.u8 q4,d2,d26 @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d17},[r12],r0 @(3)vector load pu1_src
+ vmlsl.u8 q4,d3,d27 @(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {d18},[r12],r11 @(4)vector load pu1_src
+ vmull.u8 q5,d5,d25 @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d19},[r12],r11 @(4)vector load pu1_src
+ vmlsl.u8 q5,d4,d24 @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d20},[r12],r11 @(4)vector load pu1_src
+ vmlal.u8 q5,d6,d26 @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d21},[r12],r2 @(4)vector load pu1_src
+ vmlsl.u8 q5,d7,d27 @(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ add r9,r9,#8 @(core loop)
+
+ subs r10,r10,#8 @(prologue)decrement the wd loop
+ beq epilogue
+
+core_loop:
+ vst1.16 {d8, d9},[r4],r8 @(1)store the result pu1_dst
+ mov r12,r9
+
+ vld1.u32 {d0},[r12],r11 @(1_1)vector load pu1_src
+ vmull.u8 q6,d15,d25 @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d1},[r12],r11 @(1_1)vector load pu1_src
+ vmlsl.u8 q6,d14,d24 @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d2},[r12],r11 @(1_1)vector load pu1_src
+ vmlal.u8 q6,d16,d26 @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d3},[r12],r0 @(1_1)vector load pu1_src
+ vmlsl.u8 q6,d17,d27 @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.16 {d10, d11},[r4],r8 @(2)store the result pu1_dst
+ add r9,r9,#8 @(core loop)
+
+ vld1.u32 {d4},[r12],r11 @(2_1)vector load pu1_src
+ vmull.u8 q11,d19,d25 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d5},[r12],r11 @(2_1)vector load pu1_src
+ vmlsl.u8 q11,d18,d24 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d6},[r12],r11 @(2_1)vector load pu1_src
+ vmlal.u8 q11,d20,d26 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d7},[r12],r0 @(2_1)vector load pu1_src
+ vmlsl.u8 q11,d21,d27 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.16 {d12, d13},[r4],r8 @(3)store the result pu1_dst
+ add r1,r1,#16 @(core loop)
+
+ vld1.u32 {d14},[r12],r11 @(3_1)vector load pu1_src
+ vmull.u8 q4,d1,d25 @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vld1.u32 {d15},[r12],r11 @(3_1)vector load pu1_src
+ vmlsl.u8 q4,d0,d24 @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d16},[r12],r11 @(3_1)vector load pu1_src
+ vmlal.u8 q4,d2,d26 @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vld1.u32 {d17},[r12],r0 @(3_1)vector load pu1_src
+ vmlsl.u8 q4,d3,d27 @(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.16 {d22, d23}, [r4], r8 @(4)store the result pu1_dst
+ subs r10,r10,#8 @(core loop)
+
+ vmull.u8 q5,d5,d25 @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {d18},[r12],r11 @(4_1)vector load pu1_src
+
+ vld1.u32 {d19},[r12],r11 @(4_1)vector load pu1_src
+ vmlsl.u8 q5,d4,d24 @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vld1.u32 {d20},[r12],r11 @(4_1)vector load pu1_src
+ vmlal.u8 q5,d6,d26 @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ mov r4, r1 @(core loop)
+
+ vld1.u32 {d21},[r12],r0 @(4_1)vector load pu1_src
+ vmlsl.u8 q5,d7,d27 @(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+
+ bgt core_loop @loopback
+
+epilogue:
+ vmull.u8 q6,d15,d25 @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ vmlsl.u8 q6,d14,d24 @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vmlal.u8 q6,d16,d26 @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vmlsl.u8 q6,d17,d27 @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.16 {d8, d9},[r4], r8 @(1)store the result pu1_dst
+
+ vmull.u8 q11,d19,d25 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vmlsl.u8 q11,d18,d24 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ vmlal.u8 q11,d20,d26 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ vmlsl.u8 q11,d21,d27 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vst1.16 {d10, d11},[r4], r8 @(2)store the result pu1_dst
+
+ vst1.16 {d12, d13},[r4], r8 @(3)store the result pu1_dst
+
+ add r1,r1,#16 @(core loop)
+
+ vst1.16 {d22, d23},[r4], r8 @(4)store the result pu1_dst
+
+ sub r9,r9,r5
+ subs r14,r14,#4 @decrement the ht loop
+ sub r1,r1,r5,lsl #1
+ add r9,r9,r2,lsl #2
+ add r1,r1,r3,lsl #3
+ bgt outer_loop_ht_4
+
+ cmp r7,#0
+ mov r10,r5
+ movgt r12,r9
+ movgt r4,r1
+ bgt loop_residue_4
+
+ b end_loops
+
+outer_loop_4:
+ add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
+ mov r10,r5
+ add r4,r12,r2 @pu1_src + src_strd
+
+inner_loop_4:
+ @vld1.u32 {d0,d1},[r12] @vector load pu1_src
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @vector load pu1_src
+ vld1.u32 {d3},[r12] @vector load pu1_src
+
+@**** removal
+ @add r12,r12,#4 @increment the input pointer
+@**** removal ends
+@**** addn
+    sub     r12,r12,#2          @rewind the input pointer by 2
+@**** addn ends
+ vld1.u32 {d4},[r4],r11 @vector load pu1_src
+ vld1.u32 {d5},[r4],r11 @vector load pu1_src
+ vld1.u32 {d6},[r4],r11 @vector load pu1_src
+ vld1.u32 {d7},[r4] @vector load pu1_src
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @vld1.u32 {d12,d13},[r4] @vector load pu1_src + src_strd
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+
+ @add r4,r4,#4 @increment the input pointer
+ sub r4,r4,#2
+ @vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
+ @vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+
+@**** removal
+ @vzip.32 d0,d12 @vector zip the i iteration and ii interation in single register
+ @vzip.32 d2,d14
+ @vzip.32 d4,d16
+ @vzip.32 d6,d18
+@**** removal ends
+@**** addn
+    vzip.32 d0,d4               @zip the 1st and 2nd iteration rows into a single register
+ vzip.32 d1,d5
+ vzip.32 d2,d6
+ vzip.32 d3,d7
+@**** addn ends
+
+ vmull.u8 q4,d1,d25 @arithmetic operations for ii iteration in the same time
+ vmlsl.u8 q4,d0,d24
+ vmlal.u8 q4,d2,d26
+ vmlsl.u8 q4,d3,d27
+
+ vst1.32 {d8},[r1]! @store the i iteration result which is in upper part of the register
+ subs r10,r10,#4 @decrement the wd by 4
+
+ vst1.32 {d9},[r6]! @store the ii iteration result which is in lower part of the register
+
+ bgt inner_loop_4
+
+ sub r12,r12,r5
+ subs r14,r14,#2 @decrement the ht by 2
+ sub r1,r1,r5,lsl #1
+ add r12,r12,r2,lsl #1
+ add r1,r1,r3,lsl #2
+ bgt outer_loop_4
+
+ cmp r7,#0
+ mov r10,r5
+ beq end_loops
+
+loop_residue_4:
+
+ mov r10,r5 @2wd
+
+loop_residue:
+
+ @vld1.u32 {d0,d1},[r12] @vector load pu1_src
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11 @vector load pu1_src
+ vld1.u32 {d2},[r12],r11 @vector load pu1_src
+ vld1.u32 {d3},[r12] @vector load pu1_src
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @vmull.u8 q4,d2,d25 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ @vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @add r12,r12,#4 @pu1_src + 4
+ sub r12, r12, #2
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+ @vmlal.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ @vmlsl.u8 q4,d6,d27 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vmull.u8 q4,d1,d25
+ vmlsl.u8 q4,d0,d24
+ vmlal.u8 q4,d2,d26
+ vmlsl.u8 q4,d3,d27
+
+    vst1.64 {d8},[r1]           @store the result pi2_dst
+ subs r10,r10,#4 @decrement the wd loop
+ add r1,r1,#8 @pi2_dst + 8
+
+ bgt loop_residue @loop again
+
+ @inner loop ends
+ @add r8,r3,lsl #1 @2*dst_strd
+ @sub r8,r8,r5,lsl #1 @2*dst_strd - 2wd
+ @sub r9,r2,r5 @src_strd - 2wd
+ @subs r7,r7,#1 @decrement the ht loop
+ @add r12,r12,r9 @pu1_src + src_strd
+ @add r1,r1,r8 @pu1_dst + 2*dst_strd
+ @bgt outer_loop_residue_4 @loop again
+ @b end_loops @jumps to end
+
+end_loops:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert.s b/common/arm/ihevc_inter_pred_chroma_vert.s
new file mode 100644
index 0000000..e786497
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert.s
@@ -0,0 +1,383 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_vert.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using
+@*  rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma interprediction filter for vertical input
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*    the elements pointed to by 'pu1_src' and writes to the location pointed
+@*    to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+@*    assumptions: the function is optimized assuming the width is a multiple
+@*    of 2, 4 or 8 and the height is a multiple of 2; widths of 4 and 8 are
+@*    optimized further
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 => src_strd
+@r3 => dst_strd
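+
+@a scalar sketch of the filtering below (illustrative only, not part of
+@the reference code; clip_u8() is a hypothetical helper that clamps to
+@[0, 255]). the four taps step vertically by src_strd:
+@
+@    word32 row, col, k, sum;
+@    for(row = 0; row < ht; row++)
+@        for(col = 0; col < 2 * wd; col++)
+@        {
+@            sum = 0;
+@            for(k = 0; k < 4; k++)
+@                sum += pi1_coeff[k] * pu1_src[(row + k - 1) * src_strd + col];
+@            pu1_dst[row * dst_strd + col] = clip_u8((sum + 32) >> 6);
+@        }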
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_a9q
+
+.type ihevc_inter_pred_chroma_vert_a9q, %function
+
+ihevc_inter_pred_chroma_vert_a9q:
+
+ stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#44] @loads ht
+ ldr r12,[sp,#40] @loads pi1_coeff
+ cmp r4,#0 @checks ht == 0
+ ldr r6,[sp,#48] @loads wd
+ sub r0,r0,r2 @pu1_src - src_strd
+ vld1.8 {d0},[r12] @loads pi1_coeff
+
+ ble end_loops @jumps to end
+
+ tst r6,#3 @checks (wd & 3)
+ vabs.s8 d3,d0 @vabs_s8(coeff)
+ lsl r10,r6,#1 @2*wd
+ vdup.8 d0,d3[0] @coeffabs_0
+ vdup.8 d1,d3[1] @coeffabs_1
+ vdup.8 d2,d3[2] @coeffabs_2
+ vdup.8 d3,d3[3] @coeffabs_3
+
+    bgt outer_loop_wd_2         @jumps to the loop handling wd not a multiple of 4 (wd == 2 or 6)
+
+ tst r4,#7 @checks ht for mul of 8
+ beq core_loop_ht_8 @when height is multiple of 8
+
+ lsl r7,r3,#1 @2*dst_strd
+ sub r9,r7,r10 @2*dst_strd - 2wd
+ lsl r12,r2,#1 @2*src_strd
+ sub r8,r12,r10 @2*src_strd - 2wd
+ mov r5,r10 @2wd
+
+inner_loop_ht_2: @called when wd is multiple of 4 and ht is 4,2
+
+ add r6,r0,r2 @pu1_src +src_strd
+ vld1.8 {d9},[r6],r2 @loads pu1_src
+ subs r5,r5,#8 @2wd - 8
+ vld1.8 {d5},[r0]! @loads src
+ vmull.u8 q3,d9,d1 @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ vld1.8 {d4},[r6],r2 @loads incremented src
+ vmlsl.u8 q3,d5,d0 @vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+ vld1.8 {d8},[r6],r2 @loads incremented src
+ vmlal.u8 q3,d4,d2 @vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+ vmull.u8 q2,d4,d1
+ vmlsl.u8 q3,d8,d3
+ vmlsl.u8 q2,d9,d0
+ vld1.8 {d10},[r6] @loads the incremented src
+ vmlal.u8 q2,d8,d2
+ vqrshrun.s16 d6,q3,#6 @shifts right
+ vmlsl.u8 q2,d10,d3
+ add r6,r1,r3 @pu1_dst + dst_strd
+ vqrshrun.s16 d4,q2,#6 @shifts right
+ vst1.8 {d6},[r1]! @stores the loaded value
+
+ vst1.8 {d4},[r6] @stores the loaded value
+
+ bgt inner_loop_ht_2 @inner loop again
+
+ subs r4,r4,#2 @ht - 2
+ add r1,r1,r9 @pu1_dst += (2*dst_strd - 2wd)
+ mov r5,r10 @2wd
+ add r0,r0,r8 @pu1_src += (2*src_strd - 2wd)
+
+ bgt inner_loop_ht_2 @loop again
+
+ b end_loops @jumps to end
+
+outer_loop_wd_2: @called when width is multiple of 2
+ lsl r5,r3,#1 @2*dst_strd
+ mov r12,r10 @2wd
+ sub r9,r5,r10 @2*dst_strd - 2wd
+ lsl r7,r2,#1 @2*src_strd
+ sub r8,r7,r10 @2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+ add r6,r0,r2 @pu1_src + src_strd
+ vld1.32 {d6[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0
+ subs r12,r12,#4 @2wd - 4
+ add r0,r0,#4 @pu1_src + 4
+ vld1.32 {d6[1]},[r6],r2 @loads pu1_src_tmp
+ vdup.32 d7,d6[1]
+ vld1.32 {d7[1]},[r6],r2 @loads pu1_src_tmp
+ vmull.u8 q2,d7,d1 @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ vdup.32 d7,d7[1]
+ vld1.32 {d7[1]},[r6],r2
+ vmlsl.u8 q2,d6,d0
+ vmlal.u8 q2,d7,d2
+ vdup.32 d7,d7[1]
+ vld1.32 {d7[1]},[r6]
+ add r6,r1,r3 @pu1_dst + dst_strd
+ vmlsl.u8 q2,d7,d3
+ vqrshrun.s16 d4,q2,#6 @vrshrq_n_s16(vreinterpretq_s16_u16(mul_res1),6)
+ vst1.32 {d4[0]},[r1] @stores the loaded value
+ add r1,r1,#4 @pu1_dst += 4
+ vst1.32 {d4[1]},[r6] @stores the loaded value
+
+ bgt inner_loop_wd_2 @inner loop again
+
+ @inner loop ends
+ subs r4,r4,#2 @ht - 2
+ add r1,r1,r9 @pu1_dst += 2*dst_strd - 2*wd
+ mov r12,r10 @2wd
+ add r0,r0,r8 @pu1_src += 2*src_strd - 2*wd
+
+ bgt inner_loop_wd_2 @loop again
+
+ b end_loops @jumps to end
+
+core_loop_ht_8:                 @when ht is a multiple of 8 (wd a multiple of 4)
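+@this path is software-pipelined across groups of four rows: prolog
+@below primes the loads and multiply-accumulates, kernel_8 overlaps the
+@stores of one group with the loads and arithmetic of the next, and
+@epilog drains the final group in flight.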
+
+ lsl r12,r3,#2 @4*dst_strd
+ sub r8,r12,r10 @4*dst_strd - 2wd
+ lsl r12,r2,#2 @4*src_strd
+ sub r9,r12,r10 @4*src_strd - 2wd
+
+    bic     r5,r10,#7           @r5 = 2wd truncated to a multiple of 8
+ mov r14,r10,lsr #3 @divide by 8
+ mul r12,r4,r14 @multiply height by width
+    sub     r12,#4              @reserve one iteration for the epilog
+
+prolog:
+ add r6,r0,r2 @pu1_src + src_strd
+ vld1.8 {d5},[r6],r2 @loads pu1_src
+ subs r5,r5,#8 @2wd - 8
+ vld1.8 {d4},[r0]! @loads the source
+ vld1.8 {d6},[r6],r2 @load and increment
+ vmull.u8 q15,d5,d1 @mul with coeff 1
+ vld1.8 {d7},[r6],r2 @load and increment
+ vmlsl.u8 q15,d4,d0
+ add r7,r1,r3 @pu1_dst
+ vmlal.u8 q15,d6,d2
+ vmlsl.u8 q15,d7,d3
+ vld1.8 {d8},[r6],r2 @load and increment
+
+ vmull.u8 q14,d6,d1 @mul_res 2
+    addle       r0,r0,r9        @pu1_src += 4*src_strd - 2*wd
+ vmlsl.u8 q14,d5,d0
+    bicle       r5,r10,#7       @reload r5 = 2wd truncated to a multiple of 8
+ vmlal.u8 q14,d7,d2
+ vld1.8 {d9},[r6],r2
+ vmlsl.u8 q14,d8,d3
+ vqrshrun.s16 d30,q15,#6
+
+ vld1.8 {d10},[r6],r2
+ vmull.u8 q13,d7,d1
+ add r6,r0,r2 @pu1_src + src_strd
+ vmlsl.u8 q13,d6,d0
+ vst1.8 {d30},[r1]! @stores the loaded value
+ vmlal.u8 q13,d8,d2
+ vld1.8 {d4},[r0]! @loads the source
+ vmlsl.u8 q13,d9,d3
+ vqrshrun.s16 d28,q14,#6
+
+    addle       r1,r1,r8        @pu1_dst += 4*dst_strd - 2*wd
+ vmull.u8 q12,d8,d1
+ vld1.8 {d5},[r6],r2 @loads pu1_src
+ vmlsl.u8 q12,d7,d0
+ subs r12,r12,#4
+ vld1.8 {d6},[r6],r2 @load and increment
+ vmlal.u8 q12,d9,d2
+ vld1.8 {d7},[r6],r2 @load and increment
+ vmlsl.u8 q12,d10,d3
+
+ lsl r11,r2,#2
+ vst1.8 {d28},[r7],r3 @stores the loaded value
+ vqrshrun.s16 d26,q13,#6
+ rsb r11,r2,r2,lsl #3
+ add r14,r2,r2,lsl #1
+ add r14,r14,r11
+ ble epilog @jumps to epilog
+
+kernel_8:
+
+ vmull.u8 q15,d5,d1 @mul with coeff 1
+ subs r5,r5,#8 @2wd - 8
+ vmlsl.u8 q15,d4,d0
+    addle       r0,r0,r9        @pu1_src += 4*src_strd - 2*wd
+ vmlal.u8 q15,d6,d2
+ rsble r11,r2,r2,lsl #3
+ vmlsl.u8 q15,d7,d3
+ vst1.8 {d26},[r7],r3 @stores the loaded value
+ vqrshrun.s16 d24,q12,#6
+
+ vld1.8 {d8},[r6],r2 @load and increment
+
+ vmull.u8 q14,d6,d1 @mul_res 2
+ bicle r5,r10,#7 @r5 ->wd
+ vmlsl.u8 q14,d5,d0
+ vst1.8 {d24},[r7],r3 @stores the loaded value
+
+ vmlal.u8 q14,d7,d2
+
+ vld1.8 {d9},[r6],r2
+ vqrshrun.s16 d30,q15,#6
+
+ vmlsl.u8 q14,d8,d3
+ vld1.8 {d10},[r6],r2
+ add r7,r1,r3 @pu1_dst
+ vmull.u8 q13,d7,d1
+ add r6,r0,r2 @pu1_src + src_strd
+
+ pld [r0,r11]
+
+
+ vmlsl.u8 q13,d6,d0
+ vld1.8 {d4},[r0]! @loads the source
+
+ vmlal.u8 q13,d8,d2
+ vst1.8 {d30},[r1]! @stores the loaded value
+
+ vmlsl.u8 q13,d9,d3
+ vld1.8 {d5},[r6],r2 @loads pu1_src
+
+ add r11,r11,r2
+ vqrshrun.s16 d28,q14,#6
+
+ vmull.u8 q12,d8,d1
+ vld1.8 {d6},[r6],r2 @load and increment
+    addle       r1,r1,r8        @pu1_dst += 4*dst_strd - 2*wd
+
+ cmp r11,r14
+ rsbgt r11,r2,r2,lsl #3
+
+ vmlsl.u8 q12,d7,d0
+ subs r12,r12,#4
+
+ vmlal.u8 q12,d9,d2
+ vld1.8 {d7},[r6],r2 @load and increment
+
+ vmlsl.u8 q12,d10,d3
+ vst1.8 {d28},[r7],r3 @stores the loaded value
+ vqrshrun.s16 d26,q13,#6
+
+ bgt kernel_8 @jumps to kernel_8
+
+epilog:
+
+ vmull.u8 q15,d5,d1 @mul with coeff 1
+ vmlsl.u8 q15,d4,d0
+ vmlal.u8 q15,d6,d2
+ vmlsl.u8 q15,d7,d3
+ vst1.8 {d26},[r7],r3 @stores the loaded value
+ vqrshrun.s16 d24,q12,#6
+
+ vld1.8 {d8},[r6],r2 @load and increment
+ vmull.u8 q14,d6,d1 @mul_res 2
+ vmlsl.u8 q14,d5,d0
+ vmlal.u8 q14,d7,d2
+ vmlsl.u8 q14,d8,d3
+ vst1.8 {d24},[r7],r3 @stores the loaded value
+ vqrshrun.s16 d30,q15,#6
+
+ vld1.8 {d9},[r6],r2
+ vmull.u8 q13,d7,d1
+ add r7,r1,r3 @pu1_dst
+ vmlsl.u8 q13,d6,d0
+ vst1.8 {d30},[r1]! @stores the loaded value
+
+ vqrshrun.s16 d28,q14,#6
+ vmlal.u8 q13,d8,d2
+ vld1.8 {d10},[r6],r2
+ vmlsl.u8 q13,d9,d3
+
+ vmull.u8 q12,d8,d1
+ vqrshrun.s16 d26,q13,#6
+ vst1.8 {d28},[r7],r3 @stores the loaded value
+ vmlsl.u8 q12,d7,d0
+ vmlal.u8 q12,d9,d2
+ vst1.8 {d26},[r7],r3 @stores the loaded value
+ vmlsl.u8 q12,d10,d3
+
+ vqrshrun.s16 d24,q12,#6
+ vst1.8 {d24},[r7],r3 @stores the loaded value
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
new file mode 100644
index 0000000..ba2ea8e
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -0,0 +1,342 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_inter_pred_chroma_vert_w16inp.s
+@*
+@* @brief
+@*  contains function definitions for inter prediction interpolation.
+@*  functions are coded using neon intrinsics and can be compiled using
+@*  rvct
+@*
+@* @author
+@* yogeswaran rs / parthiban
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma interprediction filter for 16bit vertical input.
+@*
+@* @par description:
+@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@*    the elements pointed to by 'pi2_src' and writes to the location pointed
+@*    to by 'pu1_dst'. the input is 16 bits; the filter output is downshifted
+@*    by 12 and clipped to lie between 0 and 255.
+@*    assumptions: the function is optimized assuming width and height are
+@*    multiples of 2.
+@*
+@* @param[in] pi2_src
+@* word16 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pi2_src
+@r1 => *pu1_dst
+@r2 => src_strd
+@r3 => dst_strd
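+
+@a scalar sketch of the filtering below (illustrative only, not part of
+@the reference code; clip_s16() and clip_u8() are hypothetical saturating
+@helpers). the result is taken down to 8 bits in two saturating shifts,
+@matching the vqshrn.s32 #6 / vqrshrun.s16 #6 pair used below:
+@
+@    word32 row, col, k, sum;
+@    for(row = 0; row < ht; row++)
+@        for(col = 0; col < 2 * wd; col++)
+@        {
+@            sum = 0;
+@            for(k = 0; k < 4; k++)
+@                sum += pi1_coeff[k] * pi2_src[(row + k - 1) * src_strd + col];
+@            sum = clip_s16(sum >> 6);
+@            pu1_dst[row * dst_strd + col] = clip_u8((sum + 32) >> 6);
+@        }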
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_a9q
+
+.type ihevc_inter_pred_chroma_vert_w16inp_a9q, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4, [sp,#40] @loads pi1_coeff
+ ldr r6, [sp,#48] @wd
+ lsl r2,r2,#1 @src_strd = 2* src_strd
+ ldr r5,[sp,#44] @loads ht
+ vld1.8 {d0},[r4] @loads pi1_coeff
+ sub r4,r0,r2 @pu1_src - src_strd
+    vmovl.s8    q0,d0           @sign-extend the coefficients to 16 bits
+
+    tst     r6,#3               @checks whether wd is a multiple of 4
+ vdup.16 d12,d0[0] @coeff_0
+ vdup.16 d13,d0[1] @coeff_1
+ vdup.16 d14,d0[2] @coeff_2
+ vdup.16 d15,d0[3] @coeff_3
+
+    bgt     core_loop_ht_2      @jumps to the loop handling wd not a multiple of 4
+
+ tst r5,#3 @checks ht == mul of 4
+ beq core_loop_ht_4 @jumps to loop handles ht mul of 4
+
+core_loop_ht_2:
+ lsl r7,r2,#1 @2*src_strd
+ lsl r12,r3,#1 @2*dst_strd
+ lsl r9,r6,#2 @4*wd
+ sub r6,r12,r6,lsl #1 @2*dst_strd - 2*wd
+ sub r8,r7,r9 @2*src_strd - 4*wd
+ mov r12,r9 @4wd
+
+inner_loop_ht_2:
+ add r0,r4,r2 @increments pi2_src
+ vld1.16 {d0},[r4]! @loads pu1_src
+ vmull.s16 q0,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+    subs    r12,r12,#8          @4wd - 8
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmull.s16 q4,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.16 {d3},[r0],r2 @loads pi2_src
+ vmlal.s16 q0,d2,d13
+ vld1.16 {d6},[r0],r2
+ vmlal.s16 q4,d3,d13
+ vld1.16 {d2},[r0]
+ add r7,r1,r3 @pu1_dst + dst_strd
+ vmlal.s16 q0,d3,d14
+ vmlal.s16 q4,d6,d14
+ vmlal.s16 q0,d6,d15
+ vmlal.s16 q4,d2,d15
+ vqshrn.s32 d0,q0,#6 @right shift
+ vqshrn.s32 d30,q4,#6 @right shift
+ vqrshrun.s16 d0,q0,#6 @rounding shift
+ vqrshrun.s16 d30,q15,#6 @rounding shift
+ vst1.32 {d0[0]},[r1]! @stores the loaded value
+ vst1.32 {d30[0]},[r7] @stores the loaded value
+ bgt inner_loop_ht_2 @inner loop -again
+
+ @inner loop ends
+    subs    r5,r5,#2            @decrements ht by 2
+ add r1,r1,r6 @pu1_dst += 2*dst_strd - 2*wd
+ mov r12,r9 @4wd
+ add r4,r4,r8 @pi1_src_tmp1 += 2*src_strd - 4*wd
+ bgt inner_loop_ht_2 @loop again
+
+ b end_loops @jumps to end
+
+core_loop_ht_4:
+    lsl     r7,r2,#2            @4*src_strd
+    lsl     r12,r3,#2           @4*dst_strd
+    mov     r11,r6,lsr #1       @divide by 2
+    sub     lr,r12,r6,lsl #1    @4*dst_strd - 2*wd
+    sub     r8,r7,r6,lsl #2     @4*src_strd - 4*wd
+
+ mul r12,r5,r11 @multiply height by width
+    sub     r12,#4              @reserve one iteration for the epilog
+ mov r11,r6,lsl #1 @2*wd
+
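+@the ht-multiple-of-4 path below is software-pipelined: prolog primes
+@the loads and multiply-accumulates, kernel_4 overlaps the stores of one
+@block with the loads and arithmetic of the next, and epilog drains the
+@last block in flight.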
+prolog:
+ add r0,r4,r2 @increments pi2_src
+ vld1.16 {d0},[r4]! @loads pu1_src
+ vld1.16 {d1},[r0],r2 @loads pi2_src
+ subs r11,r11,#4
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+ vld1.16 {d3},[r0],r2
+ vmlal.s16 q15,d1,d13
+ vmlal.s16 q15,d2,d14
+ add r9,r1,r3 @pu1_dst + dst_strd
+ vmlal.s16 q15,d3,d15
+
+ vld1.16 {d4},[r0],r2
+ vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
+ addle r4,r4,r8
+ vmlal.s16 q14,d2,d13
+ vld1.s16 {d5},[r0],r2
+ vmlal.s16 q14,d3,d14
+ vld1.s16 {d6},[r0],r2
+ vmlal.s16 q14,d4,d15
+ movle r11,r6,lsl #1
+
+ vqshrn.s32 d30,q15,#6 @right shift
+
+ vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ add r0,r4,r2
+ vmlal.s16 q13,d3,d13
+ vmlal.s16 q13,d4,d14
+ vld1.16 {d0},[r4]! @loads pu1_src
+ vmlal.s16 q13,d5,d15
+
+ vqrshrun.s16 d30,q15,#6 @rounding shift
+ vqshrn.s32 d28,q14,#6 @right shift
+
+ vld1.16 {d1},[r0],r2 @loads pi2_src
+ vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
+ vst1.32 {d30[0]},[r1]! @stores the loaded value
+ vmlal.s16 q12,d4,d13
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmlal.s16 q12,d5,d14
+ vld1.16 {d3},[r0],r2
+ vmlal.s16 q12,d6,d15
+ addle r1,r1,lr
+
+ vqshrn.s32 d26,q13,#6 @right shift
+ subs r12,r12,#4
+ vqrshrun.s16 d28,q14,#6 @rounding shift
+
+ beq epilog @jumps to epilog
+
+kernel_4:
+ vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+ subs r11,r11,#4
+ vmlal.s16 q15,d1,d13
+ vst1.32 {d28[0]},[r9],r3 @stores the loaded value
+ vmlal.s16 q15,d2,d14
+ vmlal.s16 q15,d3,d15
+
+ vqshrn.s32 d24,q12,#6 @right shift
+ vqrshrun.s16 d26,q13,#6 @rounding shift
+
+ vld1.16 {d4},[r0],r2
+ vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
+ vmlal.s16 q14,d2,d13
+ vmlal.s16 q14,d3,d14
+ vmlal.s16 q14,d4,d15
+ vst1.32 {d26[0]},[r9],r3 @stores the loaded value
+ addle r4,r4,r8
+ movle r11,r6,lsl #1
+
+ vqshrn.s32 d30,q15,#6 @right shift
+ vqrshrun.s16 d24,q12,#6 @rounding shift
+
+ vld1.s16 {d5},[r0],r2
+ vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.s16 {d6},[r0],r2
+ vmlal.s16 q13,d3,d13
+ vst1.32 {d24[0]},[r9] @stores the loaded value
+ add r0,r4,r2
+ vmlal.s16 q13,d4,d14
+ vld1.16 {d0},[r4]! @loads pu1_src
+ vmlal.s16 q13,d5,d15
+
+ vqshrn.s32 d28,q14,#6 @right shift
+ vqrshrun.s16 d30,q15,#6 @rounding shift
+
+ vld1.16 {d1},[r0],r2 @loads pi2_src
+ vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
+ add r9,r1,r3 @pu1_dst + dst_strd
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmlal.s16 q12,d4,d13
+ vld1.16 {d3},[r0],r2
+ vmlal.s16 q12,d5,d14
+
+ vst1.32 {d30[0]},[r1]! @stores the loaded value
+ vmlal.s16 q12,d6,d15
+
+ vqshrn.s32 d26,q13,#6 @right shift
+ vqrshrun.s16 d28,q14,#6 @rounding shift
+ addle r1,r1,lr
+
+ subs r12,r12,#4
+
+ bgt kernel_4 @jumps to kernel_4
+
+epilog:
+ vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+ vst1.32 {d28[0]},[r9],r3 @stores the loaded value
+ vmlal.s16 q15,d1,d13
+ vmlal.s16 q15,d2,d14
+ vmlal.s16 q15,d3,d15
+
+ vqshrn.s32 d24,q12,#6 @right shift
+ vqrshrun.s16 d26,q13,#6 @rounding shift
+
+ vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.16 {d4},[r0],r2
+ vmlal.s16 q14,d2,d13
+ vst1.32 {d26[0]},[r9],r3 @stores the loaded value
+ vmlal.s16 q14,d3,d14
+ vmlal.s16 q14,d4,d15
+
+ vqshrn.s32 d30,q15,#6 @right shift
+ vqrshrun.s16 d24,q12,#6 @rounding shift
+
+ vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.s16 {d5},[r0],r2
+ vmlal.s16 q13,d3,d13
+ vmlal.s16 q13,d4,d14
+ vmlal.s16 q13,d5,d15
+
+ vqshrn.s32 d28,q14,#6 @right shift
+ vqrshrun.s16 d30,q15,#6 @rounding shift
+
+ vst1.32 {d24[0]},[r9] @stores the loaded value
+ vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
+ vmlal.s16 q12,d4,d13
+ add r9,r1,r3 @pu1_dst + dst_strd
+ vld1.s16 {d6},[r0],r2
+ vmlal.s16 q12,d5,d14
+ vmlal.s16 q12,d6,d15
+ vst1.32 {d30[0]},[r1]! @stores the loaded value
+
+ vqrshrun.s16 d28,q14,#6 @rounding shift
+ vqshrn.s32 d26,q13,#6 @right shift
+
+ vst1.32 {d28[0]},[r9],r3 @stores the loaded value
+ vqrshrun.s16 d26,q13,#6 @rounding shift
+
+ vqshrn.s32 d24,q12,#6 @right shift
+ vst1.32 {d26[0]},[r9],r3 @stores the loaded value
+ vqrshrun.s16 d24,q12,#6 @rounding shift
+
+ vst1.32 {d24[0]},[r9] @stores the loaded value
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..00b3011
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -0,0 +1,329 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs / parthiban
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma interprediction filter for 16bit vertical input and output.
+@*
+@* @par description:
+@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pi2_src' and writes to the location pointed by
+@* 'pi2_dst'. the input is 16 bits; the filter output is downshifted by 6 and
+@* 8192 is subtracted to store it as a 16 bit number. the output is used as
+@* an input to weighted prediction. assumptions: the function is optimized
+@* considering the fact that width and height are multiples of 2.
+@*
+@* @param[in] pi2_src
+@* word16 pointer to the source
+@*
+@* @param[out] pi2_dst
+@* word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pi2_src
+@r1 => *pi2_dst
+@r2 => src_strd
+@r3 => dst_strd
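+
+@ for reference, a plain c sketch of what this routine computes, matching the
+@ prototype above (word16/word32/word8 standing for int16_t/int32_t/int8_t,
+@ assumed typedefs; the ref_ name is illustrative). the 4-tap window starting
+@ one row above the source and the saturating >> 6 narrow are inferred from
+@ the "sub r4,r0,r2" setup and the vqshrn.s32 #6 instructions below:
+@
+@ void ref_chroma_vert_w16inp_w16out(word16 *pi2_src, word16 *pi2_dst,
+@                                    word32 src_strd, word32 dst_strd,
+@                                    word8 *pi1_coeff, word32 ht, word32 wd)
+@ {
+@     for(word32 row = 0; row < ht; row++)
+@         for(word32 col = 0; col < 2 * wd; col++)  /* interleaved cb/cr */
+@         {
+@             word32 sum = 0;
+@             for(word32 t = 0; t < 4; t++)         /* 4-tap vertical filter */
+@                 sum += pi1_coeff[t] * pi2_src[(row + t - 1) * src_strd + col];
+@             sum >>= 6;                            /* cf. vqshrn.s32 #6 */
+@             pi2_dst[row * dst_strd + col] =
+@                 (word16)(sum < -32768 ? -32768 : sum > 32767 ? 32767 : sum);
+@         }
+@ }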
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q
+
+.type ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4, [sp,#40] @loads pi1_coeff
+ ldr r6, [sp,#48] @wd
+ lsl r2,r2,#1 @src_strd = 2* src_strd
+ ldr r5,[sp,#44] @loads ht
+ vld1.8 {d0},[r4] @loads pi1_coeff
+ sub r4,r0,r2 @pu1_src - src_strd
+    vmovl.s8 q0,d0 @sign extend the coefficients to 16 bits
+
+    tst r6,#3 @checks whether wd is a multiple of 4
+ vdup.16 d12,d0[0] @coeff_0
+ vdup.16 d13,d0[1] @coeff_1
+ vdup.16 d14,d0[2] @coeff_2
+ vdup.16 d15,d0[3] @coeff_3
+
+    bgt core_loop_ht_2 @jumps to the loop that handles wd == 2
+
+    tst r5,#3 @checks whether ht is a multiple of 4
+    beq core_loop_ht_4 @jumps to the loop that handles ht multiple of 4
+
+core_loop_ht_2:
+ lsl r7,r2,#1 @2*src_strd
+ lsl r3,r3,#1 @2*dst_strd
+ lsl r9,r6,#2 @4*wd
+ sub r6,r3,r6,lsl #1 @2*dst_strd - 2*wd
+ sub r8,r7,r9 @2*src_strd - 4*wd
+ mov r12,r9 @4wd
+
+inner_loop_ht_2:
+ add r0,r4,r2 @increments pi2_src
+    vld1.16 {d0},[r4]! @loads pi2_src
+    vmull.s16 q0,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+    subs r12,r12,#8 @4wd - 8
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmull.s16 q4,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.16 {d3},[r0],r2 @loads pi2_src
+ vmlal.s16 q0,d2,d13
+ vld1.16 {d6},[r0],r2
+ vmlal.s16 q4,d3,d13
+ vld1.16 {d2},[r0]
+ add r7,r1,r3 @pu1_dst + dst_strd
+ vmlal.s16 q0,d3,d14
+ vmlal.s16 q4,d6,d14
+ vmlal.s16 q0,d6,d15
+ vmlal.s16 q4,d2,d15
+ vqshrn.s32 d0,q0,#6 @right shift
+ vqshrn.s32 d30,q4,#6 @right shift
+ vst1.32 {d0},[r1]! @stores the loaded value
+ vst1.32 {d30},[r7] @stores the loaded value
+ bgt inner_loop_ht_2 @inner loop -again
+
+ @inner loop ends
+    subs r5,r5,#2 @decrements ht by 2
+ add r1,r1,r6,lsl #1 @pu1_dst += 2*dst_strd - 2*wd
+ mov r12,r9 @4wd
+ add r4,r4,r8 @pi1_src_tmp1 += 2*src_strd - 4*wd
+ bgt inner_loop_ht_2 @loop again
+
+ b end_loops @jumps to end
+
+core_loop_ht_4:
+ lsl r7,r2,#2 @2*src_strd
+ lsl r10,r3,#2 @2*dst_strd
+ mov r11,r6,lsr #1 @divide by 2
+ sub lr,r10,r6,lsl #1 @2*dst_strd - 2*wd
+ sub r8,r7,r6,lsl #2 @2*src_strd - 4*wd
+
+    mul r12,r5,r11 @iteration count: ht * (wd/2)
+    sub r12,#4 @subtract one iteration (4) for the epilog
+ mov r11,r6,lsl #1 @2*wd
+ lsl r3,r3,#1 @2*dst_strd
+
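+@ the ht-multiple-of-4 path below is software pipelined: 'prolog' primes four
+@ accumulators (q15, q14, q13 and q12, one per output row), 'kernel_4'
+@ overlaps the next group's loads and multiply-accumulates with the current
+@ group's narrowing shifts and stores, and 'epilog' drains the last group.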
+prolog:
+ add r0,r4,r2 @increments pi2_src
+    vld1.16 {d0},[r4]! @loads pi2_src
+ vld1.16 {d1},[r0],r2 @loads pi2_src
+ subs r11,r11,#4
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+ vld1.16 {d3},[r0],r2
+ vmlal.s16 q15,d1,d13
+ vmlal.s16 q15,d2,d14
+ add r9,r1,r3 @pu1_dst + dst_strd
+ vmlal.s16 q15,d3,d15
+
+ vld1.16 {d4},[r0],r2
+ vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
+ addle r4,r4,r8
+ movle r11,r6,lsl #1
+ vmlal.s16 q14,d2,d13
+ vmlal.s16 q14,d3,d14
+ vld1.s16 {d5},[r0],r2
+ vmlal.s16 q14,d4,d15
+
+ vqshrn.s32 d30,q15,#6 @right shift
+
+ vld1.s16 {d6},[r0],r2
+ vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vmlal.s16 q13,d3,d13
+ vmlal.s16 q13,d4,d14
+ add r0,r4,r2
+    vld1.16 {d0},[r4]! @loads pi2_src
+ vmlal.s16 q13,d5,d15
+
+ vqshrn.s32 d28,q14,#6 @right shift
+
+ vld1.16 {d1},[r0],r2 @loads pi2_src
+ vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
+ vst1.32 {d30},[r1]! @stores the loaded value
+ vmlal.s16 q12,d4,d13
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmlal.s16 q12,d5,d14
+ vld1.16 {d3},[r0],r2
+ vmlal.s16 q12,d6,d15
+ addle r1,r1,lr,lsl #1
+
+ vqshrn.s32 d26,q13,#6 @right shift
+ subs r12,r12,#4
+
+ beq epilog @jumps to epilog
+
+kernel_4:
+ vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+ subs r11,r11,#4
+ vmlal.s16 q15,d1,d13
+ vst1.32 {d28},[r9],r3 @stores the loaded value
+ vmlal.s16 q15,d2,d14
+ vmlal.s16 q15,d3,d15
+
+ vqshrn.s32 d24,q12,#6 @right shift
+
+ vld1.16 {d4},[r0],r2
+ vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
+ vmlal.s16 q14,d2,d13
+ vmlal.s16 q14,d3,d14
+ vmlal.s16 q14,d4,d15
+ vst1.32 {d26},[r9],r3 @stores the loaded value
+ addle r4,r4,r8
+ movle r11,r6,lsl #1
+
+ vqshrn.s32 d30,q15,#6 @right shift
+
+ vld1.s16 {d5},[r0],r2
+ vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.s16 {d6},[r0],r2
+ vmlal.s16 q13,d3,d13
+ vst1.32 {d24},[r9] @stores the loaded value
+ add r0,r4,r2
+ vmlal.s16 q13,d4,d14
+    vld1.16 {d0},[r4]! @loads pi2_src
+ vmlal.s16 q13,d5,d15
+
+ vqshrn.s32 d28,q14,#6 @right shift
+
+ vld1.16 {d1},[r0],r2 @loads pi2_src
+ vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.16 {d2},[r0],r2 @loads pi2_src
+ vmlal.s16 q12,d4,d13
+ add r9,r1,r3 @pu1_dst + dst_strd
+ vld1.16 {d3},[r0],r2
+ vmlal.s16 q12,d5,d14
+
+ vst1.32 {d30},[r1]! @stores the loaded value
+ vmlal.s16 q12,d6,d15
+
+ vqshrn.s32 d26,q13,#6 @right shift
+ addle r1,r1,lr,lsl #1
+
+ subs r12,r12,#4
+
+ bgt kernel_4 @jumps to kernel_4
+
+epilog:
+ vmull.s16 q15,d0,d12 @vmull_s16(src_tmp1, coeff_0)
+ vst1.32 {d28},[r9],r3 @stores the loaded value
+ vmlal.s16 q15,d1,d13
+ vmlal.s16 q15,d2,d14
+ vmlal.s16 q15,d3,d15
+
+ vqshrn.s32 d24,q12,#6 @right shift
+
+ vmull.s16 q14,d1,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.16 {d4},[r0],r2
+ vmlal.s16 q14,d2,d13
+ vst1.32 {d26},[r9],r3 @stores the loaded value
+ vmlal.s16 q14,d3,d14
+ vmlal.s16 q14,d4,d15
+
+ vqshrn.s32 d30,q15,#6 @right shift
+
+ vmull.s16 q13,d2,d12 @vmull_s16(src_tmp2, coeff_0)
+ vld1.s16 {d5},[r0],r2
+ vmlal.s16 q13,d3,d13
+ vmlal.s16 q13,d4,d14
+ vmlal.s16 q13,d5,d15
+
+ vqshrn.s32 d28,q14,#6 @right shift
+
+ vst1.32 {d24},[r9] @stores the loaded value
+ vmull.s16 q12,d3,d12 @vmull_s16(src_tmp2, coeff_0)
+ vmlal.s16 q12,d4,d13
+ add r9,r1,r3 @pu1_dst + dst_strd
+ vld1.s16 {d6},[r0],r2
+ vmlal.s16 q12,d5,d14
+ vmlal.s16 q12,d6,d15
+ vst1.32 {d30},[r1]! @stores the loaded value
+
+ vqshrn.s32 d26,q13,#6 @right shift
+
+ vst1.32 {d28},[r9],r3 @stores the loaded value
+
+ vqshrn.s32 d24,q12,#6 @right shift
+ vst1.32 {d26},[r9],r3 @stores the loaded value
+
+ vst1.32 {d24},[r9] @stores the loaded value
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
new file mode 100644
index 0000000..6e6776c
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s
@@ -0,0 +1,367 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_inter_pred_chroma_vert_w16out.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs / parthiban
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction chroma filter to store vertical 16bit output
+@*
+@* @par description:
+@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pu1_src' and writes to the location pointed by
+@* 'pi2_dst'. no downshifting or clipping is done and the output is used as
+@* an input for weighted prediction. assumptions: the function is optimized
+@* considering the fact that width is a multiple of 2, 4 or 8, and height is
+@* a multiple of 2. widths of 4 and 8 are optimized further.
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@* word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*****************************************************************************
+@*/
+@void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd)
+@**************variables vs registers*****************************************
+@r0 => *pu1_src
+@r1 => *pi2_dst
+@r2 => src_strd
+@r3 => dst_strd
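+
+@ for reference, a plain c sketch of the filter below (typedefs assumed as in
+@ the prototype; the ref_ name is illustrative). the assembly takes vabs of
+@ the coefficients and uses vmlsl for taps 0 and 3, which is equivalent to
+@ the signed arithmetic shown here when the two outer chroma taps are
+@ non-positive; the source window starts one row above pu1_src, per the
+@ "sub r0,r0,r2" setup:
+@
+@ void ref_chroma_vert_w16out(uword8 *pu1_src, word16 *pi2_dst,
+@                             word32 src_strd, word32 dst_strd,
+@                             word8 *pi1_coeff, word32 ht, word32 wd)
+@ {
+@     for(word32 row = 0; row < ht; row++)
+@         for(word32 col = 0; col < 2 * wd; col++)  /* interleaved cb/cr */
+@         {
+@             word16 sum = 0;
+@             for(word32 t = 0; t < 4; t++)         /* 4-tap vertical filter */
+@                 sum += pi1_coeff[t] * pu1_src[(row + t - 1) * src_strd + col];
+@             pi2_dst[row * dst_strd + col] = sum;  /* no downshift, no clip */
+@         }
+@ }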
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_chroma_vert_w16out_a9q
+
+.type ihevc_inter_pred_chroma_vert_w16out_a9q, %function
+
+ihevc_inter_pred_chroma_vert_w16out_a9q:
+
+ stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#44] @loads ht
+ ldr r12,[sp,#40] @loads pi1_coeff
+ cmp r4,#0 @checks ht == 0
+ ldr r6,[sp,#48] @loads wd
+ sub r0,r0,r2 @pu1_src - src_strd
+ vld1.8 {d0},[r12] @loads pi1_coeff
+
+ ble end_loops @jumps to end
+
+ tst r6,#3 @checks (wd & 3)
+ vabs.s8 d3,d0 @vabs_s8(coeff)
+ lsl r10,r6,#1 @2*wd
+ vdup.8 d0,d3[0] @coeffabs_0
+ vdup.8 d1,d3[1] @coeffabs_1
+ vdup.8 d2,d3[2] @coeffabs_2
+ vdup.8 d3,d3[3] @coeffabs_3
+
+ bgt outer_loop_wd_2 @jumps to loop handling wd ==2
+
+ tst r4,#7 @checks ht for mul of 8
+ beq core_loop_ht_8 @when height is multiple of 8
+
+ lsl r7,r3,#2 @2*dst_strd
+ sub r9,r7,r10,lsl #1 @4*dst_strd - 4wd
+ lsl r12,r2,#1 @2*src_strd
+ sub r8,r12,r10 @2*src_strd - 2wd
+ mov r3,r3,lsl #1
+ mov r5,r10 @2wd
+
+inner_loop_ht_2: @called when wd is multiple of 4 and ht is 4,2
+
+ add r6,r0,r2 @pu1_src +src_strd
+ vld1.8 {d9},[r6],r2 @loads pu1_src
+ subs r5,r5,#8 @2wd - 8
+ vld1.8 {d5},[r0]! @loads src
+ vmull.u8 q3,d9,d1 @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ vld1.8 {d4},[r6],r2 @loads incremented src
+ vmlsl.u8 q3,d5,d0 @vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+ vld1.8 {d8},[r6],r2 @loads incremented src
+ vmlal.u8 q3,d4,d2 @vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+ vmull.u8 q2,d4,d1
+ vld1.8 {d10},[r6] @loads the incremented src
+ vmlsl.u8 q3,d8,d3
+ vmlsl.u8 q2,d9,d0
+ vmlal.u8 q2,d8,d2
+ vmlsl.u8 q2,d10,d3
+ add r6,r1,r3 @pu1_dst + dst_strd
+ vst1.8 {q3},[r1]! @stores the loaded value
+
+ vst1.8 {q2},[r6] @stores the loaded value
+
+ bgt inner_loop_ht_2 @inner loop again
+
+ subs r4,r4,#2 @ht - 2
+ add r1,r1,r9 @pu1_dst += (2*dst_strd - 2wd)
+ mov r5,r10 @2wd
+ add r0,r0,r8 @pu1_src += (2*src_strd - 2wd)
+
+ bgt inner_loop_ht_2 @loop again
+
+ b end_loops @jumps to end
+
+outer_loop_wd_2: @called when width is multiple of 2
+ lsl r5,r3,#2 @2*dst_strd
+ mov r12,r10 @2wd
+ sub r9,r5,r10,lsl #1 @4*dst_strd - 4wd
+ lsl r7,r2,#1 @2*src_strd
+ sub r8,r7,r10 @2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+ add r6,r0,r2 @pu1_src + src_strd
+    vld1.32 {d6[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)
+ subs r12,r12,#4 @2wd - 4
+ add r0,r0,#4 @pu1_src + 4
+ vld1.32 {d6[1]},[r6],r2 @loads pu1_src_tmp
+ vdup.32 d7,d6[1]
+ vld1.32 {d7[1]},[r6],r2 @loads pu1_src_tmp
+ vmull.u8 q2,d7,d1 @vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ vdup.32 d7,d7[1]
+ vld1.32 {d7[1]},[r6],r2
+ vmlsl.u8 q2,d6,d0
+ vmlal.u8 q2,d7,d2
+ vdup.32 d7,d7[1]
+ vld1.32 {d7[1]},[r6]
+ add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
+ vmlsl.u8 q2,d7,d3
+ vst1.32 {d4},[r1] @stores the loaded value
+ add r1,r1,#8 @pu1_dst += 4
+ vst1.32 {d5},[r6] @stores the loaded value
+
+ bgt inner_loop_wd_2 @inner loop again
+
+ @inner loop ends
+ subs r4,r4,#2 @ht - 2
+ add r1,r1,r9 @pu1_dst += 2*dst_strd - 2*wd
+ mov r12,r10 @2wd
+ add r0,r0,r8 @pu1_src += 2*src_strd - 2*wd
+
+ bgt inner_loop_wd_2 @loop again
+
+ b end_loops @jumps to end
+
+core_loop_ht_8: @when wd & ht is multiple of 8
+
+ lsl r12,r3,#3 @4*dst_strd
+ sub r8,r12,r10,lsl #1 @4*dst_strd - 2wd
+ lsl r12,r2,#2 @4*src_strd
+ sub r9,r12,r10 @4*src_strd - 2wd
+
+ bic r5,r10,#7 @r5 ->wd
+ mov r14,r10,lsr #3 @divide by 8
+    mul r12,r4,r14 @iteration count: ht * (2wd/8)
+    sub r12,#4 @subtract one iteration (4) for the epilog
+ mov r3,r3,lsl #1
+
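+@ the wd/ht multiple-of-8 path below is software pipelined: 'prolog' primes
+@ four accumulators (q15, q14, q13 and q12, one per output row), 'kernel_8'
+@ overlaps the next group's loads and multiply-accumulates with the current
+@ group's stores, and 'epilog' drains the last group.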
+prolog:
+ add r6,r0,r2 @pu1_src + src_strd
+ vld1.8 {d5},[r6],r2 @loads pu1_src
+ subs r5,r5,#8 @2wd - 8
+ vld1.8 {d4},[r0]! @loads the source
+ vld1.8 {d6},[r6],r2 @load and increment
+ vmull.u8 q15,d5,d1 @mul with coeff 1
+ vld1.8 {d7},[r6],r2 @load and increment
+ vmlsl.u8 q15,d4,d0
+ add r7,r1,r3 @pu1_dst
+ vmlal.u8 q15,d6,d2
+ vmlsl.u8 q15,d7,d3
+ vld1.8 {d8},[r6],r2 @load and increment
+
+ vmull.u8 q14,d6,d1 @mul_res 2
+    addle r0,r0,r9 @pu1_src += 4*src_strd - 2*wd
+ vmlsl.u8 q14,d5,d0
+ bicle r5,r10,#7 @r5 ->wd
+ vmlal.u8 q14,d7,d2
+ vld1.8 {d9},[r6],r2
+ vmlsl.u8 q14,d8,d3
+
+ vld1.8 {d10},[r6],r2
+ vmull.u8 q13,d7,d1
+ add r6,r0,r2 @pu1_src + src_strd
+ vmlsl.u8 q13,d6,d0
+ vst1.8 {q15},[r1]! @stores the loaded value
+ vmlal.u8 q13,d8,d2
+ vld1.8 {d4},[r0]! @loads the source
+ vmlsl.u8 q13,d9,d3
+
+    addle r1,r1,r8 @pi2_dst += 4*dst_strd - 2*wd
+ vmull.u8 q12,d8,d1
+ vld1.8 {d5},[r6],r2 @loads pu1_src
+ vmlsl.u8 q12,d7,d0
+ subs r12,r12,#4
+ vld1.8 {d6},[r6],r2 @load and increment
+ vmlal.u8 q12,d9,d2
+ vld1.8 {d7},[r6],r2 @load and increment
+ vmlsl.u8 q12,d10,d3
+ rsb r11,r2,r2,lsl #3
+ add r14,r2,r2,lsl #1
+ add r14,r14,r11
+ vst1.8 {q14},[r7],r3 @stores the loaded value
+
+ ble epilog @jumps to epilog
+
+kernel_8:
+
+ vmull.u8 q15,d5,d1 @mul with coeff 1
+ subs r5,r5,#8 @2wd - 8
+ vmlsl.u8 q15,d4,d0
+    addle r0,r0,r9 @pu1_src += 4*src_strd - 2*wd
+ vmlal.u8 q15,d6,d2
+ rsble r11,r2,r2,lsl #3
+ vmlsl.u8 q15,d7,d3
+ vst1.8 {q13},[r7],r3 @stores the loaded value
+
+ vld1.8 {d8},[r6],r2 @load and increment
+
+ vmull.u8 q14,d6,d1 @mul_res 2
+ bicle r5,r10,#7 @r5 ->wd
+ vmlsl.u8 q14,d5,d0
+ vst1.8 {q12},[r7],r3 @stores the loaded value
+
+ vmlal.u8 q14,d7,d2
+ vld1.8 {d9},[r6],r2
+
+ vmlsl.u8 q14,d8,d3
+ vld1.8 {d10},[r6],r2
+ add r7,r1,r3 @pu1_dst
+ vmull.u8 q13,d7,d1
+ add r6,r0,r2 @pu1_src + src_strd
+ pld [r0,r11]
+
+ vmlsl.u8 q13,d6,d0
+ vld1.8 {d4},[r0]! @loads the source
+
+ add r11,r11,r2
+ vmlal.u8 q13,d8,d2
+ vst1.8 {q15},[r1]! @stores the loaded value
+
+ vmlsl.u8 q13,d9,d3
+ vld1.8 {d5},[r6],r2 @loads pu1_src
+
+ vmull.u8 q12,d8,d1
+ vld1.8 {d6},[r6],r2 @load and increment
+    addle r1,r1,r8 @pi2_dst += 4*dst_strd - 2*wd
+
+ cmp r11,r14
+ rsbgt r11,r2,r2,lsl #3
+
+ vmlsl.u8 q12,d7,d0
+ subs r12,r12,#4
+
+
+ vmlal.u8 q12,d9,d2
+ vld1.8 {d7},[r6],r2 @load and increment
+
+ vmlsl.u8 q12,d10,d3
+ vst1.8 {q14},[r7],r3 @stores the loaded value
+
+ bgt kernel_8 @jumps to kernel_8
+
+epilog:
+
+ vmull.u8 q15,d5,d1 @mul with coeff 1
+ vmlsl.u8 q15,d4,d0
+ vmlal.u8 q15,d6,d2
+ vmlsl.u8 q15,d7,d3
+ vst1.8 {q13},[r7],r3 @stores the loaded value
+
+ vld1.8 {d8},[r6],r2 @load and increment
+ vmull.u8 q14,d6,d1 @mul_res 2
+ vmlsl.u8 q14,d5,d0
+ vmlal.u8 q14,d7,d2
+ vmlsl.u8 q14,d8,d3
+ vst1.8 {q12},[r7],r3 @stores the loaded value
+
+ vld1.8 {d9},[r6],r2
+ vmull.u8 q13,d7,d1
+ add r7,r1,r3 @pu1_dst
+ vmlsl.u8 q13,d6,d0
+ vst1.8 {q15},[r1]! @stores the loaded value
+ vmlal.u8 q13,d8,d2
+ vld1.8 {d10},[r6],r2
+ vmlsl.u8 q13,d9,d3
+
+ vmull.u8 q12,d8,d1
+ vst1.8 {q14},[r7],r3 @stores the loaded value
+ vmlsl.u8 q12,d7,d0
+ vmlal.u8 q12,d9,d2
+ vst1.8 {q13},[r7],r3 @stores the loaded value
+ vmlsl.u8 q12,d10,d3
+
+ vst1.8 {q12},[r7],r3 @stores the loaded value
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_inter_pred_filters_luma_horz.s b/common/arm/ihevc_inter_pred_filters_luma_horz.s
new file mode 100644
index 0000000..ee98923
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_filters_luma_horz.s
@@ -0,0 +1,536 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@* ihevc_inter_pred_filters_luma_horz.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@*
+@* - ihevc_inter_pred_luma_horz()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction luma filter for horizontal input
+@*
+@* @par description:
+@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pu1_src' and writes to the location pointed by
+@* 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+@* assumptions: the function is optimized considering the fact that width is
+@* a multiple of 4 or 8, and height is a multiple of 2.
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_horz (
+@ uword8 *pu1_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => *pi1_coeff
+@ r5 => ht
+@ r6 => wd
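+
+@ for reference, a plain c sketch of the filter below (uword8 standing for
+@ uint8_t, assumed typedef; the ref_ name is illustrative). the -3 column
+@ offset follows the "sub r12,r0,#3" setup, the rounded and clipped >> 6
+@ follows the vqrshrun.s16 #6 instructions, and vmlsl on the absolute
+@ coefficient values corresponds to the negative taps of the signed form:
+@
+@ void ref_luma_horz(uword8 *pu1_src, uword8 *pu1_dst,
+@                    word32 src_strd, word32 dst_strd,
+@                    word8 *pi1_coeff, word32 ht, word32 wd)
+@ {
+@     for(word32 row = 0; row < ht; row++)
+@         for(word32 col = 0; col < wd; col++)
+@         {
+@             word16 sum = 0;
+@             for(word32 t = 0; t < 8; t++)         /* 8-tap horizontal filter */
+@                 sum += pi1_coeff[t] * pu1_src[row * src_strd + col + t - 3];
+@             sum = (word16)((sum + 32) >> 6);      /* rounding shift */
+@             pu1_dst[row * dst_strd + col] =
+@                 (uword8)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
+@         }
+@ }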
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_horz_a9q
+
+.type ihevc_inter_pred_luma_horz_a9q, %function
+
+ihevc_inter_pred_luma_horz_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ @str r1,[sp,#-4]
+ @ mov r7,#8192
+start_loop_count:
+ @ ldr r1,[sp,#-4]
+
+
+ ldr r4,[sp,#40] @loads pi1_coeff
+ ldr r8,[sp,#44] @loads ht
+ ldr r10,[sp,#48] @loads wd
+
+ vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
+ mov r11,#1
+ subs r14,r8,#0 @checks for ht == 0
+
+ vabs.s8 d2,d0 @vabs_s8(coeff)
+
+ @ble end_loops
+
+
+ vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub r12,r0,#3 @pu1_src - 3
+ vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ rsb r9,r10,r2,lsl #1 @2*src_strd - wd
+ vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ rsb r8,r10,r3,lsl #1 @2*dst_strd - wd
+ vdup.8 d28,d2[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+ vdup.8 d29,d2[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+ @ tst r10,#7 @checks wd for multiples
+ vdup.8 d30,d2[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+ vdup.8 d31,d2[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+ mov r7,r1
+
+ cmp r10,#4
+ ble outer_loop_4
+
+ cmp r10,#24
+ moveq r10,#16
+ addeq r8,#8
+ addeq r9,#8
+
+ cmp r10,#16
+ bge outer_loop_16
+
+ cmp r10,#12
+ addeq r8,#4
+ addeq r9,#4
+ b outer_loop_8
+
+
+outer_loop8_residual:
+ sub r12,r0,#3 @pu1_src - 3
+ mov r1,r7
+ mov r14,#32
+ add r1,#16
+ add r12,#16
+ mov r10,#8
+ add r8,#8
+ add r9,#8
+
+outer_loop_8:
+
+ add r6,r1,r3 @pu1_dst + dst_strd
+ add r4,r12,r2 @pu1_src + src_strd
+ subs r5,r10,#0 @checks wd
+
+ ble end_inner_loop_8
+
+inner_loop_8:
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11
+ vld1.u32 {d2},[r12],r11
+ vld1.u32 {d3},[r12],r11
+
+
+
+
+
+ @ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
+ @ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
+ @ vext.u8 d6,d0,d1,#6 @vector extract of src [0_6]
+ @ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
+ @ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
+ @ vext.u8 d14,d12,d13,#2
+
+ @vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
+ @ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
+ @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+ @vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
+ @vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
+ vld1.u32 {d4},[r12],r11
+ vmull.u8 q4,d1,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {d5},[r12],r11
+ vmlal.u8 q4,d3,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {d6},[r12],r11
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {d7},[r12],r11
+ vmlsl.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
+ vmlal.u8 q4,d4,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ vld1.u32 {d13},[r4],r11
+ vmlsl.u8 q4,d5,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ vld1.u32 {d14},[r4],r11
+ vmlal.u8 q4,d6,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vld1.u32 {d15},[r4],r11
+ vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+ vld1.u32 {d16},[r4],r11 @vector load pu1_src + src_strd
+
+ vmull.u8 q5,d15,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {d17},[r4],r11
+ vmlsl.u8 q5,d14,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {d18},[r4],r11
+ vmlal.u8 q5,d16,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ vld1.u32 {d19},[r4],r11 @vector load pu1_src + src_strd
+ vmlsl.u8 q5,d17,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ vqrshrun.s16 d20,q4,#6 @right shift and saturating narrow result 1
+ vmlal.u8 q5,d18,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vmlsl.u8 q5,d19,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+ vst1.8 {d20},[r1]! @store the result pu1_dst
+ vmlsl.u8 q5,d12,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlal.u8 q5,d13,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+
+ vqrshrun.s16 d8,q5,#6 @right shift and saturating narrow result 2
+ subs r5,r5,#8 @decrement the wd loop
+ vst1.8 {d8},[r6]! @store the result pu1_dst
+ cmp r5,#4
+ bgt inner_loop_8
+
+end_inner_loop_8:
+ subs r14,r14,#2 @decrement the ht loop
+ add r12,r12,r9 @increment the src pointer by 2*src_strd-wd
+ add r1,r1,r8 @increment the dst pointer by 2*dst_strd-wd
+ bgt outer_loop_8
+
+
+
+
+
+ ldr r10,[sp,#48] @loads wd
+ cmp r10,#12
+
+ beq outer_loop4_residual
+
+
+end_loops:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+outer_loop_16:
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+
+ add r6,r1,r3 @pu1_dst + dst_strd
+ add r4,r12,r2 @pu1_src + src_strd
+ and r0, r12, #31
+ sub r5,r10,#0 @checks wd
+ @ble end_loops1
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ pld [r4, r2, lsl #1]
+ vld1.u32 {q1},[r12],r11
+ vld1.u32 {q2},[r12],r11
+ vld1.u32 {q3},[r12],r11
+ vld1.u32 {q6},[r12],r11
+ vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q7},[r12],r11
+ vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q8},[r12],r11
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q9},[r12],r11
+ vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+
+inner_loop_16:
+
+
+ subs r5,r5,#16
+ vmull.u8 q10,d3,d25
+
+ add r12,#8
+ vmlsl.u8 q10,d1,d24
+
+ subeq r14,r14,#2
+ vmlal.u8 q10,d7,d27
+
+ vld1.u32 {q0},[r4],r11 @vector load pu1_src
+ vmlsl.u8 q10,d5,d26
+
+ vld1.u32 {q1},[r4],r11
+ vmlal.u8 q10,d13,d28
+
+ vld1.u32 {q2},[r4],r11
+ vmlal.u8 q10,d17,d30
+
+ vld1.u32 {q3},[r4],r11
+ vmlsl.u8 q10,d15,d29
+
+ vld1.u32 {q6},[r4],r11
+ vmlsl.u8 q10,d19,d31
+
+ vld1.u32 {q7},[r4],r11
+ vqrshrun.s16 d8,q4,#6 @right shift and saturating narrow result 1
+
+ vld1.u32 {q8},[r4],r11
+ vmull.u8 q5,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {q9},[r4],r11
+ vmlal.u8 q5,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ add r4,#8
+ vmlsl.u8 q5,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+
+ addeq r12,r12,r9 @increment the src pointer by 2*src_strd-wd
+ vmlsl.u8 q5,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ addeq r4,r12,r2 @pu1_src + src_strd
+ vqrshrun.s16 d9,q10,#6
+
+ vmlal.u8 q5,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+
+@ and r7, r12, #31
+ vmlsl.u8 q5,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+
+ vmlal.u8 q5,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+
+ vmlsl.u8 q5,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+ vmull.u8 q11,d3,d25
+
+ vmlsl.u8 q11,d1,d24
+
+ vst1.8 {q4},[r1]! @store the result pu1_dst
+ vmlal.u8 q11,d7,d27
+
+ addeq r1,r1,r8
+ vqrshrun.s16 d10,q5,#6 @right shift and saturating narrow result 2
+
+@ cmp r7, r0
+ vmlsl.u8 q11,d5,d26
+
+ pld [r12, r2, lsl #2]
+ vmlal.u8 q11,d13,d28
+
+ pld [r4, r2, lsl #2]
+ vmlal.u8 q11,d17,d30
+
+@ mov r0, r7
+ vmlsl.u8 q11,d15,d29
+
+ cmp r14,#0
+ vmlsl.u8 q11,d19,d31
+
+ beq epilog_16
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ vld1.u32 {q1},[r12],r11
+ vld1.u32 {q2},[r12],r11
+ vld1.u32 {q3},[r12],r11
+ vld1.u32 {q6},[r12],r11
+ vqrshrun.s16 d11,q11,#6
+ vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q7},[r12],r11
+ vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q8},[r12],r11
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q9},[r12],r11
+ vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ cmp r5,#0
+ vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ moveq r5,r10
+ vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vst1.8 {q5},[r6]! @store the result pu1_dst
+ vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+ addeq r6,r1,r3 @pu1_dst + dst_strd
+ b inner_loop_16
+
+
+epilog_16:
+ vqrshrun.s16 d11,q11,#6
+ vst1.8 {q5},[r6]! @store the result pu1_dst
+
+ ldr r7, [sp], #4
+ ldr r0, [sp], #4
+ ldr r10,[sp,#48]
+ cmp r10,#24
+
+ beq outer_loop8_residual
+
+
+
+end_loops1:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
+
+outer_loop4_residual:
+ sub r12,r0,#3 @pu1_src - 3
+ mov r1,r7
+ add r1,#8
+ mov r10,#4
+ add r12,#8
+ mov r14,#16
+ add r8,#4
+ add r9,#4
+
+outer_loop_4:
+ add r6,r1,r3 @pu1_dst + dst_strd
+ add r4,r12,r2 @pu1_src + src_strd
+
+ subs r5,r10,#0 @checks wd
+ ble end_inner_loop_4
+
+inner_loop_4:
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11
+ vld1.u32 {d2},[r12],r11
+ vld1.u32 {d3},[r12],r11
+ vld1.u32 {d4},[r12],r11
+ vld1.u32 {d5},[r12],r11
+ vld1.u32 {d6},[r12],r11
+ vld1.u32 {d7},[r12],r11
+ @add r12,r12,#4 @increment the input pointer
+ sub r12,r12,#4
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+
+ @vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+ @vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
+ @vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
+ vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
+ vld1.u32 {d13},[r4],r11
+    vzip.32 d0,d12 @vector zip the i iteration and ii iteration into a single register
+ vld1.u32 {d14},[r4],r11
+ vzip.32 d1,d13
+ vld1.u32 {d15},[r4],r11
+ vzip.32 d2,d14
+ vld1.u32 {d16},[r4],r11
+ vzip.32 d3,d15
+ vld1.u32 {d17},[r4],r11
+ vzip.32 d4,d16
+ vld1.u32 {d18},[r4],r11
+ vzip.32 d5,d17
+ vld1.u32 {d19},[r4],r11
+ sub r4,r4,#4
+ @ add r4,r4,#4 @increment the input pointer
+ @ vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
+ @ vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
+ @ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
+ @ vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+ @ vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
+ @vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
+
+
+
+
+
+
+
+ vzip.32 d6,d18
+ vzip.32 d7,d19
+
+ vmull.u8 q4,d1,d25 @arithmetic operations for ii iteration in the same time
+ vmlsl.u8 q4,d0,d24
+ vmlsl.u8 q4,d2,d26
+ vmlal.u8 q4,d3,d27
+ vmlal.u8 q4,d4,d28
+ vmlsl.u8 q4,d5,d29
+ vmlal.u8 q4,d6,d30
+ vmlsl.u8 q4,d7,d31
+
+ vqrshrun.s16 d8,q4,#6 @narrow right shift and saturating the result
+    vst1.32 {d8[0]},[r1]! @store the i iteration result from the lower half of the register
+    vst1.32 {d8[1]},[r6]! @store the ii iteration result from the upper half of the register
+ subs r5,r5,#4 @decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4:
+    subs r14,r14,#2 @decrement the ht by 2
+ add r12,r12,r9 @increment the input pointer 2*src_strd-wd
+ add r1,r1,r8 @increment the output pointer 2*dst_strd-wd
+ bgt outer_loop_4
+ @subs r7,r7,#1
+ @ bgt start_loop_count
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s
new file mode 100644
index 0000000..04942ae
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s
@@ -0,0 +1,947 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@* ihevc_inter_pred_filters_luma_vert.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@*
+@* - ihevc_inter_pred_luma_vert()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction luma filter for vertical input
+@*
+@* @par description:
+@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pu1_src' and writes to the location pointed by
+@* 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+@* assumptions: the function is optimized considering the fact that width is
+@* a multiple of 4 or 8, and height is a multiple of 2.
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert (
+@ uword8 *pu1_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r6 => dst_strd
+@ r12 => *pi1_coeff
+@ r5 => ht
+@ r3 => wd
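+
+@ for reference, a plain c sketch of the filter below (typedefs assumed as in
+@ the other files; the ref_ name is illustrative). the window starting three
+@ rows above the source follows the "sub r12,r2,r2,lsl #2" (-3*src_strd)
+@ setup, and the rounded, clipped >> 6 follows the vqrshrun.s16 #6
+@ instructions:
+@
+@ void ref_luma_vert(uword8 *pu1_src, uword8 *pu1_dst,
+@                    word32 src_strd, word32 dst_strd,
+@                    word8 *pi1_coeff, word32 ht, word32 wd)
+@ {
+@     for(word32 row = 0; row < ht; row++)
+@         for(word32 col = 0; col < wd; col++)
+@         {
+@             word16 sum = 0;
+@             for(word32 t = 0; t < 8; t++)         /* 8-tap vertical filter */
+@                 sum += pi1_coeff[t] * pu1_src[(row + t - 3) * src_strd + col];
+@             sum = (word16)((sum + 32) >> 6);      /* rounding shift */
+@             pu1_dst[row * dst_strd + col] =
+@                 (uword8)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
+@         }
+@ }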
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_vert_a9q
+
+.type ihevc_inter_pred_luma_vert_a9q, %function
+
+ihevc_inter_pred_luma_vert_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r12,[sp,#40] @load pi1_coeff
+ mov r6,r3
+ ldr r5,[sp,#48] @load wd
+ vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
+    sub r12,r2,r2,lsl #2 @r12 = -3*src_strd
+    vabs.s8 d0,d0 @vabs_s8(coeff)
+    add r0,r0,r12 @pu1_src -= 3*src_strd
+ ldr r3,[sp,#44] @load ht
+ subs r7,r3,#0 @r3->ht
+ @ble end_loops @end loop jump
+ vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+ cmp r5,#8
+ vdup.u8 d23,d0[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+ vdup.u8 d24,d0[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+ vdup.u8 d25,d0[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+ vdup.u8 d26,d0[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+ vdup.u8 d27,d0[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+ vdup.u8 d28,d0[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+ vdup.u8 d29,d0[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+ blt core_loop_wd_4 @core loop wd 4 jump
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+
+ bic r4,r5,#7 @r5 ->wd
+ rsb r9,r4,r6,lsl #2 @r6->dst_strd r5 ->wd
+ rsb r8,r4,r2,lsl #2 @r2->src_strd
+    mov r3, r5, lsr #3 @wd/8: number of 8-wide column blocks
+    mul r7, r3 @iteration count: ht * (wd/8)
+    sub r7, #4 @subtract one iteration (4) for the epilog
+
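+@ the wd>=8 path below is software pipelined: 'prolog' primes four
+@ accumulators (q4 to q7, one per output row), 'kernel_8' overlaps the next
+@ group's loads and multiply-accumulates with the current group's narrowing
+@ shifts and stores, and 'epilog'/'epilog_end' drain the pipeline.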
+prolog:
+
+ and r10, r0, #31
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ subs r4,r4,#8
+ vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+
+ vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+ addle r0,r0,r8
+ vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+ bicle r4,r5,#7 @r5 ->wd
+ vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+ pld [r3]
+ vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ pld [r3, r2]
+ vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+
+ add r3, r3, r2
+ vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u8 {d1},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q6,d3,d23
+ vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q6,d2,d22
+ vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q6,d4,d24
+ vmlal.u8 q6,d5,d25
+ vmlal.u8 q6,d6,d26
+ vmlsl.u8 q6,d7,d27
+ vmlal.u8 q6,d16,d28
+ vmlsl.u8 q6,d17,d29
+ add r14,r1,r6
+ vst1.8 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ addle r1,r1,r9
+
+ vmull.u8 q7,d4,d23
+ subs r7,r7,#4
+ vmlsl.u8 q7,d3,d22
+ vmlsl.u8 q7,d5,d24
+ vmlal.u8 q7,d6,d25
+ vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d7,d26
+ vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d16,d27
+ vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d17,d28
+ vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d18,d29
+ vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vst1.8 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ vqrshrun.s16 d12,q6,#6
+
+
+ blt epilog_end @jumps to epilog_end
+ beq epilog @jumps to epilog
+
+kernel_8:
+
+ subs r4,r4,#8
+ vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+
+ addle r0,r0,r8
+ vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+
+ bicle r4,r5,#7 @r5 ->wd
+ vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+
+ vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+
+ vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+
+ vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+
+ vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+
+ vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.8 {d12},[r14],r6
+
+@ and r11, r0, #31
+ vqrshrun.s16 d14,q7,#6
+
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+ vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+ vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+ vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+
+ vst1.8 {d14},[r14],r6
+ vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+
+ add r14,r1,#0
+ vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+
+ add r1, r1, #8
+ vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+
+ vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+ addle r1,r1,r9
+ vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+@ cmp r11, r10
+ vmull.u8 q6,d3,d23
+
+ add r10, r3, r2, lsl #3 @ 10*strd - 8+2
+ vmlsl.u8 q6,d2,d22
+
+ add r10, r10, r2 @ 11*strd
+ vmlsl.u8 q6,d4,d24
+
+ vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q6,d5,d25
+
+ vmlal.u8 q6,d6,d26
+ vst1.8 {d8},[r14],r6 @vst1_u8(pu1_dst,sto_res)@
+
+ pld [r10] @11+ 0
+ vmlsl.u8 q6,d7,d27
+
+ pld [r10, r2] @11+ 1*strd
+ vmlal.u8 q6,d16,d28
+
+ pld [r10, r2, lsl #1] @11+ 2*strd
+ vmlsl.u8 q6,d17,d29
+
+ add r10, r10, r2 @12*strd
+ vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ pld [r10, r2, lsl #1] @11+ 3*strd
+ vmull.u8 q7,d4,d23
+
+@ mov r10, r11
+ vmlsl.u8 q7,d3,d22
+
+ subs r7,r7,#4
+ vmlsl.u8 q7,d5,d24
+
+ vmlal.u8 q7,d6,d25
+ vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d7,d26
+ vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d16,d27
+ vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d17,d28
+ vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d18,d29
+ vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vqrshrun.s16 d12,q6,#6
+ vst1.8 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+
+
+
+ bgt kernel_8 @jumps to kernel_8
+
+epilog:
+
+ vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.8 {d12},[r14],r6
+
+ vqrshrun.s16 d14,q7,#6
+
+ vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+ vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+ vst1.8 {d14},[r14],r6
+
+ vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q6,d3,d23
+ vmlsl.u8 q6,d2,d22
+ vmlsl.u8 q6,d4,d24
+ vmlal.u8 q6,d5,d25
+ vmlal.u8 q6,d6,d26
+ vmlsl.u8 q6,d7,d27
+ vmlal.u8 q6,d16,d28
+ vmlsl.u8 q6,d17,d29
+ add r14,r1,r6
+ vst1.8 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q7,d4,d23
+ vmlsl.u8 q7,d3,d22
+ vmlsl.u8 q7,d5,d24
+ vmlal.u8 q7,d6,d25
+ vmlal.u8 q7,d7,d26
+ vmlsl.u8 q7,d16,d27
+ vmlal.u8 q7,d17,d28
+ vmlsl.u8 q7,d18,d29
+
+ vst1.8 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ vqrshrun.s16 d12,q6,#6
+
+epilog_end:
+ vst1.8 {d12},[r14],r6
+ vqrshrun.s16 d14,q7,#6
+
+ vst1.8 {d14},[r14],r6
+
+
+end_loops:
+ tst r5,#7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+
+ ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4:
+ rsb r9,r5,r6,lsl #2 @r6->dst_strd r5 ->wd
+ rsb r8,r5,r2,lsl #2 @r2->src_strd
+ vmov.i8 d4,#0
+
+outer_loop_wd_4:
+ subs r12,r5,#0
+ ble end_inner_loop_wd_4 @outer loop jump
+
+inner_loop_wd_4:
+ add r3,r0,r2
+ vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+ subs r12,r12,#4
+ vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+ vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+ vld1.u32 {d4[0]},[r0] @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
+ vmull.u8 q0,d5,d23 @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@
+
+ vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+ add r0,r0,#4
+ vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+ vmlsl.u8 q0,d4,d22 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@
+
+ vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+ vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+ vmlsl.u8 q0,d6,d24 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@
+
+ vmull.u8 q4,d7,d23
+ vdup.u32 d4,d7[1] @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
+ vmull.u8 q1,d7,d25 @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
+ vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+ vmlsl.u8 q4,d6,d22
+ vmlal.u8 q0,d4,d26 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@
+
+ vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+ vmlsl.u8 q4,d4,d24
+ vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+ vmlsl.u8 q1,d5,d27 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@
+
+ vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+ vmlal.u8 q4,d5,d25
+ vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+ vmlal.u8 q0,d6,d28 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@
+
+ vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+ vmlal.u8 q4,d6,d26
+ vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+ vmlsl.u8 q1,d7,d29 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@
+
+ vdup.u32 d4,d7[1]
+ vadd.i16 q0,q0,q1 @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
+
+ vmlsl.u8 q4,d7,d27
+ vld1.u32 {d4[1]},[r3],r2
+ vmlal.u8 q4,d4,d28
+ vdup.u32 d5,d4[1]
+ vqrshrun.s16 d0,q0,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u32 {d5[1]},[r3]
+ add r3,r1,r6
+ vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@
+
+ vmlsl.u8 q4,d5,d29
+ vst1.32 {d0[1]},[r3],r6 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
+ vqrshrun.s16 d8,q4,#6
+
+ vst1.32 {d8[0]},[r3],r6
+ add r1,r1,#4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7,r7,#4
+ add r1,r1,r9
+ add r0,r0,r8
+ bgt outer_loop_wd_4
+
+ ldmfd sp!, {r4-r12, r15} @reload the registers from sp
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction luma filter for vertical 16bit output
+@*
+@* @par description:
+@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pu1_src' and writes to the location pointed by
+@* 'pi2_dst'. no downshifting or clipping is done and the output is used as
+@* an input for weighted prediction. assumptions: the function is optimized
+@* considering the fact that width is a multiple of 4 or 8, and height is a
+@* multiple of 2.
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@* word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pi2_dst
+@ r2 => src_strd
+@ r6 => dst_strd
+@ r12 => *pi1_coeff
+@ r5 => ht
+@ r3 => wd
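+
+@ for reference, this w16out variant differs from the ref_luma_vert sketch
+@ above only in the store: the narrowing shifts (vqrshrun.s16) are commented
+@ out below and the 16-bit accumulator is written out unmodified, so the
+@ inner statement becomes, as a sketch:
+@
+@             word16 sum = 0;
+@             for(word32 t = 0; t < 8; t++)
+@                 sum += pi1_coeff[t] * pu1_src[(row + t - 3) * src_strd + col];
+@             pi2_dst[row * dst_strd + col] = sum;  /* no downshift, no clip */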
+
+
+
+.globl ihevc_inter_pred_luma_vert_w16out_a9q
+
+.type ihevc_inter_pred_luma_vert_w16out_a9q, %function
+
+ihevc_inter_pred_luma_vert_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r12,[sp,#40] @load pi1_coeff
+ mov r6,r3
+ ldr r5,[sp,#48] @load wd
+ vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
+    sub r12,r2,r2,lsl #2 @r12 = -3*src_strd
+    vabs.s8 d0,d0 @vabs_s8(coeff)
+    add r0,r0,r12 @pu1_src -= 3*src_strd
+ ldr r3,[sp,#44] @load ht
+ subs r7,r3,#0 @r3->ht
+ @ble end_loops_16out @end loop jump
+ vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+ cmp r5,#8
+ vdup.u8 d23,d0[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+ vdup.u8 d24,d0[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+ vdup.u8 d25,d0[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+ vdup.u8 d26,d0[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+ vdup.u8 d27,d0[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+ vdup.u8 d28,d0[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+ vdup.u8 d29,d0[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+ blt core_loop_wd_4_16out @core loop wd 4 jump
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+
+ bic r4,r5,#7 @r5 ->wd
+ rsb r9,r4,r6,lsl #2 @r6->dst_strd r5 ->wd
+ rsb r8,r4,r2,lsl #2 @r2->src_strd
+ mov r6, r6, lsl #1
+    mov r3, r5, lsr #3 @wd/8: number of 8-wide column blocks
+    mul r7, r3 @iteration count: ht * (wd/8)
+    sub r7, #4 @subtract one iteration (4) for the epilog
+
+prolog_16out:
+
+ and r10, r0, #31
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+
+ vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ subs r4,r4,#8
+ vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+
+ addle r0,r0,r8
+ vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+ bicle r4,r5,#7 @r5 ->wd
+ vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+ vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+ pld [r3]
+ vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ pld [r3, r2]
+ vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+ add r3, r3, r2
+ vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vmull.u8 q6,d3,d23
+ vld1.u8 {d1},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q6,d2,d22
+ vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q6,d4,d24
+ vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q6,d5,d25
+ vmlal.u8 q6,d6,d26
+ vmlsl.u8 q6,d7,d27
+ vmlal.u8 q6,d16,d28
+ vmlsl.u8 q6,d17,d29
+ add r14,r1,r6
+ vst1.8 {d8, d9},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ addle r1,r1,r9,lsl #1
+
+ vmull.u8 q7,d4,d23
+ subs r7,r7,#4
+ vmlsl.u8 q7,d3,d22
+ vmlsl.u8 q7,d5,d24
+ vmlal.u8 q7,d6,d25
+ vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d7,d26
+ vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d16,d27
+ vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d17,d28
+ vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d18,d29
+ vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vst1.8 {d10, d11},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ @vqrshrun.s16 d12,q6,#6
+
+
+ blt epilog_end_16out
+ beq epilog_16out @jumps to epilog
+
+kernel_8_16out:
+
+ subs r4,r4,#8
+ vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+
+ addle r0,r0,r8
+ vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+
+ vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+
+ vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+
+ bicle r4,r5,#7 @r5 ->wd
+ vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+
+ vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+
+ vst1.8 {d12,d13},[r14],r6
+ vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+
+@ and r11, r0, #31
+ vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+
+ vst1.8 {d14,d15},[r14],r6
+ vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+
+ add r14,r1,r6
+ vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+
+ vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+
+ vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+
+ vst1.8 {d8,d9},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+
+ addle r1,r1,r9,lsl #1
+ vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+
+@ cmp r11, r10
+ vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+ add r10, r3, r2, lsl #3 @ 10*strd - 8+2
+ vmull.u8 q6,d3,d23
+
+ add r10, r10, r2 @ 11*strd
+ vmlsl.u8 q6,d2,d22
+
+ pld [r10] @11+ 0
+ vmlsl.u8 q6,d4,d24
+
+ pld [r10, r2] @11+ 1*strd
+ vmlal.u8 q6,d5,d25
+
+ pld [r10, r2, lsl #1] @11+ 2*strd
+ vmlal.u8 q6,d6,d26
+
+ add r10, r10, r2 @12*strd
+ vmlsl.u8 q6,d7,d27
+
+ pld [r10, r2, lsl #1] @11+ 3*strd
+ vmlal.u8 q6,d16,d28
+
+@ mov r10, r11
+ vmlsl.u8 q6,d17,d29
+
+ vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q7,d4,d23
+
+ subs r7,r7,#4
+ vmlsl.u8 q7,d3,d22
+
+ vst1.8 {d10, d11},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ vmlsl.u8 q7,d5,d24
+
+ vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d6,d25
+
+ vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d7,d26
+
+ vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d16,d27
+
+ vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.u8 q7,d17,d28
+
+ vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlsl.u8 q7,d18,d29
+
+
+ bgt kernel_8_16out @jumps to kernel_8
+
+epilog_16out:
+
+ vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.8 {d12,d13},[r14],r6
+
+ @vqrshrun.s16 d14,q7,#6
+
+ vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
+ vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
+ vst1.8 {d14,d15},[r14],r6
+
+ @vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q6,d3,d23
+ vmlsl.u8 q6,d2,d22
+ vmlsl.u8 q6,d4,d24
+ vmlal.u8 q6,d5,d25
+ vmlal.u8 q6,d6,d26
+ vmlsl.u8 q6,d7,d27
+ vmlal.u8 q6,d16,d28
+ vmlsl.u8 q6,d17,d29
+ add r14,r1,r6
+ vst1.8 {d8,d9},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.u8 q7,d4,d23
+ vmlsl.u8 q7,d3,d22
+ vmlsl.u8 q7,d5,d24
+ vmlal.u8 q7,d6,d25
+ vmlal.u8 q7,d7,d26
+ vmlsl.u8 q7,d16,d27
+ vmlal.u8 q7,d17,d28
+ vmlsl.u8 q7,d18,d29
+
+ vst1.8 {d10,d11},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ @vqrshrun.s16 d12,q6,#6
+
+epilog_end_16out:
+ vst1.8 {d12,d13},[r14],r6
+ @vqrshrun.s16 d14,q7,#6
+
+ vst1.8 {d14,d15},[r14],r6
+
+
+end_loops_16out:
+ tst r5,#7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+
+ ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #16
+ mov r7, #16
+ mov r6, r6, lsr #1
+
+
+core_loop_wd_4_16out:
+ rsb r9,r5,r6,lsl #2 @r6->dst_strd r5 ->wd
+ rsb r8,r5,r2,lsl #2 @r2->src_strd
+ vmov.i8 d4,#0
+ mov r6, r6, lsl #1
+
+outer_loop_wd_4_16out:
+ subs r12,r5,#0
+ ble end_inner_loop_wd_4_16out @outer loop jump
+
+inner_loop_wd_4_16out:
+ add r3,r0,r2
+ vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+ subs r12,r12,#4
+ vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+ vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+ vld1.u32 {d4[0]},[r0] @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
+ vmull.u8 q0,d5,d23 @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@
+
+ vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+ add r0,r0,#4
+ vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+ vmlsl.u8 q0,d4,d22 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@
+
+ vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+ vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+ vmlsl.u8 q0,d6,d24 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@
+
+ vmull.u8 q4,d7,d23
+ vdup.u32 d4,d7[1] @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
+ vmull.u8 q1,d7,d25 @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
+ vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
+ vmlsl.u8 q4,d6,d22
+ vmlal.u8 q0,d4,d26 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@
+
+ vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
+ vmlsl.u8 q4,d4,d24
+ vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
+ vmlsl.u8 q1,d5,d27 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@
+
+ vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
+ vmlal.u8 q4,d5,d25
+ vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
+ vmlal.u8 q0,d6,d28 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@
+
+ vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
+ vmlal.u8 q4,d6,d26
+ vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
+ vmlsl.u8 q1,d7,d29 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@
+
+ vdup.u32 d4,d7[1]
+ vadd.i16 q0,q0,q1 @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
+
+ vmlsl.u8 q4,d7,d27
+ vld1.u32 {d4[1]},[r3],r2
+ vmlal.u8 q4,d4,d28
+ vdup.u32 d5,d4[1]
+ @vqrshrun.s16 d0,q0,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.u32 {d5[1]},[r3]
+ add r3,r1,r6
+ vst1.32 {d0},[r1]! @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@
+
+ vmlsl.u8 q4,d5,d29
+ vst1.32 {d1},[r3],r6 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
+ @vqrshrun.s16 d8,q4,#6
+
+ vst1.32 {d8},[r3],r6
+ @add r1,r1,#4
+ vst1.32 {d9},[r3]
+ bgt inner_loop_wd_4_16out
+
+end_inner_loop_wd_4_16out:
+ subs r7,r7,#4
+ add r1,r1,r9,lsl #1
+ add r0,r0,r8
+ bgt outer_loop_wd_4_16out
+
+ ldmfd sp!, {r4-r12, r15} @reload the registers from sp
+
diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
new file mode 100644
index 0000000..4fbc5d1
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -0,0 +1,393 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@* ihevc_inter_pred_filters_luma_vert_w16inp.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@* - ihevc_inter_pred_filters_luma_vert_w16inp()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma vertical filter for 16bit input.
+@*
+@* @par description:
+@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pi2_src' and writes to the location pointed by
+@* 'pu1_dst'. input is 16 bits; the filter output is downshifted by 12 and
+@* clipped to lie between 0 and 255. assumptions : the function is
+@* optimized considering the fact width is a multiple of 4 and height a
+@* multiple of 2.
+@*
+@* @param[in] pi2_src
+@* word16 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
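+
+@/* A minimal C reference sketch of what this routine computes, assuming the
+@   usual typedefs (word16/uword8/word32/word8) and the behaviour described
+@   above (8-tap filter on 16-bit input, total rounding downshift of 12,
+@   clip to [0, 255]); the function and variable names are illustrative only:
+@
+@   void ref_luma_vert_w16inp(word16 *pi2_src, uword8 *pu1_dst,
+@                             word32 src_strd, word32 dst_strd,
+@                             word8 *pi1_coeff, word32 ht, word32 wd)
+@   {
+@       for(word32 row = 0; row < ht; row++, pi2_src += src_strd,
+@                                            pu1_dst += dst_strd)
+@           for(word32 col = 0; col < wd; col++)
+@           {
+@               word32 i4_sum = 0;
+@               for(word32 i = 0; i < 8; i++)        /* 8-tap vertical MAC */
+@                   i4_sum += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
+@               i4_sum = (i4_sum + (1 << 11)) >> 12; /* round and downshift */
+@               pu1_dst[col] = (uword8)(i4_sum < 0 ? 0 :
+@                              (i4_sum > 255 ? 255 : i4_sum)); /* clip to u8 */
+@           }
+@   }
+@*/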
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_vert_w16inp_a9q
+
+.type ihevc_inter_pred_luma_vert_w16inp_a9q, %function
+
+ihevc_inter_pred_luma_vert_w16inp_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r12,[sp,#40] @load pi1_coeff
+ mov r6,r3
+ ldr r5,[sp,#48] @load wd
+ vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
+ mov r2, r2, lsl #1
+ sub r12,r2,r2,lsl #2 @r12 = -3 * src_strd
+ @vabs.s8 d0,d0 @vabs_s8(coeff)
+ add r0,r0,r12 @pu1_src -= 3 * src_strd
+ ldr r3,[sp,#44] @load ht
+ subs r7,r3,#0 @r3->ht
+ @ble end_loops @end loop jump
+ vmovl.s8 q0,d0
+ vdup.16 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+ vdup.16 d23,d0[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+ vdup.16 d24,d0[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+ vdup.16 d25,d0[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+ vdup.16 d26,d1[0] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+ vdup.16 d27,d1[1] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+ vdup.16 d28,d1[2] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+ vdup.16 d29,d1[3] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+
+ rsb r9,r5,r6,lsl #2 @r6->dst_strd r5 ->wd
+ rsb r8,r5,r2,lsl #2 @r2->src_strd
+ sub r8,r8,r5
+ mov r3, r5, lsr #2 @divide by 4
+ mul r7, r3 @multiply height by width
+ sub r7, #4 @reserve one iteration (4 rows) for the epilog
+ mov r4,r5 @r5 ->wd
+ @mov r2, r2, lsl #1
+
+prolog:
+
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vld1.16 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vld1.16 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ subs r4,r4,#4
+ vld1.16 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vld1.16 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d0,d22 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vld1.16 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d2,d24 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vld1.16 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vld1.16 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vld1.16 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d5,d27 @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.s16 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlal.s16 q4,d7,d29 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+ vld1.16 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+ vmull.s16 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ addle r0,r0,r8,lsl #0
+ vmlal.s16 q5,d1,d22 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+ movle r4,r5 @r5 ->wd
+ vmlal.s16 q5,d3,d24 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vld1.16 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vld1.16 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vmlal.s16 q5,d6,d27 @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vmlal.s16 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vmlal.s16 q5,d16,d29 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+ vqshrn.s32 d8, q4, #6
+
+ vld1.16 {d1},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q6,d3,d23
+ vld1.16 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q6,d2,d22
+ vld1.16 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q6,d4,d24
+ vmlal.s16 q6,d5,d25
+ vmlal.s16 q6,d6,d26
+ vmlal.s16 q6,d7,d27
+ vmlal.s16 q6,d16,d28
+ vmlal.s16 q6,d17,d29
+ add r14,r1,r6
+ vqshrn.s32 d10, q5, #6
+ vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vmull.s16 q7,d4,d23
+ vmlal.s16 q7,d3,d22
+ vmlal.s16 q7,d5,d24
+ vmlal.s16 q7,d6,d25
+ vld1.16 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d7,d26
+ vld1.16 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d16,d27
+ vld1.16 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d17,d28
+ vld1.16 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d18,d29
+ vld1.16 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vst1.32 {d8[0]},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ vqshrn.s32 d12, q6, #6
+ vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ addle r1,r1,r9
+
+ subs r7,r7,#4
+
+ blt epilog_end @jumps to epilog_end
+ beq epilog @jumps to epilog
+
+kernel_8:
+
+ vmull.s16 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ subs r4,r4,#4
+ vmlal.s16 q4,d0,d22 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+ addle r0,r0,r8,lsl #0
+ vmlal.s16 q4,d2,d24 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vmlal.s16 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vmlal.s16 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vmlal.s16 q4,d5,d27 @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.s16 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlal.s16 q4,d7,d29 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.32 {d10[0]},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+
+ vqshrn.s32 d14, q7, #6
+ vqrshrun.s16 d12,q6,#6
+ vld1.16 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+ vmull.s16 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ vmlal.s16 q5,d1,d22 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+ vmlal.s16 q5,d3,d24 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vmlal.s16 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vmlal.s16 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ vmlal.s16 q5,d6,d27 @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vst1.32 {d12[0]},[r14],r6
+
+ vmlal.s16 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vld1.16 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q5,d16,d29 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+ vqshrn.s32 d8, q4, #6
+ vqrshrun.s16 d14,q7,#6
+
+ vmull.s16 q6,d3,d23
+ movle r4,r5 @r5 ->wd
+
+ vmlal.s16 q6,d2,d22
+ vld1.16 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q6,d4,d24
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+
+ vmlal.s16 q6,d5,d25
+
+ vmlal.s16 q6,d6,d26
+ vst1.32 {d14[0]},[r14],r6
+
+ vmlal.s16 q6,d7,d27
+ vld1.16 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q6,d16,d28
+ add r14,r1,r6
+
+ vmlal.s16 q6,d17,d29
+ vld1.16 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+ vqshrn.s32 d10, q5, #6
+ vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ vld1.16 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+ vmull.s16 q7,d4,d23
+ vmlal.s16 q7,d3,d22
+ vmlal.s16 q7,d5,d24
+ vld1.16 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q7,d6,d25
+ vld1.16 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d7,d26
+ vld1.16 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d16,d27
+ vld1.16 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d17,d28
+ vld1.16 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d18,d29
+ vst1.32 {d8[0]},[r1]! @vst1_u8(pu1_dst,sto_res)@
+
+ vqshrn.s32 d12, q6, #6
+ addle r1,r1,r9
+
+ vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ subs r7,r7,#4
+
+ bgt kernel_8 @jumps to kernel_8
+
+epilog:
+
+ vmull.s16 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vmlal.s16 q4,d0,d22 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vmlal.s16 q4,d2,d24 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vmlal.s16 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vmlal.s16 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vmlal.s16 q4,d5,d27 @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.s16 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlal.s16 q4,d7,d29 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.32 {d10[0]},[r14],r6
+
+ vqshrn.s32 d14, q7, #6
+ vqrshrun.s16 d12,q6,#6
+
+ vld1.16 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ vmlal.s16 q5,d1,d22 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+ vmlal.s16 q5,d3,d24 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vmlal.s16 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vmlal.s16 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ vmlal.s16 q5,d6,d27 @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vmlal.s16 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vmlal.s16 q5,d16,d29 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+ vst1.32 {d12[0]},[r14],r6
+
+ vqshrn.s32 d8, q4, #6
+ vqrshrun.s16 d14,q7,#6
+
+ vld1.16 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q6,d3,d23
+ vmlal.s16 q6,d2,d22
+ vmlal.s16 q6,d4,d24
+ vmlal.s16 q6,d5,d25
+ vmlal.s16 q6,d6,d26
+ vmlal.s16 q6,d7,d27
+ vmlal.s16 q6,d16,d28
+ vmlal.s16 q6,d17,d29
+ vst1.32 {d14[0]},[r14],r6
+ vqshrn.s32 d10, q5, #6
+ vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.16 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q7,d4,d23
+ vmlal.s16 q7,d3,d22
+ vmlal.s16 q7,d5,d24
+ vmlal.s16 q7,d6,d25
+ vmlal.s16 q7,d7,d26
+ vmlal.s16 q7,d16,d27
+ vmlal.s16 q7,d17,d28
+ vmlal.s16 q7,d18,d29
+ vqshrn.s32 d12, q6, #6
+ vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ add r14,r1,r6
+ vst1.32 {d8[0]},[r1]! @vst1_u8(pu1_dst,sto_res)@
+
+epilog_end:
+ vst1.32 {d10[0]},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ vqrshrun.s16 d12,q6,#6
+
+ vst1.32 {d12[0]},[r14],r6
+ vqshrn.s32 d14, q7, #6
+ vqrshrun.s16 d14,q7,#6
+
+ vst1.32 {d14[0]},[r14],r6
+
+
+end_loops:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
diff --git a/common/arm/ihevc_inter_pred_luma_copy.s b/common/arm/ihevc_inter_pred_luma_copy.s
new file mode 100644
index 0000000..8a61369
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_copy.s
@@ -0,0 +1,188 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction luma function for copy
+@*
+@* @par description:
+@* copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the location pointed by 'dst'
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_inter_pred_luma_copy (
+@ uword8 *pu1_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r7 => ht
+@ r12 => wd
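+
+@/* A minimal C reference sketch of the copy, assuming plain row-by-row byte
+@   copy semantics as stated in the description above (names illustrative):
+@
+@   void ref_luma_copy(uword8 *pu1_src, uword8 *pu1_dst,
+@                      word32 src_strd, word32 dst_strd,
+@                      word32 ht, word32 wd)
+@   {
+@       for(word32 row = 0; row < ht; row++, pu1_src += src_strd,
+@                                            pu1_dst += dst_strd)
+@           for(word32 col = 0; col < wd; col++) /* copy one row of wd bytes */
+@               pu1_dst[col] = pu1_src[col];
+@   }
+@*/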
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_copy_a9q
+
+.type ihevc_inter_pred_luma_copy_a9q, %function
+
+ihevc_inter_pred_luma_copy_a9q:
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r12,[sp,#48] @loads wd
+ ldr r7,[sp,#44] @loads ht
+ cmp r7,#0 @checks ht == 0
+ ble end_loops
+ tst r12,#15 @checks if wd is a multiple of 16
+ beq core_loop_wd_16
+ tst r12,#7 @checks if wd is a multiple of 8
+ beq core_loop_wd_8
+ sub r11,r12,#4
+
+outer_loop_wd_4:
+ subs r4,r12,#0 @checks wd == 0
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.32 {d0[0]},[r0] @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ subs r4,r4,#4 @(wd -4)
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ vld1.32 {d0[0]},[r5],r2 @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add r1,r1,#4 @pu1_dst += 4
+ vst1.32 {d0[0]},[r6],r3 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7,r7,#4 @ht - 4
+ sub r0,r5,r11 @pu1_src = pu1_src_tmp
+ sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_4
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+core_loop_wd_8:
+ sub r11,r12,#8
+
+outer_loop_wd_8:
+ subs r4,r12,#0 @checks wd
+ ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {d0},[r0]! @vld1_u8(pu1_src_tmp)
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {d0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d1},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4,r4,#8 @wd - 8(loop condition)
+ vld1.8 {d2},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {d3},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {d3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+ subs r7,r7,#4 @ht -= 4
+ sub r0,r5,r11 @pu1_src = pu1_src_tmp
+ sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_8
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+core_loop_wd_16:
+ sub r11,r12,#16
+
+outer_loop_wd_16:
+ subs r4,r12,#0 @checks wd
+ ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+ add r5,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {q0},[r0]! @vld1_u8(pu1_src_tmp)
+ add r6,r1,r3 @pu1_dst_tmp += dst_strd
+ vst1.8 {q0},[r1]! @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q1},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q1},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ subs r4,r4,#16 @wd - 16 (loop condition)
+ vld1.8 {q2},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q2},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ vld1.8 {q3},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.8 {q3},[r6],r3 @vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+ subs r7,r7,#4 @ht -= 4
+ sub r0,r5,r11 @pu1_src = pu1_src_tmp
+ sub r1,r6,r11 @pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_16
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
new file mode 100644
index 0000000..771bcb3
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s
@@ -0,0 +1,249 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction luma function for copy
+@*
+@* @par description:
+@* copies the array of width 'wd' and height 'ht' from the location pointed
+@* by 'src' to the location pointed by 'dst'
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_copy_w16out (
+@ uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+
+@**************variables vs registers*****************************************
+@ r0 => *pu1_src
+@ r1 => *pi2_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r7 => ht
+@ r12 => wd
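+
+@/* A minimal C reference sketch, assuming the w16out variant widens each
+@   pixel to 16 bits and pre-shifts it left by 6 (as the vshl #6 below does)
+@   so it can feed later filtering or weighted prediction (names illustrative):
+@
+@   void ref_luma_copy_w16out(uword8 *pu1_src, word16 *pi2_dst,
+@                             word32 src_strd, word32 dst_strd,
+@                             word32 ht, word32 wd)
+@   {
+@       for(word32 row = 0; row < ht; row++, pu1_src += src_strd,
+@                                            pi2_dst += dst_strd)
+@           for(word32 col = 0; col < wd; col++)
+@               pi2_dst[col] = (word16)(pu1_src[col] << 6); /* widen, << 6 */
+@   }
+@*/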
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_copy_w16out_a9q
+
+.type ihevc_inter_pred_luma_copy_w16out_a9q, %function
+
+ihevc_inter_pred_luma_copy_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r12,[sp,#48] @loads wd
+ ldr r7,[sp,#44] @loads ht
+ cmp r7,#0 @ht condition(ht == 0)
+ ble end_loops @loop
+ tst r12,#7 @checks if wd is a multiple of 8
+ beq core_loop_wd_8
+ sub r11,r12,#4
+ lsls r6,r3,#1
+
+outer_loop_wd_4:
+ subs r4,r12,#0 @wd conditional subtract
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ vld1.8 {d0},[r0] @vld1_u8(pu1_src_tmp)
+ add r5,r0,r2 @pu1_src +src_strd
+ vmovl.u8 q0,d0 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ add r10,r1,r6
+ subs r4,r4,#4 @wd - 4
+ vshl.i64 q0,q0,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d22},[r5],r2 @vld1_u8(pu1_src_tmp)
+ add r0,r0,#4 @pu1_src += 4
+ vst1.64 {d0},[r1] @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add r1,r1,#8
+ vmovl.u8 q11,d22 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vld1.8 {d24},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i64 q11,q11,#6 @vshlq_n_s64(temp, 6)
+ vmovl.u8 q12,d24 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vst1.64 {d22},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ vshl.i64 q12,q12,#6 @vshlq_n_s64(temp, 6)
+ vld1.8 {d26},[r5],r2 @vld1_u8(pu1_src_tmp)
+ vst1.64 {d24},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ vmovl.u8 q13,d26 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vshl.i64 q13,q13,#6 @vshlq_n_s64(temp, 6)
+ vst1.64 {d26},[r10],r6 @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs r7,r7,#4 @ht - 4
+ sub r0,r5,r11
+ sub r1,r10,r11,lsl #1
+ bgt outer_loop_wd_4
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+core_loop_wd_8:
+ @sub r11,r12,#8
+ lsls r5,r3,#1
+ rsb r11,r12,r3, lsl #2 @ r11 = (dst_strd * 4) - width
+ rsb r8,r12,r2,lsl #2 @r2->src_strd
+ mov r4,r12, lsr #3 @ divide by 8
+ mul r7, r4
+ sub r4,r12,#0 @wd conditional check
+ sub r7,r7,#4 @reserve one iteration (4 rows) for the epilog
+
+prolog:
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ add r10,r1,r5
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ subs r4,r4,#8 @wd decrements by 8
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+ addle r0,r0,r8
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+ addle r1,r1,r11,lsl #1
+ suble r4,r12,#0 @wd conditional check
+
+ subs r7,r7,#4 @ht - 4
+
+ blt epilog_end @jumps to epilog_end
+ beq epilog @jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ subs r4,r4,#8 @wd decrements by 8
+ addle r0,r0,r8
+
+ add r6,r0,r2 @pu1_src_tmp += src_strd
+
+ vld1.8 {d8},[r0]! @vld1_u8(pu1_src_tmp)
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d10},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d12},[r6],r2 @vld1_u8(pu1_src_tmp)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+
+ vld1.8 {d14},[r6],r2 @vld1_u8(pu1_src_tmp)
+ add r10,r1,r5
+
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+
+ addle r1,r1,r11,lsl #1
+ suble r4,r12,#0 @wd conditional check
+
+ subs r7,r7,#4 @ht - 4
+ bgt outer_loop_wd_8
+
+epilog:
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q8,d8 @vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q9,d10 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vmovl.u8 q10,d12 @vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ vmovl.u8 q11,d14 @vmovl_u8(vld1_u8(pu1_src_tmp)
+ @add r6,r0,r2 @pu1_src_tmp += src_strd
+
+ vshl.i16 q0,q8,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q1,q9,#6 @vshlq_n_s16(tmp, 6)
+ vshl.i16 q2,q10,#6 @vshlq_n_s16(tmp, 6)
+ add r10,r1,r5
+ vshl.i16 q3,q11,#6 @vshlq_n_s16(tmp, 6)
+
+ vst1.16 {d0,d1},[r1]! @vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+ vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d4,d5},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+ vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp)
+
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
new file mode 100644
index 0000000..b27b2e8
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s
@@ -0,0 +1,603 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@* ihevc_inter_pred_luma_horz_w16out.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@*
+@* - ihevc_inter_pred_luma_horz_w16out()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* interprediction luma filter for horizontal 16bit output
+@*
+@* @par description:
+@* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+@* to the elements pointed by 'pu1_src' and writes to the location pointed
+@* by 'pi2_dst'. no downshifting or clipping is done and the output is used
+@* as an input for vertical filtering or weighted prediction.
+@* assumptions : the function is optimized considering the fact width is a
+@* multiple of 4 or 8. if width is a multiple of 4 then height should be a
+@* multiple of 2; the width 8 case is optimized further.
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pi2_dst
+@* word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
+@ word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@                                       word32 wd )
+
+
+@r0 - free
+@r1 - dst_ptr
+@r2 - src_strd
+@r3 - dst_strd
+@r4 - src_ptr2
+@r5 - inner loop counter
+@r6 - dst_ptr2
+@r7 - free
+@r8 - dst_strd2
+@r9 - src_strd1
+@r10 - wd
+@r11 - #1
+@r12 - src_ptr1
+@r14 - loop_counter
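+
+@/* A minimal C reference sketch of the horizontal 16-bit-output filter,
+@   assuming 8-tap filtering with no downshift or clipping as stated in the
+@   description above (names illustrative):
+@
+@   void ref_luma_horz_w16out(uword8 *pu1_src, word16 *pi2_dst,
+@                             word32 src_strd, word32 dst_strd,
+@                             word8 *pi1_coeff, word32 ht, word32 wd)
+@   {
+@       for(word32 row = 0; row < ht; row++, pu1_src += src_strd,
+@                                            pi2_dst += dst_strd)
+@           for(word32 col = 0; col < wd; col++)
+@           {
+@               word32 i4_sum = 0;
+@               for(word32 i = 0; i < 8; i++)      /* 8-tap horizontal MAC */
+@                   i4_sum += pi1_coeff[i] * pu1_src[col + i - 3];
+@               pi2_dst[col] = (word16)i4_sum;     /* raw 16-bit result */
+@           }
+@   }
+@*/
+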
+.text
+.align 4
+
+
+
+
+
+.globl ihevc_inter_pred_luma_horz_w16out_a9q
+
+.type ihevc_inter_pred_luma_horz_w16out_a9q, %function
+
+ihevc_inter_pred_luma_horz_w16out_a9q:
+
+ bic r14, #1 @ clearing bit[0] of lr, so that the return goes back to ARM mode
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r4,[sp,#40] @loads pi1_coeff
+ ldr r7,[sp,#44] @loads ht
+
+
+ vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff)
+ sub r14,r7,#0 @checks for ht == 0
+ vabs.s8 d2,d0 @vabs_s8(coeff)
+ mov r11,#1
+ @ble end_loops
+ ldr r10,[sp,#48] @loads wd
+ vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub r12,r0,#3 @pu1_src - 3
+ vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add r4,r12,r2 @pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26,d2[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ rsb r9,r10,r2,lsl #1 @2*src_strd - wd
+ vdup.8 d27,d2[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ rsb r8,r10,r3 @dst_strd - wd
+ vdup.8 d28,d2[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+ vdup.8 d29,d2[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+ and r7,r14,#1 @calculating ht_residue ht_residue = (ht & 1)
+ vdup.8 d30,d2[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+ sub r14,r14,r7 @decrement height by ht_residue(residue value is calculated outside)
+ vdup.8 d31,d2[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+ cmp r7,#1
+ beq odd_height_decision
+
+even_height_decision:
+ mov r7,r1
+ cmp r10,#4
+ ble outer_loop_4
+
+ cmp r10,#24
+ moveq r10,#16
+ addeq r8,#8
+ addeq r9,#8
+
+ cmp r10,#16
+ bge outer_loop_16_branch
+
+ cmp r10,#12
+ addeq r8,#4
+ addeq r9,#4
+outer_loop_8_branch:
+ b outer_loop_8
+
+outer_loop_16_branch:
+ b outer_loop_16
+
+
+odd_height_decision:
+ cmp r10,#24
+ beq outer_loop_8_branch
+ cmp r10,#12
+ beq outer_loop_4
+ b even_height_decision
+
+outer_loop4_residual:
+ sub r12,r0,#3 @pu1_src - 3
+ mov r1,r7
+ add r1,#16
+ mov r10,#4
+ add r12,#8
+ mov r14,#16
+ add r8,#4
+ add r9,#4
+
+outer_loop_4:
+ add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
+ add r4,r12,r2 @pu1_src + src_strd
+
+ subs r5,r10,#0 @checks wd
+ ble end_inner_loop_4
+
+inner_loop_4:
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11
+ vld1.u32 {d2},[r12],r11
+ vld1.u32 {d3},[r12],r11
+ vld1.u32 {d4},[r12],r11
+ vld1.u32 {d5},[r12],r11
+ vld1.u32 {d6},[r12],r11
+ vld1.u32 {d7},[r12],r11
+ @add r12,r12,#4 @increment the input pointer
+ sub r12,r12,#4
+ @vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
+ @vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+
+ @vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
+ @vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+ @vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
+ @vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
+ vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
+ vld1.u32 {d13},[r4],r11
+ vzip.32 d0,d12 @vector zip the i and ii iterations in a single register
+ vld1.u32 {d14},[r4],r11
+ vzip.32 d1,d13
+ vld1.u32 {d15},[r4],r11
+ vzip.32 d2,d14
+ vld1.u32 {d16},[r4],r11
+ vzip.32 d3,d15
+ vld1.u32 {d17},[r4],r11
+ vzip.32 d4,d16
+ vld1.u32 {d18},[r4],r11
+ vzip.32 d5,d17
+ vld1.u32 {d19},[r4],r11
+ sub r4,r4,#4
+ @ add r4,r4,#4 @increment the input pointer
+ @ vext.u8 d14,d12,d13,#2 @vector extract of src[0_2]
+ @ vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
+ @ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
+ @ vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+ @ vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
+ @vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
+
+
+
+
+
+
+
+ vzip.32 d6,d18
+ vzip.32 d7,d19
+
+ vmull.u8 q4,d1,d25 @arithmetic operations for the i and ii iterations at the same time
+ vmlsl.u8 q4,d0,d24
+ vmlsl.u8 q4,d2,d26
+ vmlal.u8 q4,d3,d27
+ vmlal.u8 q4,d4,d28
+ vmlsl.u8 q4,d5,d29
+ vmlal.u8 q4,d6,d30
+ vmlsl.u8 q4,d7,d31
+
+ @ vqrshrun.s16 d8,q4,#6 @narrow right shift and saturating the result
+ vst1.64 {d8},[r1]! @store the i iteration result (lower half of q4)
+ vst1.64 {d9},[r6]! @store the ii iteration result (upper half of q4)
+ subs r5,r5,#4 @decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4:
+ subs r14,r14,#2 @decrement the ht by 2
+ add r12,r12,r9 @increment the input pointer 2*src_strd-wd
+ add r1,r6,r8,lsl #1 @increment the output pointer 2*dst_strd-wd
+ bgt outer_loop_4
+
+
+height_residue_4:
+
+ ldr r7,[sp,#44] @loads ht
+ and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1)
+ cmp r7,#0
+ @beq end_loops
+ ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
+
+outer_loop_height_residue_4:
+
+
+ subs r5,r10,#0 @checks wd
+ ble end_inner_loop_height_residue_4
+
+inner_loop_height_residue_4:
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11
+
+
+
+
+
+
+ @ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
+ @ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+
+
+
+ @add r12,r12,#4 @increment the input pointer
+ @ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
+ @ vext.u8 d6,d0,d1,#6 @vector extract of src[0_6]
+ @ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
+ @ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
+ vld1.u32 {d2},[r12],r11
+ vmull.u8 q4,d1,d25 @arithmetic operations for the residual row
+ vld1.u32 {d3},[r12],r11
+ vmlsl.u8 q4,d0,d24
+ vld1.u32 {d4},[r12],r11
+ vmlsl.u8 q4,d2,d26
+ vld1.u32 {d5},[r12],r11
+ vmlal.u8 q4,d3,d27
+ vld1.u32 {d6},[r12],r11
+ vmlal.u8 q4,d4,d28
+ vld1.u32 {d7},[r12],r11
+ vmlsl.u8 q4,d5,d29
+ sub r12,r12,#4
+ vmlal.u8 q4,d6,d30
+ vmlsl.u8 q4,d7,d31
+ subs r5,r5,#4 @decrement the wd by 4
+ vst1.64 {d8},[r1]! @store the residual row result
+ bgt inner_loop_height_residue_4
+
+end_inner_loop_height_residue_4:
+ subs r7,r7,#1 @decrement the ht by 1
+ rsb r9,r10,r2
+ add r12,r12,r9 @increment the input pointer src_strd-wd
+ add r1,r1,r8 @increment the output pointer dst_strd-wd
+ bgt outer_loop_height_residue_4
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+outer_loop8_residual:
+ sub r12,r0,#3 @pu1_src - 3
+ mov r1,r7
+ mov r14,#32
+ add r1,#32
+ add r12,#16
+ mov r10,#8
+ add r8,#8
+ add r9,#8
+
+outer_loop_8:
+
+ add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
+ add r4,r12,r2 @pu1_src + src_strd
+ subs r5,r10,#0 @checks wd
+
+ ble end_inner_loop_8
+
+inner_loop_8:
+ vld1.u32 {d0},[r12],r11 @vector load pu1_src
+ vld1.u32 {d1},[r12],r11
+ vld1.u32 {d2},[r12],r11
+ vld1.u32 {d3},[r12],r11
+
+
+
+
+
+ @ vext.u8 d2,d0,d1,#2 @vector extract of src[0_2]
+ @ vext.u8 d3,d0,d1,#3 @vector extract of src[0_3]
+ @ vext.u8 d4,d0,d1,#4 @vector extract of src[0_4]
+ @ vext.u8 d5,d0,d1,#5 @vector extract of src[0_5]
+ @ vext.u8 d6,d0,d1,#6 @vector extract of src [0_6]
+ @ vext.u8 d7,d0,d1,#7 @vector extract of src[0_7]
+ @ vext.u8 d1,d0,d1,#1 @vector extract of src[0_1]
+ @ vext.u8 d14,d12,d13,#2
+
+ @vext.u8 d15,d12,d13,#3 @vector extract of src[0_3]
+ @ vext.u8 d16,d12,d13,#4 @vector extract of src[0_4]
+ @ vext.u8 d17,d12,d13,#5 @vector extract of src[0_5]
+ @vext.u8 d18,d12,d13,#6 @vector extract of src[0_6]
+ @vext.u8 d19,d12,d13,#7 @vector extract of src[0_7]
+ @vext.u8 d13,d12,d13,#1 @vector extract of src[0_1]
+ vld1.u32 {d4},[r12],r11
+ vmull.u8 q4,d1,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {d5},[r12],r11
+ vmlal.u8 q4,d3,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {d6},[r12],r11
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {d7},[r12],r11
+ vmlsl.u8 q4,d2,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {d12},[r4],r11 @vector load pu1_src + src_strd
+ vmlal.u8 q4,d4,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ vld1.u32 {d13},[r4],r11
+ vmlsl.u8 q4,d5,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ vld1.u32 {d14},[r4],r11
+ vmlal.u8 q4,d6,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vld1.u32 {d15},[r4],r11
+ vmlsl.u8 q4,d7,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+ vld1.u32 {d16},[r4],r11 @vector load pu1_src + src_strd
+
+ vmull.u8 q5,d15,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {d17},[r4],r11
+ vmlsl.u8 q5,d14,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vld1.u32 {d18},[r4],r11
+ vmlal.u8 q5,d16,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ vld1.u32 {d19},[r4],r11 @vector load pu1_src + src_strd
+ vmlsl.u8 q5,d17,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ @ vqrshrun.s16 d20,q4,#6 @right shift and saturating narrow result 1
+ vmlal.u8 q5,d18,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vmlsl.u8 q5,d19,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+ vst1.16 {q4},[r1]! @store the result pu1_dst
+ vmlsl.u8 q5,d12,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vmlal.u8 q5,d13,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+
+
+ @ vqrshrun.s16 d8,q5,#6 @right shift and saturating narrow result 2
+ subs r5,r5,#8 @decrement the wd loop
+ vst1.16 {q5},[r6]! @store the result pu1_dst
+ cmp r5,#4
+ bgt inner_loop_8
+
+end_inner_loop_8:
+ subs r14,r14,#2 @decrement the ht loop
+ add r12,r12,r9 @increment the src pointer by 2*src_strd-wd
+ add r1,r6,r8,lsl #1 @increment the dst pointer by 2*dst_strd-wd
+ bgt outer_loop_8
+
+
+
+
+
+ ldr r10,[sp,#48] @loads wd
+ cmp r10,#12
+
+ beq outer_loop4_residual
+
+ ldr r7,[sp,#44] @loads ht
+ and r7,r7,#1
+ cmp r7,#1
+ beq height_residue_4
+
+@end_loops
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+outer_loop_16:
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6,r1,r3,lsl #1 @pu1_dst + dst_strd
+ add r4,r12,r2 @pu1_src + src_strd
+ and r0, r12, #31
+ sub r5,r10,#0 @checks wd
+ @ble end_loops1
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ pld [r4, r2, lsl #1]
+ vld1.u32 {q1},[r12],r11
+ vld1.u32 {q2},[r12],r11
+ vld1.u32 {q3},[r12],r11
+ vld1.u32 {q6},[r12],r11
+ vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q7},[r12],r11
+ vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q8},[r12],r11
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q9},[r12],r11
+ vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+
+inner_loop_16:
+
+
+ subs r5,r5,#16
+ vmull.u8 q10,d3,d25
+
+ add r12,#8
+ vmlsl.u8 q10,d1,d24
+
+ vld1.u32 {q0},[r4],r11 @vector load pu1_src
+ vmlal.u8 q10,d7,d27
+
+ vld1.u32 {q1},[r4],r11
+ vmlsl.u8 q10,d5,d26
+
+ vld1.u32 {q2},[r4],r11
+ vmlal.u8 q10,d13,d28
+
+ vld1.u32 {q3},[r4],r11
+ vmlal.u8 q10,d17,d30
+
+ vld1.u32 {q6},[r4],r11
+ vmlsl.u8 q10,d15,d29
+
+ vld1.u32 {q7},[r4],r11
+ vmlsl.u8 q10,d19,d31
+
+ vld1.u32 {q8},[r4],r11
+ vmull.u8 q5,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+
+ vld1.u32 {q9},[r4],r11
+ vmlal.u8 q5,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+
+ add r4,#8
+ vmlsl.u8 q5,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ vst1.8 {q4},[r1]! @store the result pu1_dst
+ vmlsl.u8 q5,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+
+ addeq r12,r12,r9 @increment the src pointer by 2*src_strd-wd
+ vmlal.u8 q5,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+
+ addeq r4,r12,r2 @pu1_src + src_strd
+ vmlsl.u8 q5,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+
+@ and r7, r12, #31
+ vmlal.u8 q5,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+
+ subeq r14,r14,#2
+ vmlsl.u8 q5,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+
+ @cmp r7, r0
+ vmull.u8 q11,d3,d25
+
+@ pld [r12, r2, lsl #2]
+ vmlsl.u8 q11,d1,d24
+
+ vst1.16 {q10},[r1]!
+ vmlal.u8 q11,d7,d27
+
+@ pld [r4, r2, lsl #2]
+ vmlsl.u8 q11,d5,d26
+
+@ mov r0, r7
+ vmlal.u8 q11,d13,d28
+
+ cmp r14,#0
+ vmlal.u8 q11,d17,d30
+
+ vst1.16 {q5},[r6]!
+ vmlsl.u8 q11,d15,d29
+
+ vmlsl.u8 q11,d19,d31
+
+ beq epilog_16
+
+ vld1.u32 {q0},[r12],r11 @vector load pu1_src
+ vld1.u32 {q1},[r12],r11
+ vld1.u32 {q2},[r12],r11
+ vld1.u32 {q3},[r12],r11
+ vld1.u32 {q6},[r12],r11
+ vmull.u8 q4,d2,d25 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
+ vld1.u32 {q7},[r12],r11
+ vmlal.u8 q4,d6,d27 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
+ vld1.u32 {q8},[r12],r11
+ vmlsl.u8 q4,d0,d24 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
+ vld1.u32 {q9},[r12],r11
+ vmlsl.u8 q4,d4,d26 @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
+ vmlal.u8 q4,d12,d28 @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
+ cmp r5,#0
+ vmlsl.u8 q4,d14,d29 @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
+ moveq r5,r10
+ vmlal.u8 q4,d16,d30 @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
+ vst1.8 {q11},[r6]! @store the result pu1_dst
+ vmlsl.u8 q4,d18,d31 @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
+ addeq r1,r6,r8,lsl #1
+ addeq r6,r1,r3,lsl #1 @pu1_dst + dst_strd
+ b inner_loop_16
+
+
+epilog_16:
+@ vqrshrun.s16 d11,q11,#6
+ vst1.8 {q11},[r6]! @store the result pu1_dst
+
+ ldr r7, [sp], #4
+ ldr r0, [sp], #4
+ ldr r10,[sp,#48]
+ cmp r10,#24
+ beq outer_loop8_residual
+ add r1,r6,r8,lsl #1
+ ldr r7,[sp,#44] @loads ht
+ and r7,r7,#1
+ cmp r7,#1
+ beq height_residue_4
+
+end_loops1:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
diff --git a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..c6716fe
--- /dev/null
+++ b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -0,0 +1,404 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@******************************************************************************
+@* @file
+@* ihevc_inter_pred_luma_vert_w16inp_w16out.s
+@*
+@* @brief
+@* contains function definitions for inter prediction interpolation.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@* - ihevc_inter_pred_luma_vert_w16inp_w16out()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+@/* include reconstruction */
+@
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma vertical filter for 16bit input and 16bit output.
+@*
+@* @par description:
+@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+@* the elements pointed by 'pi2_src' and writes to the location pointed by
+@* 'pi2_dst'. input is 16 bits; the filter output is offset-corrected and
+@* downshifted by 6, and is stored as 16 bits without clipping.
+@* assumptions : the function is optimized considering the fact width is a
+@* multiple of 4 and height a multiple of 2.
+@*
+@* @param[in] pi2_src
+@* word16 pointer to the source
+@*
+@* @param[out] pi2_dst
+@*  word16 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the filter coefficients
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] wd
+@* integer width of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
+@                                              word16 *pi2_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word8 *pi1_coeff,
+@ word32 ht,
+@ word32 wd )
+@**************variables vs registers*****************************************
+@ r0 => *pi2_src
+@ r1 => *pi2_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => *pi1_coeff
+@ r5 => ht
+@ r6 => wd
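+
+@/* A minimal C reference sketch, assuming the w16inp_w16out variant keeps a
+@   16-bit output: the 32-bit accumulator has the offset 0x80000 subtracted
+@   and is then narrowed with a >> 6, matching the vsub.s32 / vshrn.s32 #6
+@   pair below (names illustrative):
+@
+@   void ref_luma_vert_w16inp_w16out(word16 *pi2_src, word16 *pi2_dst,
+@                                    word32 src_strd, word32 dst_strd,
+@                                    word8 *pi1_coeff, word32 ht, word32 wd)
+@   {
+@       for(word32 row = 0; row < ht; row++, pi2_src += src_strd,
+@                                            pi2_dst += dst_strd)
+@           for(word32 col = 0; col < wd; col++)
+@           {
+@               word32 i4_sum = 0;
+@               for(word32 i = 0; i < 8; i++)      /* 8-tap vertical MAC */
+@                   i4_sum += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
+@               pi2_dst[col] = (word16)((i4_sum - 0x80000) >> 6);
+@           }
+@   }
+@*/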
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q
+
+.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function
+
+ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r12,[sp,#40] @load pi1_coeff
+ mov r6,r3,lsl #1
+ ldr r5,[sp,#48] @load wd
+ vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
+ mov r2, r2, lsl #1
+ sub r12,r2,r2,lsl #2 @r12 = -3 * src_strd
+ @vabs.s8 d0,d0 @vabs_s8(coeff)
+ add r0,r0,r12 @pu1_src -= 3 * src_strd
+ ldr r3,[sp,#44] @load ht
+ subs r7,r3,#0 @r3->ht
+ @ble end_loops @end loop jump
+ vmovl.s8 q0,d0
+ vdup.16 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
+ vdup.16 d23,d0[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
+ vdup.16 d24,d0[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
+ vdup.16 d25,d0[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
+ vdup.16 d26,d1[0] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
+ vdup.16 d27,d1[1] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
+ vdup.16 d28,d1[2] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
+ vdup.16 d29,d1[3] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
+ vmov.i32 q15,#0x80000
+
+ rsb r9,r5,r6,lsl #2 @r6->dst_strd r5 ->wd
+ rsb r8,r5,r2,lsl #2 @r2->src_strd
+ sub r8,r8,r5
+ sub r9,r9,r5
+ mov r3, r5, lsr #2 @divide by 4
+ mul r7, r3 @multiply height by width
+ sub r7, #4 @reserve one iteration (4 rows) for the epilog
+ mov r4,r5 @r5 ->wd
+ @mov r2, r2, lsl #1
+
+prolog:
+
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vld1.16 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vld1.16 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ subs r4,r4,#4
+ vld1.16 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vld1.16 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d0,d22 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vld1.16 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d2,d24 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vld1.16 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vld1.16 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vld1.16 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q4,d5,d27 @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.s16 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlal.s16 q4,d7,d29 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+
+ vld1.16 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+ vmull.s16 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ addle r0,r0,r8,lsl #0
+ vmlal.s16 q5,d1,d22 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+ movle r4,r5 @r5 ->wd
+ vmlal.s16 q5,d3,d24 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vld1.16 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vld1.16 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+ vmlal.s16 q5,d6,d27 @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vmlal.s16 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vmlal.s16 q5,d16,d29 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+ vsub.s32 q4, q4, q15
+
+ vld1.16 {d1},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q6,d3,d23
+ vld1.16 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q6,d2,d22
+ vld1.16 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q6,d4,d24
+ vmlal.s16 q6,d5,d25
+ vmlal.s16 q6,d6,d26
+ vmlal.s16 q6,d7,d27
+ vmlal.s16 q6,d16,d28
+ vmlal.s16 q6,d17,d29
+ add r14,r1,r6
+ vsub.s32 q5, q5, q15
+ vshrn.s32 d8, q4, #6
+ @vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vmull.s16 q7,d4,d23
+ vmlal.s16 q7,d3,d22
+ vmlal.s16 q7,d5,d24
+ vmlal.s16 q7,d6,d25
+ vld1.16 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d7,d26
+ vld1.16 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d16,d27
+ vld1.16 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d17,d28
+ vld1.16 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d18,d29
+ vld1.16 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vst1.32 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
+ vsub.s32 q6, q6, q15
+ vshrn.s32 d10, q5, #6
+ @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ addle r1,r1,r9
+
+ subs r7,r7,#4
+
+
+ blt epilog_end @jumps to epilog_end
+ beq epilog @jumps to epilog
+
+kernel_8:
+
+ vmull.s16 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ subs r4,r4,#4
+ vmlal.s16 q4,d0,d22 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+ addle r0,r0,r8,lsl #0
+ vmlal.s16 q4,d2,d24 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vmlal.s16 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vmlal.s16 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vmlal.s16 q4,d5,d27 @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.s16 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlal.s16 q4,d7,d29 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.32 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+
+ vsub.s32 q7, q7, q15
+ vshrn.s32 d12, q6, #6
+ @vqrshrun.s16 d12,q6,#6
+ vld1.16 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+ vmull.s16 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ vmlal.s16 q5,d1,d22 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+ vmlal.s16 q5,d3,d24 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vmlal.s16 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vmlal.s16 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ vmlal.s16 q5,d6,d27 @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vst1.32 {d12},[r14],r6
+
+ vmlal.s16 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vld1.16 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q5,d16,d29 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+
+ vsub.s32 q4, q4, q15
+ vshrn.s32 d14, q7, #6
+ @vqrshrun.s16 d14,q7,#6
+
+ vmull.s16 q6,d3,d23
+ movle r4,r5 @r5 ->wd
+
+ vmlal.s16 q6,d2,d22
+ vld1.16 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q6,d4,d24
+ add r3,r0,r2 @pu1_src_tmp += src_strd@
+
+ vmlal.s16 q6,d5,d25
+
+ vmlal.s16 q6,d6,d26
+ vst1.32 {d14},[r14],r6
+
+ vmlal.s16 q6,d7,d27
+ vld1.16 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q6,d16,d28
+ add r14,r1,r6
+
+ vmlal.s16 q6,d17,d29
+ vld1.16 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
+
+ vsub.s32 q5, q5, q15
+ vshrn.s32 d8, q4, #6
+ @vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ vld1.16 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+
+ vmull.s16 q7,d4,d23
+ vmlal.s16 q7,d3,d22
+ vmlal.s16 q7,d5,d24
+ vld1.16 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+
+ vmlal.s16 q7,d6,d25
+ vld1.16 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d7,d26
+ vld1.16 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d16,d27
+ vld1.16 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d17,d28
+ vld1.16 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
+ vmlal.s16 q7,d18,d29
+ vst1.32 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
+
+ vsub.s32 q6, q6, q15
+ vshrn.s32 d10, q5, #6
+ addle r1,r1,r9
+
+ @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+ subs r7,r7,#4
+
+ bgt kernel_8 @jumps to kernel_8
+
+epilog:
+
+ vmull.s16 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
+ vmlal.s16 q4,d0,d22 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
+ vmlal.s16 q4,d2,d24 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
+ vmlal.s16 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
+ vmlal.s16 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
+ vmlal.s16 q4,d5,d27 @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
+ vmlal.s16 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
+ vmlal.s16 q4,d7,d29 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
+ vst1.32 {d10},[r14],r6
+
+ vsub.s32 q7, q7, q15
+ vshrn.s32 d12, q6, #6
+ @vqrshrun.s16 d12,q6,#6
+
+ vld1.16 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
+ vmlal.s16 q5,d1,d22 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
+ vmlal.s16 q5,d3,d24 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
+ vmlal.s16 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
+ vmlal.s16 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
+ vmlal.s16 q5,d6,d27 @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
+ vmlal.s16 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
+ vmlal.s16 q5,d16,d29 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
+ vst1.32 {d12},[r14],r6
+
+ vsub.s32 q4, q4, q15
+ vshrn.s32 d14, q7, #6
+ @vqrshrun.s16 d14,q7,#6
+
+ vld1.16 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q6,d3,d23
+ vmlal.s16 q6,d2,d22
+ vmlal.s16 q6,d4,d24
+ vmlal.s16 q6,d5,d25
+ vmlal.s16 q6,d6,d26
+ vmlal.s16 q6,d7,d27
+ vmlal.s16 q6,d16,d28
+ vmlal.s16 q6,d17,d29
+ vst1.32 {d14},[r14],r6
+ vsub.s32 q5, q5, q15
+ vshrn.s32 d8, q4, #6
+ @vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ vld1.16 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
+ vmull.s16 q7,d4,d23
+ vmlal.s16 q7,d3,d22
+ vmlal.s16 q7,d5,d24
+ vmlal.s16 q7,d6,d25
+ vmlal.s16 q7,d7,d26
+ vmlal.s16 q7,d16,d27
+ vmlal.s16 q7,d17,d28
+ vmlal.s16 q7,d18,d29
+ vsub.s32 q6, q6, q15
+ vshrn.s32 d10, q5, #6
+ @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
+
+ add r14,r1,r6
+ vst1.32 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
+
+epilog_end:
+ vst1.32 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
+ vshrn.s32 d12, q6, #6
+ @vqrshrun.s16 d12,q6,#6
+
+ vst1.32 {d12},[r14],r6
+ vsub.s32 q7, q7, q15
+ vshrn.s32 d14, q7, #6
+ @vqrshrun.s16 d14,q7,#6
+
+ vst1.32 {d14},[r14],r6
+
+
+end_loops:
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_dc.s b/common/arm/ihevc_intra_pred_chroma_dc.s
new file mode 100644
index 0000000..72d9730
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_dc.s
@@ -0,0 +1,292 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_dc_neon.s
+@*
+@* @brief
+@* contains function definitions for intra prediction dc filtering.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for dc input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+@ pi1_coeff
+
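+@ A hedged C sketch (illustrative only, not part of the committed sources) of
+@ the DC value the accumulation loops below build per chroma plane, summing
+@ the left neighbours from src[2nt] and the top neighbours from src[4nt + 2]:
+@
+@ word32 acc_u = 0, acc_v = 0;
+@ for(i = 0; i < 2 * nt; i += 2)
+@ {
+@     acc_u += pu1_ref[2 * nt + i]     + pu1_ref[4 * nt + 2 + i];
+@     acc_v += pu1_ref[2 * nt + i + 1] + pu1_ref[4 * nt + 2 + i + 1];
+@ }
+@ dc_val_u = (acc_u + nt) >> (log2nt + 1);    @ log2nt + 1 == 32 - clz(nt)
+@ dc_val_v = (acc_v + nt) >> (log2nt + 1);
+@
+@ every output pixel pair is then written as (dc_val_u, dc_val_v).
+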
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_dc_a9q
+
+.type ihevc_intra_pred_chroma_dc_a9q, %function
+
+ihevc_intra_pred_chroma_dc_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ mov r9, #0
+ vmov d17, r9, r9
+
+ clz r5, r4 @counts leading zeros
+
+ add r6, r0, r4,lsl #1 @&src[2nt]
+ vmov d18, r9, r9
+ rsb r5, r5, #32 @32 - clz(nt) = log2nt + 1 (shift for averaging)
+ add r7, r0, r4, lsl #2 @&src[4nt]
+ mov r12,r5
+ add r8, r7, #2 @&src[4nt+2]
+
+ cmp r4, #4
+ beq dc_4 @nt=4 loop
+
+
+add_loop:
+ vld2.s8 {d30,d31}, [r6]! @load from src[nt]
+ lsl r10,r4,#1 @2nt
+
+ vpaddl.u8 d2, d30
+ subs r10, #0x10
+
+ vld2.s8 {d26,d27}, [r8]! @load from src[2nt+1]
+
+ vpaddl.u8 d3, d31
+ vpaddl.u16 d2, d2
+ vpaddl.u16 d3, d3
+
+ vpadal.u32 d17, d2
+
+ vpadal.u32 d18, d3
+
+ vpaddl.u8 d2, d26
+ vpaddl.u8 d3, d27
+
+ vpaddl.u16 d2, d2
+ vpaddl.u16 d3, d3
+
+ vpadal.u32 d17, d2
+ vpadal.u32 d18, d3
+
+ beq epil_add_loop
+
+core_loop_add:
+ vld2.s8 {d30,d31}, [r6]! @load from src[nt]
+ vpaddl.u8 d28, d30
+ vpaddl.u8 d3, d31
+
+ vld2.s8 {d26,d27}, [r8]! @load from src[2nt+1]
+
+ vpaddl.u16 d3, d3
+ vpaddl.u16 d29, d28
+
+ vpadal.u32 d18, d3
+ vpadal.u32 d17, d29
+
+ vpaddl.u8 d3, d27
+ vpaddl.u8 d28, d26
+
+ vpaddl.u16 d3, d3
+ vpaddl.u16 d29, d28
+
+ vpadal.u32 d18, d3
+ vpadal.u32 d17, d29
+
+
+epil_add_loop:
+
+ vmov.32 r1,d18[0]
+ vmov.32 r11,d17[0]
+
+ add r1,r1,r4
+ add r11,r11,r4
+
+ lsr r1,r1,r12
+ lsr r11,r11,r12
+
+ vdup.8 d17,r1
+ vdup.8 d16,r11
+
+prologue_cpy_32:
+
+ add r5, r2, r3
+ subs r9, r4, #8
+ lsl r6, r3, #2
+ moveq r11,r6
+ add r8, r5, r3
+ add r10, r8, r3
+
+ beq epilogue_copy
+
+ vst2.8 {d16,d17}, [r2]!
+ add r6, r6, #0xfffffff0 @r6 -= 16, i.e. r6 = 4*dst_strd - 16
+
+ vst2.8 {d16,d17}, [r5]!
+ vst2.8 {d16,d17}, [r8]!
+ movne r11,#16
+ vst2.8 {d16,d17}, [r10]!
+
+
+ vst2.8 {d16,d17}, [r2], r6
+ vst2.8 {d16,d17}, [r5], r6
+ vst2.8 {d16,d17}, [r8], r6
+ vst2.8 {d16,d17}, [r10], r6
+
+kernel_copy:
+ vst2.8 {d16,d17}, [r2]!
+ vst2.8 {d16,d17}, [r5]!
+ vst2.8 {d16,d17}, [r8]!
+ vst2.8 {d16,d17}, [r10]!
+
+ vst2.8 {d16,d17}, [r2], r6
+ vst2.8 {d16,d17}, [r5], r6
+ vst2.8 {d16,d17}, [r8], r6
+ vst2.8 {d16,d17}, [r10], r6
+
+ vst2.8 {d16,d17}, [r2]!
+ vst2.8 {d16,d17}, [r5]!
+ vst2.8 {d16,d17}, [r8]!
+ vst2.8 {d16,d17}, [r10]!
+
+ vst2.8 {d16,d17}, [r2], r6
+ vst2.8 {d16,d17}, [r5], r6
+ vst2.8 {d16,d17}, [r8], r6
+ vst2.8 {d16,d17}, [r10], r6
+
+epilogue_copy:
+ vst2.8 {d16,d17}, [r2],r11
+ vst2.8 {d16,d17}, [r5],r11
+ vst2.8 {d16,d17}, [r8],r11
+ vst2.8 {d16,d17}, [r10],r11
+
+ vst2.8 {d16,d17}, [r2]
+ vst2.8 {d16,d17}, [r5]
+ vst2.8 {d16,d17}, [r8]
+ vst2.8 {d16,d17}, [r10]
+ b end_func
+
+dc_4:
+ vld2.s8 {d30,d31},[r6] @load from src[nt]
+ vshl.i64 d3,d30,#32
+
+ vld2.s8 {d26,d27},[r8] @load from src[2nt+1]
+ vshl.i64 d2,d31,#32
+
+ vpaddl.u8 d3,d3
+ vpaddl.u8 d2,d2
+ vpaddl.u16 d3,d3
+ vpaddl.u16 d2,d2
+ vpadal.u32 d17,d3
+ vpadal.u32 d18,d2
+
+ vshl.i64 d3,d26,#32
+ vshl.i64 d2,d27,#32
+ vpaddl.u8 d3,d3
+ vpaddl.u8 d2,d2
+ vpaddl.u16 d3,d3
+ vpaddl.u16 d2,d2
+ vpadal.u32 d17,d3
+ vpadal.u32 d18,d2
+
+ vmov.32 r10,d17[0]
+ vmov.32 r11,d18[0]
+
+ add r10,r10,r4
+ add r11,r11,r4
+ lsr r10,r10,r12
+ lsr r11,r11,r12
+ orr r10,r10,r11,lsl #8
+ vdup.16 d0,r10
+
+ vst1.8 {d0},[r2],r3
+ vst1.8 {d0},[r2],r3
+ vst1.8 {d0},[r2],r3
+ vst1.8 {d0},[r2]
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_horz.s b/common/arm/ihevc_intra_pred_chroma_horz.s
new file mode 100644
index 0000000..6089fd8
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_horz.s
@@ -0,0 +1,346 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_horz_neon.s
+@*
+@* @brief
+@* contains function definition for intra prediction interpolation filters
+@*
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_intra_pred_chroma_horz()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* intra prediction interpolation filter for horizontal chroma variable.
+@*
+@* @par description:
+@* horizontal intraprediction(mode 10) with reference samples location
+@* pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst' refer
+@* to section 8.4.4.2.6 in the standard (special case)
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* integer transform block size
+@*
+@* @param[in] mode
+@* integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
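+@ Horizontal chroma prediction is a straight copy: each output row is filled
+@ with the reference pair to its left, so no interpolation is needed. A
+@ hedged C sketch (illustrative only) of what the vdup/vst loops implement:
+@
+@ for(row = 0; row < nt; row++)
+@     for(col = 0; col < 2 * nt; col += 2)
+@     {
+@         pu1_dst[row * dst_strd + col]     = pu1_ref[2 * (two_nt - 1 - row)];
+@         pu1_dst[row * dst_strd + col + 1] = pu1_ref[2 * (two_nt - 1 - row) + 1];
+@     }
+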
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_horz_a9q
+
+.type ihevc_intra_pred_chroma_horz_a9q, %function
+
+ihevc_intra_pred_chroma_horz_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+
+ lsl r6,r4,#2 @four_nt
+
+ add r12,r0,r6 @*pu1_ref[four_nt]
+ cmp r4,#4 @if nt == 4
+ beq core_loop_4
+
+ cmp r4,#8 @if nt == 8
+ beq core_loop_8
+
+ @cmp r4,#16 @if nt == 16
+ @beq core_loop_16
+
+ sub r12,r12,#16 @move to 16th value pointer
+ add r9,r2,#16
+
+core_loop_16:
+ vld1.16 {q0},[r12] @load 16 bytes (8 pairs). d1[3] holds the pair for row 0.
+ sub r12,r12,#16
+ vld1.16 {q5},[r12] @load the previous 16 bytes (8 pairs). d11[3] holds the pair for row 8.
+
+ vdup.16 q1,d1[3] @duplicate the i value.
+
+ vdup.16 q2,d1[2] @duplicate the ii value.
+ vdup.16 q3,d1[1] @duplicate the iii value.
+ vst1.16 {q1},[r2],r3 @store in 1st row 0-16 columns
+ vst1.16 {q1},[r9],r3 @store in 1st row 16-32 columns
+
+ vdup.16 q4,d1[0]
+ vst1.16 {q2},[r2],r3
+ vst1.16 {q2},[r9],r3
+
+ vdup.16 q1,d0[3]
+ vst1.16 {q3},[r2],r3
+ vst1.16 {q3},[r9],r3
+
+ vdup.16 q2,d0[2]
+ vst1.16 {q4},[r2],r3
+ vst1.16 {q4},[r9],r3
+
+ vdup.16 q3,d0[1]
+ vst1.16 {q1},[r2],r3
+ vst1.16 {q1},[r9],r3
+
+ vdup.16 q4,d0[0]
+ vst1.16 {q2},[r2],r3
+ vst1.16 {q2},[r9],r3
+
+ vdup.16 q1,d11[3]
+ vst1.16 {q3},[r2],r3
+ vst1.16 {q3},[r9],r3
+
+ vdup.16 q2,d11[2]
+ vst1.16 {q4},[r2],r3
+ vst1.16 {q4},[r9],r3
+
+ vdup.16 q3,d11[1]
+ vst1.16 {q1},[r2],r3
+ vst1.16 {q1},[r9],r3
+
+ vdup.16 q4,d11[0]
+ vst1.16 {q2},[r2],r3
+ vst1.16 {q2},[r9],r3
+
+ vdup.16 q1,d10[3]
+ vst1.16 {q3},[r2],r3
+ vst1.16 {q3},[r9],r3
+
+ vdup.16 q2,d10[2]
+ vst1.16 {q4},[r2],r3
+ vst1.16 {q4},[r9],r3
+
+ vdup.16 q3,d10[1]
+ vst1.16 {q1},[r2],r3
+ vst1.16 {q1},[r9],r3
+ sub r12,r12,#16 @move to 16th value pointer
+
+ vdup.16 q4,d10[0]
+ vst1.16 {q2},[r2],r3
+ vst1.16 {q2},[r9],r3
+
+ subs r4,r4,#16 @decrement the loop count by 16
+ vst1.16 {q3},[r2],r3
+ vst1.16 {q3},[r9],r3
+
+ vst1.16 {q4},[r2],r3
+ vst1.16 {q4},[r9],r3
+ bgt core_loop_16
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+ b endloop
+
+core_loop_8:
+ ldrb lr,[r12],#1 @pu1_ref[two_nt]
+ @vld1.8 {q15},[r12] @pu1_ref[two_nt + 1 + col]
+
+ vdup.8 d28,lr
+ sub r12,r12,#17
+ vld1.8 {q0},[r12]
+
+ sub r12,r12,#16
+ vld1.8 {q15},[r12]
+ vdup.16 q5,d1[3]
+ @vmovl.u8 q13,d26
+
+ vdup.16 q1,d1[2]
+ @vsubl.u8 q12,d30,d28
+
+ vdup.16 q2,d1[1]
+ @vshr.s16 q12,q12,#1
+
+ vdup.16 q3,d1[0]
+ @vqadd.s16 q11,q13,q12
+
+ vdup.16 q4,d0[3]
+ @vqmovun.s16 d22,q11
+
+ vst1.16 {q5},[r2],r3
+
+ vdup.16 q5,d0[2]
+ @vsubl.u8 q12,d31,d28
+
+ vdup.16 q6,d0[1]
+ @vshr.s16 q12,q12,#1
+
+ vdup.16 q7,d0[0]
+ @vqadd.s16 q11,q13,q12
+
+ vdup.16 q8,d0[3]
+ @vqmovun.s16 d22,q11
+
+ vst1.16 {q1},[r2],r3
+ @sub r2,r2,#8
+
+ vst1.16 {q2},[r2],r3
+
+ vst1.16 {q3},[r2],r3
+ vst1.16 {q4},[r2],r3
+ vst1.16 {q5},[r2],r3
+
+ @vdup.8 q1,d0[2]
+ vst1.16 {q6},[r2],r3
+
+ @vdup.8 q2,d0[1]
+ vst1.16 {q7},[r2],r3
+
+ @vdup.8 q3,d0[0]
+ @vst1.8 {q7},[r2],r3
+
+ @vdup.8 q4,d0[3]
+ @vst1.8 {q8},[r2],r3
+
+ @vdup.8 q5,d0[2]
+ @vst1.8 {q1},[r2],r3
+
+ @vdup.8 q6,d0[1]
+ @vst1.8 {q2},[r2],r3
+
+ @vdup.8 q7,d0[0]
+ @vst1.8 {q3},[r2],r3
+
+ @vst1.8 {q4},[r2],r3
+ @vst1.8 {q5},[r2],r3
+ @vst1.8 {q6},[r2],r3
+ @vst1.8 {q7},[r2],r3
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+ b endloop
+
+
+core_loop_4:
+ ldrb lr,[r12] @pu1_ref[two_nt]
+ add r12,r12,#1 @pu1_ref[two_nt + 1]
+ @vld1.8 {d30},[r12] @pu1_ref[two_nt + 1 + col]
+
+ sub r12,r12,#9
+ vld1.8 {d0},[r12]
+ sub r12,r12,#8
+ vld1.8 {d30},[r12]
+ vdup.16 d26,d0[3]
+ vdup.8 d28,lr
+
+ vdup.16 d3,d0[2]
+ vmovl.u8 q13,d26
+
+ vdup.16 d4,d0[1]
+ vsubl.u8 q12,d30,d28
+
+ vdup.16 d5,d0[0]
+ vshr.s16 q12,q12,#1
+
+ vdup.16 d6,d0[3]
+ vqadd.s16 q11,q13,q12
+
+ vdup.16 d7,d0[2]
+ vqmovun.s16 d22,q11
+
+ vst1.8 {d6},[r2],r3
+ vst1.8 {d3},[r2],r3
+
+ vdup.16 d8,d0[1]
+ vst1.8 {d4},[r2],r3
+ vst1.8 {d5},[r2],r3
+
+ vdup.16 d9,d0[0]
+ @vst1.8 {d6},[r2],r3
+ @vst1.8 {d7},[r2],r3
+
+ @vst1.8 {d8},[r2],r3
+ @vst1.8 {d9},[r2],r3
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+ b endloop
+
+
+@core_loop_4
+ ldrb lr,[r12] @pu1_ref[two_nt]
+ add r12,r12,#1 @pu1_ref[two_nt + 1]
+ vld1.8 {d30},[r12] @pu1_ref[two_nt + 1 + col]
+
+ sub r12,r12,#5
+ vld1.8 {d0},[r12]
+ vdup.8 d28,lr
+ vdup.8 d26,d0[3]
+ vmovl.u8 q13,d26
+
+ vdup.8 d3,d0[2]
+ vsubl.u8 q12,d30,d28
+
+ vdup.8 d4,d0[1]
+ vshr.s16 q12,q12,#1
+
+ vdup.8 d5,d0[0]
+ vqadd.s16 q11,q13,q12
+
+ vqmovun.s16 d22,q11
+
+ vst1.32 {d22[0]},[r2],r3
+ vst1.32 {d3[0]},[r2],r3
+ vst1.32 {d4[0]},[r2],r3
+ vst1.32 {d5[0]},[r2],r3
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+endloop:
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode2.s b/common/arm/ihevc_intra_pred_chroma_mode2.s
new file mode 100644
index 0000000..cfa2ddb
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode2.s
@@ -0,0 +1,299 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_mode2_neon.s
+@*
+@* @brief
+@* contains function definitions for intra prediction mode 2 filtering.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for mode 2
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode2(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+@ pi1_coeff
+
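+@ Mode 2 is the fully diagonal down-left direction, so every output sample is
+@ a plain copy from the reference array, stepping one pair further back per
+@ row and per column. A hedged C sketch (illustrative only) of what the
+@ vld2/vrev64/vst2 reversal loops below implement:
+@
+@ for(row = 0; row < nt; row++)
+@     for(col = 0; col < nt; col++)
+@     {
+@         pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * (two_nt - row - col - 2)];
+@         pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * (two_nt - row - col - 2) + 1];
+@     }
+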
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode2_a9q
+
+.type ihevc_intra_pred_chroma_mode2_a9q, %function
+
+ihevc_intra_pred_chroma_mode2_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ mov r8,#-4
+
+ cmp r4,#4
+ beq mode2_4
+
+ add r0,r0,r4,lsl #2
+
+ sub r0,r0,#0x12 @src[1]
+ add r10,r0,#-2
+
+prologue_cpy_32:
+
+ vld2.8 {d0,d1},[r0],r8
+
+ mov r11,r4
+ vrev64.8 d16,d0
+ vrev64.8 d17,d1
+
+ vld2.8 {d2,d3},[r10],r8
+ mov r6, r2
+
+ vld2.8 {d4,d5},[r0],r8
+ vld2.8 {d6,d7},[r10],r8
+ lsr r1, r4, #3
+
+ vld2.8 {d8,d9},[r0],r8
+ vld2.8 {d10,d11},[r10],r8
+ vld2.8 {d12,d13},[r0],r8
+ mul r1, r4, r1
+
+ vld2.8 {d14,d15},[r10],r8
+ add r7,r6,r3
+
+ vrev64.8 d18,d2
+ vrev64.8 d19,d3
+ lsl r5, r3, #2
+
+ vrev64.8 d20,d4
+ vrev64.8 d21,d5
+ add r9,r7,r3
+
+ vrev64.8 d22,d6
+ vrev64.8 d23,d7
+
+ vrev64.8 d24,d8
+ vrev64.8 d25,d9
+
+ vrev64.8 d26,d10
+ subs r1,r1,#8
+
+ vrev64.8 d27,d11
+
+ vrev64.8 d28,d12
+ vrev64.8 d29,d13
+
+ vrev64.8 d30,d14
+ add r14,r9,r3
+ vrev64.8 d31,d15
+
+ beq epilogue_mode2
+
+ sub r12,r4,#8
+
+kernel_mode2:
+
+ vst2.8 {d16,d17},[r6],r5
+ vst2.8 {d18,d19},[r7],r5
+ subs r11,r11,#8
+ vst2.8 {d20,d21},[r9],r5
+ vst2.8 {d22,d23},[r14],r5
+ vst2.8 {d24,d25},[r6],r5
+ addgt r2,r2,#16
+ vst2.8 {d26,d27},[r7],r5
+ vst2.8 {d28,d29},[r9],r5
+ vst2.8 {d30,d31},[r14],r5
+
+ vld2.8 {d0,d1},[r0],r8
+ movle r11,r4
+
+ vld2.8 {d2,d3},[r10],r8
+ vld2.8 {d4,d5},[r0],r8
+ addle r2, r2, r3, lsl #2
+ vld2.8 {d6,d7},[r10],r8
+ vrev64.8 d16,d0
+
+ vld2.8 {d8,d9},[r0],r8
+ vld2.8 {d10,d11},[r10],r8
+ suble r2, r6,#16
+ vld2.8 {d12,d13},[r0],r8
+ vrev64.8 d17,d1
+ vld2.8 {d14,d15},[r10],r8
+
+ subs r12,r12,#8
+ mov r6, r2
+ addle r0, r0, r4,lsl #1
+ add r7, r6, r3
+
+ vrev64.8 d18,d2
+ suble r0, r0, #16
+ vrev64.8 d19,d3
+
+ vrev64.8 d20,d4
+ movle r12,r4
+ vrev64.8 d21,d5
+
+ vrev64.8 d22,d6
+ add r9, r7, r3
+ vrev64.8 d23,d7
+
+ vrev64.8 d24,d8
+ add r10,r0,#-2
+ vrev64.8 d25,d9
+
+ vrev64.8 d26,d10
+ subs r1, r1, #8
+ vrev64.8 d27,d11
+
+ vrev64.8 d28,d12
+ vrev64.8 d29,d13
+
+ vrev64.8 d30,d14
+ add r14, r9, r3
+ vrev64.8 d31,d15
+
+ bne kernel_mode2
+
+epilogue_mode2:
+
+ vst2.8 {d16,d17},[r6],r5
+ vst2.8 {d18,d19},[r7],r5
+ vst2.8 {d20,d21},[r9],r5
+ vst2.8 {d22,d23},[r14],r5
+ vst2.8 {d24,d25},[r6],r5
+ vst2.8 {d26,d27},[r7],r5
+ vst2.8 {d28,d29},[r9],r5
+ vst2.8 {d30,d31},[r14],r5
+
+ b end_func
+
+mode2_4:
+
+ lsl r12,r4,#1
+ add r0,r0,r12
+ sub r0,r0,#2
+
+ vld2.8 {d12,d13},[r0],r8
+ vshl.i64 d0,d12,#32
+ add r10,r0,#2
+ vshl.i64 d1,d13,#32
+
+ vrev64.8 d0,d0
+ vld2.8 {d14,d15},[r10],r8
+ vshl.i64 d2,d14,#32
+
+ vrev64.8 d1,d1
+ vshl.i64 d3,d15,#32
+ vzip.8 d0,d1
+ vst1.8 {d0},[r2],r3
+
+ vrev64.8 d2,d2
+ vld2.8 {d16,d17},[r0],r8
+ vshl.i64 d4,d16,#32
+ vrev64.8 d3,d3
+ vshl.i64 d5,d17,#32
+ vzip.8 d2,d3
+ vrev64.8 d4,d4
+ vrev64.8 d5,d5
+ vst1.8 {d2},[r2],r3
+
+
+ vld2.8 {d18,d19},[r10],r8
+ vshl.i64 d6,d18,#32
+
+ vzip.8 d4,d5
+ vshl.i64 d7,d19,#32
+ vrev64.8 d6,d6
+ vst1.8 {d4},[r2],r3
+
+ vrev64.8 d7,d7
+ vzip.8 d6,d7
+ vst1.8 {d6},[r2],r3
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
new file mode 100644
index 0000000..b0dd1fa
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s
@@ -0,0 +1,190 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_mode_18_34_neon.s
+@*
+@* @brief
+@* contains function definitions for intra prediction mode 18 and 34 filtering.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for modes 18 and 34
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode_18_34(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+@ pi1_coeff
+
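+@ Modes 18 and 34 are the pure 45-degree directions, so each row is an
+@ unfiltered copy of nt reference pairs with the read pointer stepping one
+@ pair per row: forwards for mode 34, backwards for mode 18 (the +2/-2 put
+@ into r6/r8 below). A hedged C sketch, illustrative only ('start' is an
+@ assumed mode-dependent offset, not a name from these sources):
+@
+@ uword8 *pu1_ref_tmp = pu1_ref + start;
+@ word32 step = (mode == 34) ? 2 : -2;
+@ for(row = 0; row < nt; row++)
+@ {
+@     memcpy(pu1_dst + row * dst_strd, pu1_ref_tmp, 2 * nt);
+@     pu1_ref_tmp += step;
+@ }
+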
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_18_34_a9q
+
+.type ihevc_intra_pred_chroma_mode_18_34_a9q, %function
+
+ihevc_intra_pred_chroma_mode_18_34_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+
+ ldr r4,[sp,#40]
+ ldr r5,[sp,#44]
+
+ cmp r4,#4
+ beq mode2_4
+
+ mov r12,r4
+ mov r11,r4
+ add r0,r0,r4,lsl #2
+
+ cmp r5,#0x22
+ mov r10,r2
+
+ add r0,r0,#4
+
+ subne r0,r0,#4
+ moveq r6,#2
+ movne r6,#-2
+ mov r8,r0
+
+
+kernel:
+
+
+ vld1.8 {d0,d1},[r8],r6
+ vst1.8 {d0,d1},[r10],r3
+ vld1.8 {d2,d3},[r8],r6
+ vst1.8 {d2,d3},[r10],r3
+ vld1.8 {d4,d5},[r8],r6
+ vst1.8 {d4,d5},[r10],r3
+ vld1.8 {d6,d7},[r8],r6
+ vst1.8 {d6,d7},[r10],r3
+ vld1.8 {d8,d9},[r8],r6
+ vst1.8 {d8,d9},[r10],r3
+ vld1.8 {d10,d11},[r8],r6
+ vst1.8 {d10,d11},[r10],r3
+ vld1.8 {d12,d13},[r8],r6
+ vst1.8 {d12,d13},[r10],r3
+ vld1.8 {d14,d15},[r8],r6
+ vst1.8 {d14,d15},[r10],r3
+
+ subs r12,r12,#8
+ bne kernel
+
+ cmp r11,#16
+ add r8,r0,#16
+ add r10,r2,#16
+ sub r11,#16
+ mov r12,#16
+ beq kernel
+ b end_func
+
+mode2_4:
+
+ add r0,r0,#20
+ cmp r5,#0x22
+ subne r0,r0,#4
+
+ moveq r8,#2
+ movne r8,#-2
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0},[r2],r3
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0},[r2],r3
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0},[r2],r3
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0},[r2],r3
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
new file mode 100644
index 0000000..f2431e1
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -0,0 +1,542 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_mode_27_to_33.s
+@*
+@* @brief
+@* contains function definition for intra prediction interpolation filters
+@*
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_intra_pred_chroma_mode_27_to_33()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* intraprediction for modes 27 to 33 (positive angle, vertical mode) with
+@* reference neighboring samples location pointed by 'pu1_ref' to the tu
+@* block location pointed by 'pu1_dst'
+@*
+@* @par description:
+@*
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[in] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* integer transform block size
+@*
+@* @param[in] mode
+@* integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@.if intra_pred_chroma_27_to_33 == c
+@void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+
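+@ For the positive vertical angles the inline comments below spell out the
+@ per-sample math; as a hedged C sketch (illustrative only, per chroma plane,
+@ with ref_main assumed to point at the top reference row):
+@
+@ word32 pos   = (row + 1) * intra_pred_ang;
+@ word32 idx   = pos >> 5;                  @ whole-sample offset
+@ word32 fract = pos & 31;                  @ 1/32-pel fraction
+@ pu1_dst[row * dst_strd + col] =
+@     (uword8)(((32 - fract) * ref_main[idx + col]
+@               + fract * ref_main[idx + col + 1] + 16) >> 5);
+@
+@ the +16 and >> 5 correspond to the rounding vrshrn.i16 #5 used below.
+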
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_27_to_33_a9q
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+
+.type ihevc_intra_pred_chroma_mode_27_to_33_a9q, %function
+
+ihevc_intra_pred_chroma_mode_27_to_33_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r5,[sp,#44] @loads mode
+ ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
+ulbl1:
+ add r6,r6,pc
+
+ lsl r7,r4,#2 @four_nt
+
+ add r8,r6,r5,lsl #2 @*gai4_ihevc_ang_table[mode]
+ ldr r9,[r8] @intra_pred_ang = gai4_ihevc_ang_table[mode]
+ ldr r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+ add r1,r1,pc
+ add r6,r1,#1
+
+ tst r4,#7
+ add r8,r0,r7 @pu1_ref + four_nt
+ mov lr,#0 @row
+ mov r12,r4
+ bne core_loop_4
+ lsl r4,r4,#1
+ b core_loop_8
+
+core_loop_8:
+ add r8,r8,#2 @pu1_ref_main_idx += (four_nt + 1)
+ vdup.8 d0,r9 @intra_pred_ang
+ mov r12,r4,lsr #4 @divide by 8 (r4 holds 2*nt here)
+
+ vmov.i8 d1,#32
+ mul r7,r4,r12
+
+ vmov.i16 q3,#31
+
+ mov r1,r8
+ mov r5,r4
+ mov r11,#2
+
+prologue:
+ vld1.8 {d3},[r6] @loads the row value
+ vmull.u8 q1,d3,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.u16 d5,q1,#5 @idx = pos >> 5
+
+ vdup.8 d31,d4[0]
+ add r0,r2,r3
+
+ vmov.u32 lr,d5[0] @(i row)extract idx to the r register
+ lsl lr,lr,#1
+
+ vdup.8 d29,d4[1] @(ii)
+ and r9,lr,#0xff @(i row) get the last byte
+
+ add r10,r8,r9 @(i row)*pu1_ref[ref_main_idx]
+
+ asr lr,lr,#8 @(ii)shift by 8
+ vld1.8 {d8},[r10],r11 @(i row)ref_main_idx
+ and r9,lr,#0xff @(ii)get the last byte
+
+ asr lr,lr,#8 @(iii)
+ vld1.8 {d9},[r10] @(i row)ref_main_idx_1
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ and r9,lr,#0xff @(iii)
+ vsub.u8 d30,d1,d31 @32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr lr,lr,#8 @(iv)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ and r9,lr,#0xff @(iv)
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+
+ vdup.8 d31,d4[4] @(v)
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d5[1] @extract idx to the r register
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ lsl lr,lr,#1
+
+ vst1.8 {d10},[r2]! @(i row)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ and r9,lr,#0xff @(v)
+ vdup.8 d29,d4[5] @(vi)
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+
+ asr lr,lr,#8 @(vi)
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and r9,lr,#0xff @(vi)
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+ vdup.8 d27,d4[6] @(vii)
+ asr lr,lr,#8 @(vii)
+
+ and r9,lr,#0xff @(vii)
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ asr lr,lr,#8 @(viii)
+ vdup.8 d25,d4[7] @(viii)
+ and r9,lr,#0xff @(viii)
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs r7,r7,#8
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ addgt r8,r8,#8
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subgt r4,r4,#8
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmull.u8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.u16 d3,q1,#5 @idx = pos >> 5
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ lsl lr,lr,#1
+ and r9,lr,#0xff @(i)
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+ asr lr,lr,#8 @(ii)
+ vdup.8 d31,d4[0]
+ subs r4,r4,#8
+
+ vld1.8 {d8},[r10],r11 @(i)ref_main_idx
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+ and r9,lr,#0xff @(ii)
+ addle r6,r6,#8 @increment the row value
+
+ vld1.8 {d9},[r10] @(i)ref_main_idx_1
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr lr,lr,#8 @(iii)
+
+ vdup.8 d29,d4[1] @(ii)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+ and r9,lr,#0xff @(iii)
+
+ vst1.8 {d14},[r0],r3 @(vi)
+ vsub.u8 d30,d1,d31 @(i)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ asr lr,lr,#8 @(iv)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and r9,lr,#0xff @(iv)
+
+ vmov.u32 lr,d3[1] @extract idx to the r register
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ movle r4,r5 @reload nt
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.u8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+
+ vst1.8 {d22},[r0] @(viii)
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ lsl lr,lr,#1
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add r0,r2,r3
+
+ vdup.8 d31,d4[4] @(v)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+ and r9,lr,#0xff @(v)
+
+ vst1.8 {d10},[r2]! @(i)
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+
+ vdup.8 d29,d4[5] @(vi)
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ asr lr,lr,#8 @(vi)
+
+ vdup.8 d27,d4[6] @(vii)
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and r9,lr,#0xff @(vi)
+
+ vdup.8 d25,d4[7] @(viii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ asr lr,lr,#8 @(vii)
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vshrn.u16 d3,q1,#5 @idx = pos >> 5
+ and r9,lr,#0xff @(vii)
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+ asr lr,lr,#8 @(viii)
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and r9,lr,#0xff @(viii)
+
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+ movle r8,r1 @reload the source to pu1_src+2nt
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ addgt r8,r8,#8 @increment the source next set 8 columns in same row
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+ lslle r12,r3,#3
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ suble r12,r12,r5
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ addle r2,r2,r12 @increment the dst pointer to 8*dst_strd - nt
+
+ vmovn.i16 d4,q2
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+ lsl lr,lr,#1
+
+ and r9,lr,#0xff @(i)
+ subs r7,r7,#8
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ vst1.8 {d14},[r0],r3 @(vi)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.8 {d22},[r0],r3 @(viii)
+ b end_loops
+
+core_loop_4:
+ add r10,r8,#2 @pu1_ref_main_idx += (four_nt + 1)
+ add r11,r8,#4 @pu1_ref_main_idx_1 += (four_nt + 2)
+ mov r8,#0
+
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#2 @pu1_ref_main_idx += 2
+ add r11,r10,#2 @pu1_ref_main_idx_1 += 2
+ vdup.8 d0,r5 @dup_const_fract
+ rsb r4,r5,#32
+ vdup.8 d1,r4 @dup_const_32_fract
+
+@inner_loop_4
+ vld1.8 {d2},[r10] @ref_main_idx
+ add r8,r8,#1
+ mov lr,r5 @fract_prev = fract
+
+ vld1.8 {d3},[r11] @ref_main_idx_1
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#2 @pu1_ref_main_idx += 1
+ add r11,r10,#2 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d6,r5 @dup_const_fract
+ vmull.u8 q2,d2,d1 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d7,r4 @dup_const_32_fract
+ vmlal.u8 q2,d3,d0 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d8},[r10] @ref_main_idx
+ add r8,r8,#1
+
+ vld1.8 {d9},[r11] @ref_main_idx_1
+ vrshrn.i16 d4,q2,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ mov lr,r5 @fract_prev = fract
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#2 @pu1_ref_main_idx += 1
+ add r11,r10,#2 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d12,r5 @dup_const_fract
+ vmull.u8 q5,d8,d7 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d13,r4 @dup_const_32_fract
+ vmlal.u8 q5,d9,d6 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d14},[r10] @ref_main_idx
+ add r8,r8,#1
+
+ vst1.8 {d4},[r2],r3
+ vrshrn.i16 d10,q5,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d15},[r11] @ref_main_idx_1
+ mov lr,r5 @fract_prev = fract
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#2 @pu1_ref_main_idx += 1
+ add r11,r10,#2 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d18,r5 @dup_const_fract
+ vmull.u8 q8,d14,d13 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d19,r4 @dup_const_32_fract
+ vmlal.u8 q8,d15,d12 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d20},[r10] @ref_main_idx
+
+ vst1.8 {d10},[r2],r3
+ vrshrn.i16 d16,q8,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+ vld1.8 {d21},[r11] @ref_main_idx_1
+
+ vmull.u8 q11,d20,d19 @vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d18 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d16},[r2],r3
+ vrshrn.i16 d22,q11,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.8 {d22},[r2],r3
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
new file mode 100644
index 0000000..a5eb3ca
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -0,0 +1,497 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_mode_3_to_9.s
+@*
+@* @brief
+@* contains function definitions for intra prediction mode 3 to 9 filtering.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for modes 3 to 9
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+
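+@ Modes 3 to 9 are the steep horizontal angles; the math mirrors the vertical
+@ case but walks down the left reference edge, with the fraction driven by
+@ the column. A hedged C sketch (illustrative only; the real index
+@ bookkeeping is done with the vtbl lookups below):
+@
+@ word32 pos   = (col + 1) * intra_pred_ang;
+@ word32 idx   = pos >> 5;
+@ word32 fract = pos & 31;
+@ @ per chroma plane, reading down the left neighbours:
+@ dst[row * dst_strd + col] =
+@     ((32 - fract) * ref[two_nt - row - idx - 1]
+@      + fract * ref[two_nt - row - idx] + 16) >> 5;
+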
+.text
+.align 4
+
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_3_9
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+
+idx_neg_idx_chroma_3_9_addr:
+.long idx_neg_idx_chroma_3_9 - ulbl3 - 8
+
+col_for_intra_chroma_addr_1:
+.long col_for_intra_chroma - ulbl4 - 8
+
+col_for_intra_chroma_addr_2:
+.long col_for_intra_chroma - ulbl5 - 8
+
+col_for_intra_chroma_addr_3:
+.long col_for_intra_chroma - ulbl6 - 8
+
+.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function
+
+ihevc_intra_pred_chroma_mode_3_to_9_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r7, gai4_ihevc_ang_table_addr
+ulbl1:
+ add r7,r7,pc
+
+ ldr r5,[sp,#44] @mode (3 to 9)
+ ldr r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+ add r8,r8,pc
+
+ add r7, r7, r5, lsl #2 @gai4_ihevc_ang_table[mode]
+ ldr r7, [r7] @intra_pred_ang
+ vdup.8 d30, r7 @intra_pred_ang
+
+ ldr r14, col_for_intra_chroma_addr_1
+ulbl4:
+ add r14,r14,pc
+
+prologue_8_16_32:
+ lsr r10, r4, #3
+ vld1.8 d31, [r14]!
+ mul r10, r4, r10 @block counter (dec by #8)
+
+ mov r11, r4, lsl #1 @col counter to be inc/dec by #8
+ vmull.s8 q11, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+
+ sub r7, r5, #3
+ ldr r12, idx_neg_idx_chroma_3_9_addr @load most idx table
+ulbl3:
+ add r12,r12,pc
+
+ add r12, r12, r7, lsl #4
+ mov r8, r12
+
+ mov r7, #8
+ sub r7, r7, r3, lsl #3 @r7 = 8-8r3
+
+ ldr r9, [r8]
+ mov r9, r9, lsl #1
+ add r1, r0, r4, lsl #2 @pu1_ref + 4*nt
+
+ vmovn.s16 d6, q11
+ vdup.8 d26, r9 @most idx added to final idx values
+ sub r1, r1, #26 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+
+ sub r6, r1, r9
+
+ vld1.8 {d0,d1,d2,d3}, [r6] @stores the 32 values reqd based on indices values (from most idx)
+ vshr.s16 q11, q11, #5
+
+ vmov.i8 d29, #31 @contains #31 for vand operation
+
+ vmov.i8 d28, #32
+
+ vqmovn.s16 d8, q11
+ vshl.s8 d8, d8, #1 @ 2 * idx
+
+ vand d6, d6, d29 @fract values in d1/ idx values in d0
+ vmov.i8 d29, #2 @contains #2 for adding to get ref_main_idx + 1
+
+ movw r0,#0x302 @ idx value for v is +1 of u
+ vdup.u16 d27,r0
+ mov r0,#0
+
+ vmov.i8 d9, #22 @row 0 to 7
+
+ vsub.s8 d8, d8, d27 @ref_main_idx (sub row)
+ vsub.s8 d8, d26, d8 @ref_main_idx (row 0)
+ vadd.s8 d8, d8, d9 @to compensate the pu1_src idx incremented by 8
+ vsub.s8 d9, d8, d29 @ref_main_idx + 1 (row 0)
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 0)
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 0)
+ vsub.s8 d4, d8, d29 @ref_main_idx (row 1)
+ vsub.s8 d5, d9, d29 @ref_main_idx + 1 (row 1)
+
+ vmov.i8 d29, #4 @row step: two rows at a time (indices are doubled)
+
+ vtbl.8 d16, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 1)
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 1)
+ vsub.s8 d8, d8, d29 @ref_main_idx (row 2)
+ vsub.s8 d9, d9, d29 @ref_main_idx + 1 (row 2)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 2)
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 2)
+ vsub.s8 d4, d4, d29 @ref_main_idx (row 3)
+ vsub.s8 d5, d5, d29 @ref_main_idx + 1 (row 3)
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 3)
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 3)
+ vsub.s8 d8, d8, d29 @ref_main_idx (row 4)
+ vsub.s8 d9, d9, d29 @ref_main_idx + 1 (row 4)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 4)
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 4)
+ vsub.s8 d4, d4, d29 @ref_main_idx (row 5)
+ vsub.s8 d5, d5, d29 @ref_main_idx + 1 (row 5)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+ vtbl.8 d16, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 5)
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 5)
+ vsub.s8 d8, d8, d29 @ref_main_idx (row 6)
+ vsub.s8 d9, d9, d29 @ref_main_idx + 1 (row 6)
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ cmp r4,#4
+ beq end_func
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 6)
+ vmull.u8 q11, d16, d7 @mul (row 5)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 6)
+ vsub.s8 d4, d4, d29 @ref_main_idx (row 7)
+ vsub.s8 d5, d5, d29 @ref_main_idx + 1 (row 7)
+
+ vst1.8 d24, [r2], r3 @st (row 4)
+ vrshrn.i16 d22, q11, #5 @round shft (row 5)
+
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 7)
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 7)
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d22, [r2], r3 @st (row 5)
+ vrshrn.i16 d20, q10, #5 @round shft (row 6)
+ vrshrn.i16 d18, q9, #5 @round shft (row 7)
+
+ vst1.8 d20, [r2], r3 @st (row 6)
+
+ subs r10, r10, #4 @subtract 8 and go to end if 8x8
+
+ vst1.8 d18, [r2], r3 @st (row 7)
+
+ beq end_func
+
+ subs r11, r11, #8 @decrement the processed col
+ addgt r8, r8, #4
+ addgt r2, r2, r7
+ movle r8, r12
+ suble r2, r2, r4
+ addle r2, r2, #8
+ movle r11, r4, lsl #1
+ ldrle r14, col_for_intra_chroma_addr_2
+ulbl5:
+ addle r14,r14,pc
+ addle r0, r0, #8
+
+ vld1.8 d31, [r14]!
+ vmull.s8 q6, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ vmovn.s16 d10, q6
+ vshr.s16 q6, q6, #5
+ vqmovn.s16 d11, q6
+ vshl.s8 d11, d11, #1
+ movw r5, #0x302 @idx value for v is +1 of u
+ vdup.u16 d27, r5 @row value inc or reset accordingly
+ ldr r9, [r8] @loads index value
+ mov r9, r9, lsl #1
+ mov r5, #22
+ sub r5, r5, r0, lsl #1
+ vdup.8 d16, r5
+ vdup.8 d26, r9
+
+ mov r5,r2
+ vsub.s8 d11, d11, d27 @ref_main_idx (sub row)
+
+kernel_8_16_32:
+ vmov.i8 d29, #2 @contains #2 for adding to get ref_main_idx + 1
+ vsub.s8 d8, d26, d11 @ref_main_idx
+ vmov d26,d10
+
+ subs r11, r11, #8
+ sub r6, r1, r9
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 7)
+ vadd.s8 d8, d8, d16 @to compensate the pu1_src idx incremented by 8
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx - 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ addle r0, r0, #8
+ vsub.s8 d9, d8, d29 @ref_main_idx - 2
+ addgt r8, r8, #4
+
+ vld1.8 {d0,d1,d2,d3}, [r6] @stores the 32 values reqd based on indices values (from most idx)
+ vrshrn.i16 d22, q11, #5 @round shft (row 5)
+
+ ldrle r14, col_for_intra_chroma_addr_3
+ulbl6:
+ addle r14,r14,pc
+ vst1.8 d24, [r5], r3 @st (row 4)
+ movle r8, r12
+
+ movw r9,#0x302
+ vdup.16 d27, r9 @row value inc or reset accordingly
+ vsub.s8 d4, d8, d29 @ref_main_idx (row 1)
+
+ vsub.s8 d5, d9, d29 @ref_main_idx - 1 (row 1)
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 0)
+ vmov.i8 d29, #31 @contains #31 for the vand fract mask
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 0)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vld1.8 d31, [r14]!
+ vand d6, d29, d26 @fract values in d1/ idx values in d0
+
+ movle r11, r4, lsl #1
+ vmov.i8 d29, #4 @contains #4 to step the row indices by two rows
+ ldr r9, [r8]
+
+ vst1.8 d22, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vsub.s8 d8, d8, d29 @ref_main_idx (row 2)
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 1)
+ vsub.s8 d9, d9, d29 @ref_main_idx - 1 (row 2)
+
+ mov r9,r9,lsl #1
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 1)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vsub.s8 d4, d4, d29 @ref_main_idx (row 3)
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 2)
+ vsub.s8 d5, d5, d29 @ref_main_idx - 1 (row 3)
+
+ vmull.u8 q11, d10, d7 @mul (row 1)
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 2)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+ vst1.8 d18, [r5], r3 @(from previous loop)st (row 7)
+
+ vsub.s8 d8, d8, d29 @ref_main_idx (row 4)
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 3)
+ vsub.s8 d9, d9, d29 @ref_main_idx - 1 (row 4)
+
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 3)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ add r5,r2,r3,lsl#2
+ vmull.s8 q7, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ add r9, r9, r0, lsl #1
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vsub.s8 d4, d4, d29 @ref_main_idx (row 5)
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 4)
+ vsub.s8 d5, d5, d29 @ref_main_idx - 1 (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 4)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vmovn.s16 d10, q7
+ vshr.s16 q7, q7, #5
+
+ vsub.s8 d8, d8, d29 @ref_main_idx (row 6)
+ vtbl.8 d21, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 5)
+ vsub.s8 d9, d9, d29 @ref_main_idx - 1 (row 6)
+
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 5)
+ vqmovn.s16 d11, q7
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+ vdup.8 d26, r9
+
+ vsub.s8 d4, d4, d29 @ref_main_idx (row 7)
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 6)
+ vsub.s8 d5, d5, d29 @ref_main_idx - 1 (row 7)
+
+ mov r6, #22 @to compensate the 2*row value
+ vshl.u8 d11,#1
+ sub r6, r6, r0, lsl #1
+
+ vmull.u8 q11, d21, d7 @mul (row 5)
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 6)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+ add r2,r2,r3, lsl #2
+ vdup.8 d16, r6
+ addgt r2, r7, r2
+
+ suble r2, r2, r4
+ vsub.s8 d11, d11, d27 @ref_main_idx (add row)
+ suble r2,r2,#8
+
+    subs        r10, r10, #4                @subtract 4 and go to end if 8x8
+
+ bne kernel_8_16_32
+
+epil_8_16_32:
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 7)
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d24, q11, #5 @round shft (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d24, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vst1.8 d18, [r5], r3 @st (row 7)
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_planar.s b/common/arm/ihevc_intra_pred_chroma_planar.s
new file mode 100644
index 0000000..30b3144
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_planar.s
@@ -0,0 +1,363 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_planar.s
+@*
+@* @brief
+@* contains function definitions for planar intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for planar input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode,
+@ word32 pi1_coeff)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+@ pi1_coeff
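+
+@A hedged scalar sketch, written as C inside comments (not assembled), of
+@the planar equation this routine vectorizes; names mirror the register
+@comments below, and for chroma every pu1_ref offset is doubled because
+@the u and v samples are interleaved:
+@
+@    for (row = 0; row < nt; row++)
+@        for (col = 0; col < nt; col++)
+@            pu1_dst[row * dst_strd + col] =
+@                ((nt - 1 - col) * pu1_ref[2 * nt - 1 - row]
+@               + (col + 1)      * pu1_ref[3 * nt + 1]
+@               + (nt - 1 - row) * pu1_ref[2 * nt + 1 + col]
+@               + (row + 1)      * pu1_ref[nt - 1]
+@               + nt) >> (log2nt + 1);  /* log2nt = log2(nt); matches the
+@                                          shr value computed from clz below */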
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_planar_a9q
+.extern gau1_ihevc_planar_factor
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl1 - 8
+
+.type ihevc_intra_pred_chroma_planar_a9q, %function
+
+ihevc_intra_pred_chroma_planar_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
+ulbl1:
+ add r11,r11,pc
+
+ clz r5, r4
+ rsb r5, r5, #32
+ vdup.16 q7, r5
+ vneg.s16 q7, q7 @shr value (so vneg)
+ vdup.8 d2, r4 @nt
+ vdup.s16 q8, r4 @nt
+
+ sub r6, r4, #1 @nt-1
+ add r6, r0,r6,lsl #1 @2*(nt-1)
+ ldr r7, [r6]
+ vdup.s16 d0, r7 @src[nt-1]
+
+ add r6, r4, r4,lsl #1 @3nt
+ add r6, r6, #1 @3nt + 1
+ lsl r6,r6,#1 @2*(3nt + 1)
+
+ add r6, r6, r0
+ ldr r7, [r6]
+ vdup.s16 d1, r7 @src[3nt+1]
+
+
+ add r6, r4, r4 @2nt
+ add r14, r6, #1 @2nt+1
+ lsl r14,#1 @2*(2nt+1)
+ sub r6, r6, #1 @2nt-1
+ lsl r6,#1 @2*(2nt-1)
+ add r6, r6, r0 @&src[2nt-1]
+ add r14, r14, r0 @&src[2nt+1]
+
+ mov r8, #1 @row+1 (row is first 0)
+ sub r9, r4, r8 @nt-1-row (row is first 0)
+
+ vdup.s8 d5, r8 @row + 1
+ vdup.s8 d6, r9 @nt - 1 - row
+    vmov        d7, d5                      @mov #1 to d7 to be used for inc for row+1 and dec for nt-1-row
+
+ add r12, r11, #1 @coeffs (to be reloaded after every row)
+ mov r1, r4 @nt (row counter) (dec after every row)
+ mov r5, r2 @dst (to be reloaded after every row and inc by dst_strd)
+ mov r10, #8 @increment for the coeffs
+ mov r0, r14 @&src[2nt+1] (to be reloaded after every row)
+
+ cmp r4, #4
+ beq tf_sz_4
+
+
+
+ mov r10,r6
+tf_sz_8_16:
+ vld1.s8 {d10,d11}, [r14]! @load src[2nt+1+col]
+ vld1.s8 d8, [r12]!
+ vmov d9,d8
+ vzip.8 d8,d9
+ vsub.s8 d30, d2, d8 @[nt-1-col]
+ vsub.s8 d31, d2, d9
+
+
+
+
+loop_sz_8_16:
+
+ ldr r7, [r6], #-2 @src[2nt-1-row] (dec to take into account row)
+ vmull.u8 q6, d5, d0 @(row+1) * src[nt-1]
+ ldr r11, [r6], #-2 @src[2nt-1-row] (dec to take into account row)
+ vmlal.u8 q6, d6, d10 @(nt-1-row) * src[2nt+1+col]
+ vdup.s16 d4, r7 @src[2nt-1-row]
+ vmlal.u8 q6, d8, d1 @(col+1) * src[3nt+1]
+ vdup.s16 d3, r11 @src[2nt-1-row]
+ vmlal.u8 q6, d30, d4 @(nt-1-col) * src[2nt-1-row]
+
+
+
+ vmull.u8 q14,d5,d0
+ ldr r7, [r6], #-2 @src[2nt-1-row] (dec to take into account row)
+ vmlal.u8 q14,d6,d11
+ vadd.s8 d18, d5, d7 @row++ [(row+1)++]c
+
+
+ vmlal.u8 q14,d31,d4
+ vsub.s8 d19, d6, d7 @[nt-1-row]--
+ vmlal.u8 q14,d9,d1
+ vdup.s16 d4, r7 @src[2nt-1-row]
+
+ vmull.u8 q13, d18, d0 @(row+1) * src[nt-1]
+ vadd.i16 q6, q6, q8 @add (nt)
+ vmlal.u8 q13, d19, d10 @(nt-1-row) * src[2nt+1+col]
+ vshl.s16 q6, q6, q7 @shr
+ vmlal.u8 q13, d8, d1 @(col+1) * src[3nt+1]
+ vadd.i16 q14,q14,q8
+ vmlal.u8 q13, d30, d3 @(nt-1-col) * src[2nt-1-row]
+ vshl.s16 q14,q14,q7
+
+
+
+
+
+ vmull.u8 q12,d18,d0
+ vadd.s8 d5, d18, d7 @row++ [(row+1)++]
+ vmlal.u8 q12,d19,d11
+ vsub.s8 d6, d19, d7 @[nt-1-row]--
+ vmlal.u8 q12,d9,d1
+ vmovn.i16 d12, q6
+ vmlal.u8 q12,d31,d3
+ vmovn.i16 d13,q14
+
+
+
+
+ vadd.i16 q13, q13, q8 @add (nt)
+ vmull.u8 q11, d5, d0 @(row+1) * src[nt-1]
+ vshl.s16 q13, q13, q7 @shr
+ vmlal.u8 q11, d6, d10 @(nt-1-row) * src[2nt+1+col]
+ vst1.s32 {d12,d13}, [r2], r3
+ vmlal.u8 q11, d8, d1 @(col+1) * src[3nt+1]
+ vadd.i16 q12,q12,q8
+ vmlal.u8 q11, d30, d4 @(nt-1-col) * src[2nt-1-row]
+ vshl.s16 q12,q12,q7
+
+ vmull.u8 q10,d5,d0
+ vadd.s8 d18, d5, d7 @row++ [(row+1)++]c
+ vmlal.u8 q10,d6,d11
+ vsub.s8 d19, d6, d7 @[nt-1-row]--
+ vmlal.u8 q10,d31,d4
+
+ ldr r11, [r6], #-2 @src[2nt-1-row] (dec to take into account row)
+ vmlal.u8 q10,d9,d1
+ vdup.s16 d3, r11 @src[2nt-1-row]
+ vadd.i16 q11, q11, q8 @add (nt)
+
+ vmull.u8 q6, d18, d0 @(row+1) * src[nt-1]
+ vmovn.i16 d26, q13
+ vmlal.u8 q6, d19, d10 @(nt-1-row) * src[2nt+1+col]
+ vmovn.i16 d27,q12
+
+ vmlal.u8 q6, d8, d1 @(col+1) * src[3nt+1]
+ vshl.s16 q11, q11, q7 @shr
+
+ vmlal.u8 q6, d30, d3 @(nt-1-col) * src[2nt-1-row]
+ vadd.i16 q10,q10,q8
+
+ vmull.u8 q14,d18,d0
+ vst1.s32 {d26,d27}, [r2], r3
+
+ vmlal.u8 q14,d19,d11
+ vadd.s8 d5, d18, d7 @row++ [(row+1)++]
+
+ vsub.s8 d6, d19, d7 @[nt-1-row]--
+ vmlal.u8 q14,d9,d1
+
+ vmlal.u8 q14,d31,d3
+ vshl.s16 q10,q10,q7
+
+
+ vadd.i16 q6, q6 ,q8 @add (nt)
+ vmovn.i16 d22, q11
+
+
+ vadd.i16 q14,q14,q8
+ vmovn.i16 d23,q10
+
+
+ vshl.s16 q6, q6, q7 @shr
+ vst1.s32 {d22,d23}, [r2], r3
+ vshl.s16 q14,q14,q7
+
+
+
+
+
+ vmovn.i16 d20, q6
+ vmovn.i16 d21,q14
+
+ vst1.s32 {d20,d21}, [r2], r3
+
+
+ subs r1, r1, #4
+
+ bne loop_sz_8_16
+
+
+
+
+ cmp r4,#16
+
+ bne end_loop
+
+
+ sub r4,#16
+ vdup.s8 d5, r8 @row + 1
+ vdup.s8 d6, r9 @nt - 1 - row
+    vmov        d7, d5                      @mov #1 to d7 to be used for inc for row+1 and dec for nt-1-row
+
+ mov r6,r10
+ mov r1,#16
+ sub r2,r2,r3,lsl #4
+ add r2,r2,#16
+
+ vld1.s8 {d10,d11}, [r14]! @load src[2nt+1+col]
+ vld1.s8 d8, [r12]!
+ vmov d9,d8
+ vzip.8 d8,d9
+ vsub.s8 d30, d2, d8 @[nt-1-col]
+ vsub.s8 d31, d2, d9
+
+ beq loop_sz_8_16
+
+
+
+tf_sz_4:
+ vld1.s8 d10, [r14] @load src[2nt+1+col]
+ vld1.s8 d8, [r12], r10 @load 8 coeffs [col+1]
+ vmov d9,d8
+ vzip.8 d8,d9
+loop_sz_4:
+ @mov r10, #4 @reduce inc to #4 for 4x4
+ ldr r7, [r6], #-2 @src[2nt-1-row] (dec to take into account row)
+ vdup.s16 d4, r7 @src[2nt-1-row]
+
+ vsub.s8 d9, d2, d8 @[nt-1-col]
+
+ vmull.u8 q6, d5, d0 @(row+1) * src[nt-1]
+ vmlal.u8 q6, d6, d10 @(nt-1-row) * src[2nt+1+col]
+ vmlal.u8 q6, d8, d1 @(col+1) * src[3nt+1]
+ vmlal.u8 q6, d9, d4 @(nt-1-col) * src[2nt-1-row]
+@ vadd.i16 q6, q6, q8 @add (nt)
+@ vshl.s16 q6, q6, q7 @shr
+@ vmovn.i16 d12, q6
+ vrshrn.s16 d12,q6,#3
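+@ for nt = 4, the generic add-nt-then-shift path commented out above
+@ collapses to this single rounding shift by 3 (vrshrn adds 4, then shifts)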
+
+ vst1.s32 {d12}, [r2], r3
+
+ vadd.s8 d5, d5, d7 @row++ [(row+1)++]
+ vsub.s8 d6, d6, d7 @[nt-1-row]--
+ subs r1, r1, #1
+
+ bne loop_sz_4
+
+end_loop:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_chroma_ver.s b/common/arm/ihevc_intra_pred_chroma_ver.s
new file mode 100644
index 0000000..b68a045
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_chroma_ver.s
@@ -0,0 +1,229 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_ver.s
+@*
+@* @brief
+@* contains function definitions for vertical intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for vertical input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
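+
+@A minimal C sketch in comments (not assembled) of vertical prediction for
+@the interleaved u/v layout handled below: every output row is a copy of
+@the reference row just above the block.
+@
+@    for (row = 0; row < nt; row++)
+@        for (col = 0; col < 2 * nt; col++)
+@            pu1_dst[row * dst_strd + col] = pu1_ref[2 * (2 * nt + 1) + col];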
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_ver_a9q
+
+.type ihevc_intra_pred_chroma_ver_a9q, %function
+
+ihevc_intra_pred_chroma_ver_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ lsl r5, r4, #2 @4nt
+
+
+ cmp r4, #8
+ beq blk_8
+ blt blk_4
+
+copy_16:
+ add r5, r5, #2 @2nt+2
+ add r6, r0, r5 @&src[2nt+1]
+
+ add r5, r2, r3 @pu1_dst + dst_strd
+ vld2.8 {d20,d21}, [r6]! @16 loads (col 0:15)
+ add r8, r5, r3
+
+ add r10, r8, r3
+ vld2.8 {d22,d23}, [r6] @16 loads (col 16:31)
+ lsl r11, r3, #2
+
+ add r11, r11, #0xfffffff0
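+@ 0xfffffff0 is -16: r11 = 4*dst_strd - 16, so together with the 16-byte
+@ post-increment of the first vst2 each row pointer nets 4*dst_strd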
+
+
+ vst2.8 {d20,d21}, [r2]!
+ vst2.8 {d20,d21}, [r5]!
+ vst2.8 {d20,d21}, [r8]!
+ vst2.8 {d20,d21}, [r10]!
+
+ vst2.8 {d22,d23}, [r2], r11
+ vst2.8 {d22,d23}, [r5], r11
+ vst2.8 {d22,d23}, [r8], r11
+ vst2.8 {d22,d23}, [r10], r11
+
+ subs r4, r4, #4
+
+kernel_copy_16:
+ vst2.8 {d20,d21}, [r2]!
+ vst2.8 {d20,d21}, [r5]!
+ vst2.8 {d20,d21}, [r8]!
+ vst2.8 {d20,d21}, [r10]!
+
+ vst2.8 {d22,d23}, [r2], r11
+ vst2.8 {d22,d23}, [r5], r11
+ vst2.8 {d22,d23}, [r8], r11
+ vst2.8 {d22,d23}, [r10], r11
+
+ subs r4, r4, #4
+
+
+ vst2.8 {d20,d21}, [r2]!
+ vst2.8 {d20,d21}, [r5]!
+ vst2.8 {d20,d21}, [r8]!
+ vst2.8 {d20,d21}, [r10]!
+
+ vst2.8 {d22,d23}, [r2], r11
+ vst2.8 {d22,d23}, [r5], r11
+ vst2.8 {d22,d23}, [r8], r11
+ vst2.8 {d22,d23}, [r10], r11
+
+ subs r4, r4, #4
+
+ vst2.8 {d20,d21}, [r2]!
+ vst2.8 {d20,d21}, [r5]!
+ vst2.8 {d20,d21}, [r8]!
+ vst2.8 {d20,d21}, [r10]!
+
+ vst2.8 {d22,d23}, [r2], r11
+ vst2.8 {d22,d23}, [r5], r11
+ vst2.8 {d22,d23}, [r8], r11
+ vst2.8 {d22,d23}, [r10], r11
+
+ subs r4, r4, #4
+ bne kernel_copy_16
+
+ b end_func
+
+blk_8:
+
+ add r5, r5, #2 @2nt+2
+ add r6, r0, r5 @&src[2nt+1]
+
+ add r5, r2, r3 @pu1_dst + dst_strd
+ vld2.8 {d20,d21}, [r6]! @16 loads (col 0:15)
+ add r8, r5, r3
+
+ add r10, r8, r3
+ vld2.8 {d22,d23}, [r6] @16 loads (col 16:31)
+
+ lsl r11,r3,#2
+
+ vst2.8 {d20,d21}, [r2],r11
+ vst2.8 {d20,d21}, [r5],r11
+ vst2.8 {d20,d21}, [r8],r11
+ vst2.8 {d20,d21}, [r10],r11
+
+ vst2.8 {d20,d21}, [r2]
+ vst2.8 {d20,d21}, [r5]
+ vst2.8 {d20,d21}, [r8]
+ vst2.8 {d20,d21}, [r10]
+
+ subs r4, r4, #8
+ beq end_func
+
+blk_4:
+
+ @lsl r5, r4, #2 @4nt
+ add r5, r5, #2 @2nt+2
+ add r6, r0, r5 @&src[2nt+1]
+
+ vld1.8 {d0},[r6]
+ add r5, r2, r3 @pu1_dst + dst_strd
+
+ vst1.8 {d0},[r2]
+ add r8, r5, r3
+ vst1.8 {d0},[r5]
+ add r10, r8, r3
+ vst1.8 {d0},[r8]
+ vst1.8 {d0},[r10]
+
+
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
new file mode 100644
index 0000000..6c882cf
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -0,0 +1,616 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_mode_11_to_17.s
+@*
+@* @brief
+@* contains function definitions for intra prediction chroma mode 11 to 17
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for modes 11 to 17
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
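+
+@A hedged C sketch in comments (not assembled) of the two-tap angular
+@filter evaluated below. For these near-horizontal modes the angle steps
+@along the left reference, which is first copied (reversed) into ref_temp;
+@u/v interleaving doubles every index in the actual code:
+@
+@    pos   = (col + 1) * intra_pred_ang;
+@    idx   = pos >> 5;                   /* whole-sample offset */
+@    fract = pos & 31;                   /* 1/32-sample phase   */
+@    pred  = ((32 - fract) * ref_main[idx]
+@             + fract * ref_main[idx + 1] + 16) >> 5;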
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_11_to_17_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_11_17
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+idx_neg_idx_chroma_11_17_addr:
+.long idx_neg_idx_chroma_11_17 - ulbl3 - 8
+
+col_for_intra_chroma_addr_1:
+.long col_for_intra_chroma - ulbl4 - 8
+
+col_for_intra_chroma_addr_2:
+.long col_for_intra_chroma - ulbl5 - 8
+
+col_for_intra_chroma_addr_3:
+.long col_for_intra_chroma - ulbl6 - 8
+
+.type ihevc_intra_pred_chroma_mode_11_to_17_a9q, %function
+
+ihevc_intra_pred_chroma_mode_11_to_17_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r7, gai4_ihevc_ang_table_addr
+ulbl1:
+ add r7,r7,pc
+
+ ldr r5,[sp,#44] @mode (11 to 17)
+ ldr r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+ add r8,r8,pc
+
+ add r7, r7, r5, lsl #2 @gai4_ihevc_ang_table[mode]
+ add r8, r8, r5, lsl #2 @gai4_ihevc_inv_ang_table[mode - 11]
+ sub r8, r8, #44
+
+ ldr r7, [r7] @intra_pred_ang
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
+
+ ldr r8, [r8] @inv_ang
+ add r6, sp, r4, lsl #1 @ref_temp + 2 * nt
+
+ mul r9, r4, r7 @nt*intra_pred_ang
+
+ sub r6, r6, #2 @ref_temp + 2*nt - 2
+
+ add r1, r0, r4, lsl #2 @r1 = &src[4nt]
+ vdup.8 d30, r7 @intra_pred_ang
+
+ mov r7, r4
+
+ sub r1,r1,#6 @address calculation for copying 4 halfwords
+
+ asr r9, r9, #5
+
+ vld1.8 d0,[r1]
+ vrev64.16 d0,d0
+ vst1.8 d0,[r6]!
+
+ sub r1,#8
+
+ subs r7, r7, #4
+ addeq r1,#8
+ beq end_loop_copy
+ subs r7,r7,#4
+ beq loop_copy_8
+ subs r7,r7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+    sub         r1,#16
+    vld1.8      {d4,d5,d6},[r1]             @24 src bytes just below the 8 copied above
+
+    sub         r1,#32
+    vld1.8      {d0,d1,d2,d3},[r1]          @remaining 32 src bytes (lowest addresses)
+
+ vrev64.16 d6,d6
+ vrev64.16 d5,d5
+ vrev64.16 d4,d4
+ vrev64.16 d3,d3
+ vrev64.16 d2,d2
+ vrev64.16 d1,d1
+ vrev64.16 d0,d0
+
+ vst1.8 d6,[r6]!
+ vst1.8 d5,[r6]!
+ vst1.8 d4,[r6]!
+ vst1.8 d3,[r6]!
+ vst1.8 d2,[r6]!
+ vst1.8 d1,[r6]!
+ vst1.8 d0,[r6]!
+
+ b end_loop_copy
+
+loop_copy_16:
+ sub r1,#16
+ vld1.8 {d0,d1,d2},[r1]
+
+ vrev64.16 d2,d2
+ vrev64.16 d1,d1
+ vrev64.16 d0,d0
+
+ vst1.8 d2,[r6]!
+ vst1.8 d1,[r6]!
+ vst1.8 d0,[r6]!
+
+ b end_loop_copy
+loop_copy_8:
+ vld1.8 d0,[r1]
+ vrev64.16 d0,d0
+ vst1.8 d0,[r6]!
+end_loop_copy:
+ sub r1,#2
+
+ ldrh r11, [r1], #-2
+ strh r11, [r6], #2
+
+ cmp r9, #-1
+ bge prologue_8_16_32
+
+ add r6, sp, r4, lsl #1 @ref_temp + 2 * nt
+ sub r6, r6, #4 @ref_temp + 2 * nt - 2 - 2
+
+ mov r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+ add r1, r0, r4, lsl #2 @r1 = &src[4nt]
+
+ mov r7, #128 @inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add r7, r7, r8 @inv_ang_sum += inv_ang
+
+ mov r0,r7, lsr #8
+ mov r0,r0, lsl #1
+
+ ldrh r11, [r1, r0]
+ strh r11, [r6], #-2
+
+ subs r9, r9, #1
+
+ bne loop_copy_ref_idx
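+
+@inv_ang is an 8.8 fixed-point reciprocal of the prediction angle, so
+@inv_ang_sum >> 8 above walks the projected positions along the other
+@reference edge, two bytes per step for the interleaved u/v samples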
+
+prologue_8_16_32:
+
+ ldr r14, col_for_intra_chroma_addr_1
+ulbl4:
+ add r14,r14,pc
+
+ lsr r10, r4, #3
+ vld1.8 d31, [r14]!
+ mul r10, r4, r10 @block counter (dec by #8)
+
+ mov r11, r4, lsl #1 @col counter to be inc/dec by #8
+ vmull.s8 q11, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+
+ sub r7, r5, #11
+ ldr r12, idx_neg_idx_chroma_11_17_addr @load least idx table
+ulbl3:
+ add r12,r12,pc
+
+ add r12, r12, r7, lsl #4
+ mov r8, r12
+
+ mov r7, #8
+ sub r7, r7, r3, lsl #3 @r7 = 8-8r3
+
+ ldr r9, [r8]
+ mov r9,r9,lsl #1
+ add r1, sp, r4, lsl #1 @ref_temp + 2nt
+
+ vmovn.s16 d6, q11
+ vdup.8 d26, r9 @least idx added to final idx values
+ sub r1, r1, #2 @ref_temp + 2nt - 2
+
+ add r6, r1, r9
+
+ vld1.8 {d0,d1,d2,d3}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+ vshr.s16 q11, q11, #5
+
+@ mov r0, #31
+ vmov.i8 d29, #31 @contains #31 for vand operation
+
+@ mov r0, #32
+ vmov.i8 d28, #32
+
+ vqmovn.s16 d8, q11
+ vshl.s8 d8, d8, #1 @ 2 * idx
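+@ idx doubled because u and v bytes alternate in the chroma reference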
+
+ vand d6, d6, d29 @fract values in d1/ idx values in d0
+
+@ mov r0, #2
+ vmov.i8 d29, #2 @contains #2 for adding to get ref_main_idx + 1
+
+ mov r0,#0x100 @ idx value for v is +1 of u
+ vdup.u16 d27,r0
+ vadd.u8 d27,d27,d29
+ mov r0,#0
+
+ vadd.s8 d8, d8, d27 @ref_main_idx (add row)
+ vsub.s8 d8, d8, d26 @ref_main_idx (row 0)
+ vadd.s8 d9, d8, d29 @ref_main_idx + 1 (row 0)
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 0)
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 0)
+ vadd.s8 d4, d8, d29 @ref_main_idx (row 1)
+ vadd.s8 d5, d9, d29 @ref_main_idx + 1 (row 1)
+
+@ mov r0, #4 @ 2 *(row * 2 )
+ vmov.i8 d29, #4
+
+ vtbl.8 d16, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 1)
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 1)
+ vadd.s8 d8, d8, d29 @ref_main_idx (row 2)
+ vadd.s8 d9, d9, d29 @ref_main_idx + 1 (row 2)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 2)
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 2)
+ vadd.s8 d4, d4, d29 @ref_main_idx (row 3)
+ vadd.s8 d5, d5, d29 @ref_main_idx + 1 (row 3)
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 3)
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 3)
+ vadd.s8 d8, d8, d29 @ref_main_idx (row 4)
+ vadd.s8 d9, d9, d29 @ref_main_idx + 1 (row 4)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 4)
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 4)
+ vadd.s8 d4, d4, d29 @ref_main_idx (row 5)
+ vadd.s8 d5, d5, d29 @ref_main_idx + 1 (row 5)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+ vtbl.8 d16, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 5)
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 5)
+ vadd.s8 d8, d8, d29 @ref_main_idx (row 6)
+ vadd.s8 d9, d9, d29 @ref_main_idx + 1 (row 6)
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ cmp r4,#4
+ beq end_func
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 6)
+ vmull.u8 q11, d16, d7 @mul (row 5)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 6)
+ vadd.s8 d4, d4, d29 @ref_main_idx (row 7)
+ vadd.s8 d5, d5, d29 @ref_main_idx + 1 (row 7)
+
+ vst1.8 d24, [r2], r3 @st (row 4)
+ vrshrn.i16 d22, q11, #5 @round shft (row 5)
+
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 7)
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 7)
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d22, [r2], r3 @st (row 5)
+ vrshrn.i16 d20, q10, #5 @round shft (row 6)
+ vrshrn.i16 d18, q9, #5 @round shft (row 7)
+
+ vst1.8 d20, [r2], r3 @st (row 6)
+
+    subs        r10, r10, #4                @subtract 4 and go to end if 8x8
+
+ vst1.8 d18, [r2], r3 @st (row 7)
+
+ beq end_func
+
+ subs r11, r11, #8
+ addgt r8, r8, #4
+ addgt r2, r2, r7
+ movle r8, r12
+ suble r2, r2, r4
+ addle r2, r2, #8
+ movle r11, r4, lsl #1
+ ldrle r14, col_for_intra_chroma_addr_2
+ulbl5:
+ addle r14,r14,pc
+ addle r0, r0, #8
+
+ vld1.8 d31, [r14]!
+ vmull.s8 q6, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ vmovn.s16 d10, q6
+ vshr.s16 q6, q6, #5
+ vqmovn.s16 d11, q6
+ vshl.s8 d11, d11, #1
+ orr r5,r0,r0, lsl#8
+ add r5,#0x002
+ add r5,#0x300
+ vdup.u16 d27, r5 @row value inc or reset accordingly
+ ldr r9, [r8]
+ mov r9,r9,lsl #1
+ add r9, r9, r0, lsl #1
+@ sub r9, r9, #1
+ vdup.8 d26, r9
+ vadd.s8 d8, d27, d11 @ref_main_idx (add row)
+ mov r5,r2
+
+@ sub r4,r4,#8
+
+kernel_8_16_32:
+ vmov.i8 d29, #2 @contains #2 for adding to get ref_main_idx + 1
+
+ vsub.s8 d8, d8, d26 @ref_main_idx
+ vmov d26,d10
+
+ subs r11, r11, #8
+ add r6, r1, r9
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 7)
+ vadd.s8 d9, d29, d8 @ref_main_idx + 1
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ addle r0, r0, #8
+ addgt r8, r8, #4
+ vld1.8 {d0,d1,d2,d3}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d24, q11, #5 @round shft (row 5)
+
+ movle r8, r12
+ orr r9,r0,r0, lsl#8
+ mov r9,r9,lsl #1
+ add r9,#0x002
+ add r9,#0x300
+ vdup.u16 d27, r9 @row value inc or reset accordingly
+
+ ldrle r14, col_for_intra_chroma_addr_3
+ulbl6:
+ addle r14,r14,pc
+
+ vadd.s8 d4, d29, d8 @ref_main_idx (row 1)
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 0)
+ vadd.s8 d5, d29, d9 @ref_main_idx + 1 (row 1)
+
+    vmov.i8     d29, #31                    @contains #31 for vand operation
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 0)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vld1.8 d31, [r14]!
+ vand d6, d29, d26 @fract values in d1/ idx values in d0
+
+    vmov.i8     d29, #4                     @contains #4 for adding to step ref_main_idx by one row
+
+ vst1.8 d24, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vadd.s8 d8, d29, d8 @ref_main_idx (row 2)
+ vtbl.8 d16, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 1)
+ vadd.s8 d9, d29, d9 @ref_main_idx + 1 (row 2)
+
+ movle r11, r4,lsl #1
+ ldr r9, [r8]
+ mov r9,r9,lsl #1
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 1)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vadd.s8 d4, d4, d29 @ref_main_idx (row 3)
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 2)
+ vadd.s8 d5, d5, d29 @ref_main_idx + 1 (row 3)
+
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 2)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+ vst1.8 d18, [r5], r3 @(from previous loop)st (row 7)
+
+ vadd.s8 d8, d8, d29 @ref_main_idx (row 4)
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 3)
+ vadd.s8 d9, d9, d29 @ref_main_idx + 1 (row 4)
+
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 3)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vmull.s8 q7, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ add r5,r2,r3,lsl#2
+ add r9, r9, r0, lsl #1
+
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vadd.s8 d4, d4, d29 @ref_main_idx (row 5)
+ vtbl.8 d12, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 4)
+ vadd.s8 d5, d5, d29 @ref_main_idx + 1 (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vtbl.8 d13, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 4)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vmovn.s16 d10, q7
+ vshr.s16 q7, q7, #5
+
+ vadd.s8 d8, d8, d29 @ref_main_idx (row 6)
+ vtbl.8 d16, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 5)
+ vadd.s8 d9, d9, d29 @ref_main_idx + 1 (row 6)
+
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vtbl.8 d17, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 5)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+@ sub r9, r9, #1
+ vqmovn.s16 d11, q7
+
+ vadd.s8 d4, d4, d29 @ref_main_idx (row 7)
+ vtbl.8 d14, {d0,d1,d2,d3}, d8 @load from ref_main_idx (row 6)
+ vadd.s8 d5, d5, d29 @ref_main_idx + 1 (row 7)
+
+ vshl.u8 d11,#1
+
+ vmull.u8 q11, d16, d7 @mul (row 5)
+ vtbl.8 d15, {d0,d1,d2,d3}, d9 @load from ref_main_idx + 1 (row 6)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vadd.s8 d8, d27, d11 @ref_main_idx (add row)
+ vdup.8 d26, r9
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+
+ add r2,r3, lsl #2
+ addgt r2, r7, r2
+ suble r2, r2, r4, lsl #1
+ addle r2,r2,#8
+
+    subs        r10, r10, #4                @subtract 4 and go to end if 8x8
+
+ bne kernel_8_16_32
+epil_8_16_32:
+
+ vtbl.8 d10, {d0,d1,d2,d3}, d4 @load from ref_main_idx (row 7)
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1,d2,d3}, d5 @load from ref_main_idx + 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d24, q11, #5 @round shft (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d24, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vst1.8 d18, [r5], r3 @st (row 7)
+
+end_func:
+ add sp, sp, #132
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
new file mode 100644
index 0000000..2ede914
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -0,0 +1,571 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_chroma_mode_19_to_25.s
+@*
+@* @brief
+@* contains function definitions for intra prediction chroma modes 19 to 25.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* naveen sr
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* chroma intraprediction filter for modes 19 to 25
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_chroma_mode_19_to_25(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
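+
+@When the prediction angle is negative the top reference is first extended
+@to the left by projecting side samples through the inverse angle. A hedged
+@C sketch in comments (not assembled) of that extension, with ref_main
+@denoting the ref_temp area built below and indices in bytes (u,v pairs
+@are copied as halfwords):
+@
+@    inv_ang_sum = 128;                       /* 8.8 fixed point */
+@    for (k = -1; k > (nt * intra_pred_ang) >> 5; k--) {
+@        inv_ang_sum += inv_ang;
+@        ref_main[2 * k] = pu1_ref[4 * nt - 2 * (inv_ang_sum >> 8)];
+@    }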
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_chroma_mode_19_to_25_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+gai4_ihevc_ang_table_addr_1:
+.long gai4_ihevc_ang_table - ulbl3 - 8
+
+gai4_ihevc_ang_table_addr_2:
+.long gai4_ihevc_ang_table - ulbl4 - 8
+
+.type ihevc_intra_pred_chroma_mode_19_to_25_a9q, %function
+
+ihevc_intra_pred_chroma_mode_19_to_25_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r7, gai4_ihevc_ang_table_addr_1
+ulbl3:
+ add r7,r7,pc
+
+ ldr r5,[sp,#44] @mode (19 to 25)
+ ldr r8, gai4_ihevc_inv_ang_table_addr
+ulbl1:
+ add r8,r8,pc
+
+ add r7, r7, r5, lsl #2 @gai4_ihevc_ang_table[mode]
+ add r8, r8, r5, lsl #2 @gai4_ihevc_inv_ang_table
+ sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12]
+
+ ldr r7, [r7] @intra_pred_ang
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2]
+
+ ldr r8, [r8] @inv_ang
+ add r6, sp, r4 , lsl #1 @ref_temp + 2 * nt
+
+ mul r9, r4, r7 @nt*intra_pred_ang
+
+ sub r6, r6, #2 @ref_temp + 2*nt - 2
+
+ add r1, r0, r4, lsl #2 @r1 = &src[4nt]
+ vdup.8 d30, r7 @intra_pred_ang
+
+ mov r7, r4
+
+ asr r9, r9, #5
+
+ vld1.32 d0,[r1]! @ pu1_ref[two_nt + k]
+
+ vst1.32 d0,[r6]! @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]@
+
+ subs r7, r7, #4
+ beq end_loop_copy
+ subs r7,r7,#4
+ beq loop_copy_8
+ subs r7,r7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ vld1.8 {d0,d1,d2,d3},[r1]!
+ vld1.8 {d4,d5,d6},[r1]!
+
+ vst1.8 {d0,d1,d2,d3},[r6]!
+
+
+ vst1.8 {d4,d5,d6},[r6]!
+ b end_loop_copy
+
+loop_copy_16:
+ vld1.8 {d0,d1,d2},[r1]!
+ vst1.8 {d0,d1,d2},[r6]!
+
+ b end_loop_copy
+
+loop_copy_8:
+ vld1.8 d0,[r1]!
+ vst1.8 d0,[r6]!
+
+end_loop_copy:
+
+ ldrh r11, [r1]
+ strh r11, [r6]
+
+ cmp r9, #-1
+ bge linear_filtering
+
+ add r6, sp, r4 ,lsl #1 @ref_temp + 2 * nt
+ sub r6, r6, #4 @ref_temp + 2 * nt - 2 - 2
+
+ mov r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
+
+ mov r7, #128 @inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add r7, r7, r8 @inv_ang_sum += inv_ang
+ mov r0,r7, lsr #8
+ mov r0,r0, lsl #1
+ ldrh r11, [r1, -r0]
+ strh r11, [r6], #-2
+
+ subs r9, r9, #1
+
+ bne loop_copy_ref_idx
+
+
+linear_filtering:
+@ after copy
+@ below code is taken from mode 27 to 33 and modified
+
+ ldr r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
+ulbl4:
+ add r6,r6,pc
+
+ lsl r7,r4,#2 @four_nt
+
+ add r8,r6,r5,lsl #2 @*gai4_ihevc_ang_table[mode]
+ ldr r9,[r8] @intra_pred_ang = gai4_ihevc_ang_table[mode]
+ ldr r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+ add r1,r1,pc
+ add r6,r1,#1
+
+ add r8, sp, r4, lsl #1 @ref_temp + 2 * nt
+ sub r8,#2 @ref_temp + 2*nt -2
+
+ mov lr,#0 @row
+ mov r12,r4
+ lsl r4,r4,#1
+
+core_loop_8:
+ add r8,r8,#2 @pu1_ref_main_idx += (four_nt + 1)
+ vdup.8 d0,r9 @intra_pred_ang
+ mov r12,r4,lsr #4 @divide by 8
+
+ vmov.i8 d1,#32
+ mul r7,r4,r12
+
+ vmov.i16 q3,#31
+
+
+ mov r1,r8
+
+ mov r5,r4
+ mov r11,#2
+
+prologue:
+ vld1.8 {d3},[r6] @loads the row value
+ vmull.s8 q1,d3,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.s16 d5,q1,#5 @idx = pos >> 5
+ vshl.s8 d5,d5,#1
+
+ vdup.8 d31,d4[0]
+ add r0,r2,r3
+
+ vmov.u32 lr,d5[0] @(i row)extract idx to the r register
+@ lsl lr,lr,#1
+
+ vdup.8 d29,d4[1] @(ii)
+ sbfx r9,lr,#0,#8
+
+ add r10,r8,r9 @(i row)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(i row)ref_main_idx
+ sbfx r9,lr,#8,#8
+
+ vld1.8 {d9},[r10] @(i row)ref_main_idx_1
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ sbfx r9,lr,#16,#8
+ vsub.u8 d30,d1,d31 @32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ sbfx r9,lr,#24,#8
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+
+ vdup.8 d31,d4[4] @(v)
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d5[1] @extract idx to the r register
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+@ lsl lr,lr,#1
+
+ vst1.8 {d10},[r2]! @(i row)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx r9,lr,#0,#8
+ vdup.8 d29,d4[5] @(vi)
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ sbfx r9,lr,#8,#8
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+ vdup.8 d27,d4[6] @(vii)
+
+ sbfx r9,lr,#16,#8
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d25,d4[7] @(viii)
+ sbfx r9,lr,#24,#8
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs r7,r7,#8
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ cmp r4,#8 @ go to end if 4x4
+ beq end_loops
+
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ addgt r8,r8,#8
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subgt r4,r4,#8
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmull.s8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.s16 d3,q1,#5 @idx = pos >> 5
+ vshl.s8 d3,d3,#1
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+@ lsl lr,lr,#1
+ sbfx r9,lr,#0,#8
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+ vdup.8 d31,d4[0]
+ subs r4,r4,#8
+ sbfx r9,lr,#8,#8
+
+ vld1.8 {d8},[r10],r11 @(i)ref_main_idx
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+
+ addle r6,r6,#8 @increment the row value
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d9},[r10] @(i)ref_main_idx_1
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vdup.8 d29,d4[1] @(ii)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx r9,lr,#16,#8
+
+ vst1.8 {d14},[r0],r3 @(vi)
+ vsub.u8 d30,d1,d31 @(i)32-fract(dup_const_32_fract)
+
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx r9,lr,#24,#8
+ movle r4,r5 @reload nt
+
+ vmov.u32 lr,d3[1] @extract idx to the r register
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.s8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+
+ vst1.8 {d22},[r0] @(viii)
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+@ lsl lr,lr,#1
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx r9,lr,#0,#8
+ add r0,r2,r3
+
+ vdup.8 d31,d4[4] @(v)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+ sbfx r9,lr,#8,#8
+
+ vst1.8 {d10},[r2]! @(i)
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+
+ vdup.8 d29,d4[5] @(vi)
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vdup.8 d27,d4[6] @(vii)
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+ sbfx r9,lr,#16,#8
+
+ vdup.8 d25,d4[7] @(viii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vshrn.s16 d3,q1,#5 @idx = pos >> 5
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+ sbfx r9,lr,#24,#8
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+
+ vshl.s8 d3,d3,#1
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+ movle r8,r1 @reload the source to pu1_src+2nt
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+
+ addgt r8,r8,#8 @increment the source next set 8 columns in same row
+ lslle r12,r3,#3
+ suble r12,r12,r5
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ addle r2,r2,r12 @increment the dst pointer to 8*dst_strd - nt
+ sbfx r9,lr,#0,#8
+
+ vmovn.i16 d4,q2
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+@ lsl lr,lr,#1
+
+ subs r7,r7,#8
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ vst1.8 {d14},[r0],r3 @(vi)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.8 {d22},[r0],r3 @(viii)
+ b end_loops
+
+core_loop_4:
+
+end_loops:
+ add sp, sp, #132
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
new file mode 100644
index 0000000..93495f8
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -0,0 +1,693 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_mode_11_to_17.s
+@*
+@* @brief
+@* contains function definitions for intra prediction luma modes 11 to 17.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intraprediction filter for modes 11 to 17
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
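+
+@The kernel below gathers eight prediction samples at a time with vtbl:
+@d8/d9 hold per-lane byte indices into the 16 reference bytes preloaded
+@into d0:d1, so one table lookup replaces eight scalar loads. A hedged
+@sketch in comments of what each lane computes:
+@
+@    pred = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5;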
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_11_to_17_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_neg_idx_11_17
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+idx_neg_idx_11_17_addr_1:
+.long idx_neg_idx_11_17 - ulbl3 - 8
+
+idx_neg_idx_11_17_addr_2:
+.long idx_neg_idx_11_17 - ulbl4 - 8
+
+col_for_intra_luma_addr_1:
+.long col_for_intra_luma - ulbl_1 - 8
+
+col_for_intra_luma_addr_2:
+.long col_for_intra_luma - ulbl_2 - 8
+
+col_for_intra_luma_addr_3:
+.long col_for_intra_luma - ulbl_3 - 8
+
+col_for_intra_luma_addr_4:
+.long col_for_intra_luma - ulbl_4 - 8
+
+.type ihevc_intra_pred_luma_mode_11_to_17_a9q, %function
+
+ihevc_intra_pred_luma_mode_11_to_17_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r7, gai4_ihevc_ang_table_addr
+ulbl1:
+ add r7,r7,pc
+
+ ldr r5,[sp,#44] @mode (11 to 17)
+ ldr r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+ add r8,r8,pc
+
+ add r7, r7, r5, lsl #2 @gai4_ihevc_ang_table[mode]
+ add r8, r8, r5, lsl #2 @gai4_ihevc_inv_ang_table[mode - 11]
+ sub r8, r8, #44
+
+ ldr r7, [r7] @intra_pred_ang
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
+
+ ldr r8, [r8] @inv_ang
+ add r6, sp, r4 @ref_temp + nt
+
+ mul r9, r4, r7 @nt*intra_pred_ang
+
+ sub r6, r6, #1 @ref_temp + nt - 1
+
+ add r1, r0, r4, lsl #1 @r1 = &src[2nt]
+ vdup.8 d30, r7 @intra_pred_ang
+
+ mov r7, r4
+
+ ldrb r11, [r1], #-1
+
+ asr r9, r9, #5
+
+ ldrb r12, [r1], #-1
+ ldrb r10, [r1], #-1
+ ldrb r14, [r1], #-1
+
+ strb r11, [r6], #1
+ strb r12, [r6], #1
+ strb r10, [r6], #1
+ strb r14, [r6], #1
+
+ subs r7, r7, #4
+ beq end_loop_copy
+
+ sub r6,#4
+ sub r1,#3
+
+ subs r7,r7,#4
+ beq loop_copy_8
+ subs r7,r7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ vld1.8 d0,[r1]
+ sub r1,#8
+ vld1.8 d1,[r1]
+ sub r1,#8
+ vld1.8 d2,[r1]
+ sub r1,#8
+ vld1.8 d3,[r1]
+
+ vrev64.8 d0,d0
+ vrev64.8 d1,d1
+ vst1.8 d0,[r6]!
+ vrev64.8 d2,d2
+ vst1.8 d1,[r6]!
+ vrev64.8 d3,d3
+ vst1.8 d2,[r6]!
+ vst1.8 d3,[r6]!
+ sub r1,#1
+ b end_loop_copy
+
+loop_copy_16:
+ vld1.8 d0,[r1]
+ sub r1,#8
+ vld1.8 d1,[r1]
+
+ vrev64.8 d0,d0
+ vrev64.8 d1,d1
+
+ vst1.8 d0,[r6]!
+ vst1.8 d1,[r6]!
+ sub r1,#1
+ b end_loop_copy
+
+loop_copy_8:
+ vld1.8 d0,[r1]
+ vrev64.8 d0,d0
+ vst1.8 d0,[r6]!
+ sub r1,#1
+end_loop_copy:
+
+ ldrb r11, [r1], #-1
+ strb r11, [r6], #1
+
+ cmp r9, #-1
+ bge prologue_8_16_32
+
+ add r6, sp, r4 @ref_temp + nt
+ sub r6, r6, #2 @ref_temp + nt - 2
+
+ mov r12, #0xffffffff
+
+    rsb         r9, r9, r12                 @count to take care of ref_idx
+
+ add r1, r0, r4, lsl #1 @r1 = &src[2nt]
+
+ mov r7, #128 @inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add r7, r7, r8 @inv_ang_sum += inv_ang
+
+ ldrb r11, [r1, r7, lsr #8]
+ strb r11, [r6], #-1
+
+ subs r9, r9, #1
+
+ bne loop_copy_ref_idx
+
+prologue_8_16_32:
+ cmp r4, #4
+ beq sz_4_proc
+ ldr r14, col_for_intra_luma_addr_1
+ulbl_1:
+ add r14,r14,pc
+
+ lsr r10, r4, #3
+ vld1.8 d31, [r14]!
+ mul r10, r4, r10 @block counter (dec by #8)
+
+ mov r11, r4 @col counter to be inc/dec by #8
+ vmull.s8 q11, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ mov r0, #1
+
+ sub r7, r5, #11
+ vdup.8 d2, r0 @contains #1 for adding to get ref_main_idx + 1
+ ldr r12, idx_neg_idx_11_17_addr_1 @load least idx table
+ulbl3:
+ add r12,r12,pc
+
+ mov r0, #2
+ vdup.8 d3, r0
+
+ add r12, r12, r7, lsl #4
+ mov r8, r12
+
+ mov r7, #8
+ sub r7, r7, r3, lsl #3 @r7 = 8-8r3
+
+ ldr r9, [r8]
+ add r1, sp, r4 @ref_temp + nt
+
+ vmovn.s16 d6, q11
+ vdup.8 d26, r9 @least idx added to final idx values
+ sub r1, r1, #1 @ref_temp + nt - 1
+
+ add r6, r1, r9
+
+ vld1.8 {d0,d1}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+ vshr.s16 q11, q11, #5
+
+ mov r0, #31
+ vdup.8 d29, r0 @contains #31 for vand operation
+
+ mov r0, #32
+ vdup.8 d28, r0
+
+ vqmovn.s16 d8, q11
+
+ vand d6, d6, d29 @fract values in d1/ idx values in d0
+
+ mov r0, #1
+ vdup.8 d27, r0 @row value inc or reset accordingly
+
+ vadd.s8 d8, d8, d27 @ref_main_idx (add row)
+ vsub.s8 d8, d8, d26 @ref_main_idx (row 0)
+ vadd.s8 d9, d8, d2 @ref_main_idx + 1 (row 0)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 0)
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 0)
+ vadd.s8 d4, d8, d2 @ref_main_idx (row 1)
+ vadd.s8 d5, d9, d2 @ref_main_idx + 1 (row 1)
+
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 1)
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 1)
+ vadd.s8 d8, d8, d3 @ref_main_idx (row 2)
+ vadd.s8 d9, d9, d3 @ref_main_idx + 1 (row 2)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 2)
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 2)
+ vadd.s8 d4, d4, d3 @ref_main_idx (row 3)
+ vadd.s8 d5, d5, d3 @ref_main_idx + 1 (row 3)
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 3)
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 3)
+ vadd.s8 d8, d8, d3 @ref_main_idx (row 4)
+ vadd.s8 d9, d9, d3 @ref_main_idx + 1 (row 4)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 4)
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 4)
+ vadd.s8 d4, d4, d3 @ref_main_idx (row 5)
+ vadd.s8 d5, d5, d3 @ref_main_idx + 1 (row 5)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 5)
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 5)
+ vadd.s8 d8, d8, d3 @ref_main_idx (row 6)
+ vadd.s8 d9, d9, d3 @ref_main_idx + 1 (row 6)
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 6)
+ vmull.u8 q11, d16, d7 @mul (row 5)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 6)
+ vadd.s8 d4, d4, d3 @ref_main_idx (row 7)
+ vadd.s8 d5, d5, d3 @ref_main_idx + 1 (row 7)
+
+ vst1.8 d24, [r2], r3 @st (row 4)
+ vrshrn.i16 d22, q11, #5 @round shft (row 5)
+
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 7)
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 7)
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d22, [r2], r3 @st (row 5)
+ vrshrn.i16 d20, q10, #5 @round shft (row 6)
+ vrshrn.i16 d18, q9, #5 @round shft (row 7)
+
+ vst1.8 d20, [r2], r3 @st (row 6)
+
+ subs r10, r10, #8 @subtract 8 and go to end if 8x8
+
+ vst1.8 d18, [r2], r3 @st (row 7)
+
+ beq end_func
+
+ subs r11, r11, #8
+ addgt r8, r8, #4
+ addgt r2, r2, r7
+ movle r8, r12
+ suble r2, r2, r4
+ addle r2, r2, #8
+ movle r11, r4
+ ldrle r14, col_for_intra_luma_addr_2
+ulbl_2:
+ addle r14,r14,pc
+ addle r0, r0, #8
+
+ mov r5,r2
+ vld1.8 d31, [r14]!
+ vmull.s8 q6, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ vmovn.s16 d10, q6
+ vshr.s16 q6, q6, #5
+ vqmovn.s16 d11, q6
+ vdup.8 d27, r0 @row value inc or reset accordingly
+ ldr r9, [r8]
+ add r9, r0, r9
+ sub r9, r9, #1
+ vdup.8 d26, r9
+ vadd.s8 d8, d27, d11 @ref_main_idx (add row)
+
+ sub r4,r4,#8
+
+kernel_8_16_32:
+
+ vsub.s8 d8, d8, d26 @ref_main_idx
+ vmov d26,d10
+
+ subs r11, r11, #8
+ add r6, r1, r9
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 7)
+ vadd.s8 d9, d2, d8 @ref_main_idx + 1
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ addle r0, r0, #8
+ addgt r8, r8, #4
+ vld1.8 {d0,d1}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d24, q11, #5 @round shft (row 5)
+
+ ldrle r14, col_for_intra_luma_addr_3
+ulbl_3:
+ addle r14,r14,pc
+ movle r8, r12
+ vdup.8 d27, r0 @row value inc or reset accordingly
+
+ vadd.s8 d4, d2, d8 @ref_main_idx (row 1)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 0)
+ vadd.s8 d5, d2, d9 @ref_main_idx + 1 (row 1)
+
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 0)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vld1.8 d31, [r14]!
+ vand d6, d29, d26 @fract values in d1/ idx values in d0
+
+ vst1.8 d24, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vadd.s8 d8, d3, d8 @ref_main_idx (row 2)
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 1)
+ vadd.s8 d9, d3, d9 @ref_main_idx + 1 (row 2)
+
+ addle r11, r4, #8
+ ldr r9, [r8]
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 1)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vadd.s8 d4, d4, d3 @ref_main_idx (row 3)
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 2)
+ vadd.s8 d5, d5, d3 @ref_main_idx + 1 (row 3)
+
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 2)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+ vst1.8 d18, [r5], r3 @(from previous loop)st (row 7)
+
+ vadd.s8 d8, d8, d3 @ref_main_idx (row 4)
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 3)
+ vadd.s8 d9, d9, d3 @ref_main_idx + 1 (row 4)
+
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 3)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vmull.s8 q7, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ add r5,r2,r3,lsl#2
+ add r9, r0, r9
+
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vadd.s8 d4, d4, d3 @ref_main_idx (row 5)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 4)
+ vadd.s8 d5, d5, d3 @ref_main_idx + 1 (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 4)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vmovn.s16 d10, q7
+ vshr.s16 q7, q7, #5
+
+ vadd.s8 d8, d8, d3 @ref_main_idx (row 6)
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 5)
+ vadd.s8 d9, d9, d3 @ref_main_idx + 1 (row 6)
+
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 5)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+ sub r9, r9, #1
+ vqmovn.s16 d11, q7
+
+ vadd.s8 d4, d4, d3 @ref_main_idx (row 7)
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 6)
+ vadd.s8 d5, d5, d3 @ref_main_idx + 1 (row 7)
+
+ vmull.u8 q11, d16, d7 @mul (row 5)
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 6)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vadd.s8 d8, d27, d11 @ref_main_idx (add row)
+ vdup.8 d26, r9
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+
+ add r2, r2, r3, lsl #2
+ addgt r2, r7, r2
+ suble r2, r2, r4
+
+ subs r10, r10, #8 @subtract 8 and go to end if 8x8
+
+ bne kernel_8_16_32
+epil_8_16_32:
+
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 7)
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d24, q11, #5 @round shft (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d24, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vst1.8 d18, [r5], r3 @st (row 7)
+
+
+ b end_func
+
+sz_4_proc:
+ ldr r14, col_for_intra_luma_addr_4
+ulbl_4:
+ add r14,r14,pc
+
+ vld1.8 d31, [r14]
+ mov r12, #1
+
+ vdup.8 d2, r12 @contains #1 for adding to get ref_main_idx + 1
+ mov r0, #2
+
+ vdup.8 d3, r0
+ ldr r12, idx_neg_idx_11_17_addr_2 @load least idx table
+ulbl4:
+ add r12,r12,pc
+
+ vmull.s8 q11, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ sub r7, r5, #11
+
+ add r12, r12, r7, lsl #4
+ mov r8, r12
+
+ ldr r9, [r8]
+
+ vdup.8 d26, r9 @least idx added to final idx values
+ add r6, sp, r4 @ref_temp + nt
+
+ sub r6, r6, #1 @ref_temp + nt - 1
+ vmovn.s16 d6, q11
+ add r6, r6, r9
+
+ vld1.8 {d0,d1}, [r6] @stores the 32 values required, based on index values (from least idx)
+ mov r0, #31
+
+ vdup.8 d29, r0 @contains #31 for vand operation
+ mov r1, #32
+
+ vdup.8 d28, r1
+
+ vshr.s16 q11, q11, #5
+ vqmovn.s16 d8, q11
+
+ vand d6, d6, d29 @fract values in d1/ idx values in d0
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vadd.s8 d8, d8, d2 @ref_main_idx (add 1)
+ vsub.s8 d8, d8, d26 @ref_main_idx
+ vadd.s8 d9, d8, d2 @ref_main_idx + 1
+
+ vadd.s8 d4, d8, d2 @row 1 ref_main_idx
+ vadd.s8 d5, d9, d2
+
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 0)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 0)
+
+
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 1)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vadd.s8 d8, d8, d3 @idx (row 2)
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 1)
+ vadd.s8 d9, d9, d3 @idx+1 (row 2)
+
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 2)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vrshrn.i16 d24, q12, #5 @round shift (row 0)
+
+ vadd.s8 d4, d4, d3 @idx (row 3)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 2)
+ vadd.s8 d5, d5, d3 @idx+1 (row 3)
+
+ vmull.u8 q10, d12, d7 @mul (row 2)
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 3)
+ vmlal.u8 q10, d13, d6 @mul (row 2)
+
+ vst1.32 d24[0], [r2], r3 @st row 0
+ vrshrn.i16 d22, q11, #5 @round shift (row 1)
+
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 3)
+
+ vmull.u8 q9, d16, d7 @mul (row 3)
+ vmlal.u8 q9, d17, d6 @mul (row 3)
+
+ vst1.32 d22[0], [r2], r3 @st row 1
+ vrshrn.i16 d20, q10, #5 @round shift (row 2)
+
+ vst1.32 d20[0], [r2], r3 @st row 2
+
+ vrshrn.i16 d18, q9, #5 @round shift (row 3)
+
+ vst1.32 d18[0], [r2], r3 @st (row 3)
+
+end_func:
+ add sp, sp, #132
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
new file mode 100644
index 0000000..af342bf
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -0,0 +1,653 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_filters_luma_mode_19_to_25.s
+@*
+@* @brief
+@* contains function definitions for intra prediction filtering for
+@* luma angular modes 19 to 25. functions are coded in neon assembly
+@* and can be compiled using rvct
+@*
+@* @author
+@* naveen sr
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intra prediction for angular modes 19 to 25
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* intra prediction mode (19 to 25)
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_mode_19_to_25(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_19_to_25_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+gai4_ihevc_ang_table_addr_1:
+.long gai4_ihevc_ang_table - ulbl_1 - 8
+
+gai4_ihevc_ang_table_addr_2:
+.long gai4_ihevc_ang_table - ulbl_2 - 8
+
+.type ihevc_intra_pred_luma_mode_19_to_25_a9q, %function
+
+ihevc_intra_pred_luma_mode_19_to_25_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r7, gai4_ihevc_ang_table_addr_1
+ulbl_1:
+ add r7,r7,pc
+
+ ldr r5,[sp,#44] @mode (19 to 25)
+ ldr r8, gai4_ihevc_inv_ang_table_addr
+ulbl1:
+ add r8,r8,pc
+
+ add r7, r7, r5, lsl #2 @gai4_ihevc_ang_table[mode]
+ add r8, r8, r5, lsl #2 @gai4_ihevc_inv_ang_table
+ sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12]
+
+ ldr r7, [r7] @intra_pred_ang
+ sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1]
+
+ ldr r8, [r8] @inv_ang
+ add r6, sp, r4 @ref_temp + nt
+
+ mul r9, r4, r7 @nt*intra_pred_ang
+
+ sub r6, r6, #1 @ref_temp + nt - 1
+
+ add r1, r0, r4, lsl #1 @r1 = &src[2nt]
+ vdup.8 d30, r7 @intra_pred_ang
+
+ mov r7, r4
+
+ asr r9, r9, #5
+
+ vld1.32 d0[0],[r1]! @ pu1_ref[two_nt + k]
+
+ vst1.32 d0[0],[r6]! @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]@
+
+ subs r7, r7, #4
+ beq end_loop_copy
+ sub r1,#4
+ sub r6,#4
+ subs r7,r7,#4
+ beq loop_copy_8
+ subs r7,r7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ vld1.8 d0,[r1]!
+ vld1.8 d1,[r1]!
+ vld1.8 d2,[r1]!
+ vld1.8 d3,[r1]!
+
+ vst1.8 d0,[r6]!
+ vst1.8 d1,[r6]!
+ vst1.8 d2,[r6]!
+ vst1.8 d3,[r6]!
+ b end_loop_copy
+
+loop_copy_16:
+ vld1.8 d0,[r1]!
+ vld1.8 d1,[r1]!
+
+ vst1.8 d0,[r6]!
+ vst1.8 d1,[r6]!
+ b end_loop_copy
+
+loop_copy_8:
+ vld1.8 d0,[r1]!
+ vst1.8 d0,[r6]!
+
+end_loop_copy:
+
+ ldrb r11, [r1]
+ strb r11, [r6]
+
+ cmp r9, #-1
+ bge linear_filtering
+
+ add r6, sp, r4 @ref_temp + nt
+ sub r6, r6, #2 @ref_temp + nt - 2
+
+ mov r12, #0xffffffff
+
+ rsb r9, r9, r12 @count to take care of ref_idx
+
+ add r1, r0, r4, lsl #1 @r1 = &src[2nt]
+
+ mov r7, #128 @inv_ang_sum
+
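+@ the loop below extends the reference to the left for negative angles by
+@ projecting the side samples with the inverse angle (a scalar sketch
+@ derived from the code; illustrative only, not assembled):
+@     ref_temp[nt - 2 - i] = pu1_ref[2 * nt - ((128 + (i + 1) * inv_ang) >> 8)]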
+loop_copy_ref_idx:
+
+ add r7, r7, r8 @inv_ang_sum += inv_ang
+ mov r14,r7,lsr #8
+ ldrb r11, [r1, -r14]
+@ ldrb r11, [r1, -r7, lsr #8]
+ strb r11, [r6], #-1
+
+ subs r9, r9, #1
+
+ bne loop_copy_ref_idx
+
+
+linear_filtering:
+@ after copy
+@ below code is taken from mode 27 to 33 and modified
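+@ each output row below follows the angular interpolation (a scalar
+@ sketch derived from the inline comments; illustrative only, not
+@ assembled):
+@     pos   = (row + 1) * intra_pred_ang
+@     idx   = pos >> 5
+@     fract = pos & 31
+@     dst[row][col] = (ref_main[col + idx] * (32 - fract)
+@                      + ref_main[col + idx + 1] * fract + 16) >> 5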
+
+ ldr r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
+ulbl_2:
+ add r6,r6,pc
+
+ add r8,r6,r5,lsl #2 @*gai4_ihevc_ang_table[mode]
+ ldr r9,[r8] @intra_pred_ang = gai4_ihevc_ang_table[mode]
+ ldr r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+ add r1,r1,pc
+ add r6,r1,#1
+
+ add r8, sp, r4 @ref_temp + nt
+ sub r8,#1 @ref_temp + nt -1
+
+ tst r4,#7
+ mov lr,#0 @row
+ mov r12,r4
+ bne core_loop_4
+
+core_loop_8:
+ add r8,r8,#1 @pu1_ref_main_idx += (two_nt + 1)
+ vdup.8 d0,r9 @intra_pred_ang
+ mov r12,r4,lsr #3 @divide by 8
+
+ vmov.i8 d1,#32
+ mul r7,r4,r12
+
+ vmov.i16 q3,#31
+ @lsl r12,r3,#3
+
+ mov r1,r8
+ @sub r12,r12,r4
+ mov r5,r4
+ mov r11,#1
+
+prologue:
+ vld1.8 {d3},[r6] @loads the row value
+ vmull.s8 q1,d3,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.s16 d5,q1,#5 @idx = pos >> 5
+
+ vdup.8 d31,d4[0]
+ add r0,r2,r3
+
+ vmov.u32 lr,d5[0] @(i row)extract idx to the r register
+
+ vdup.8 d29,d4[1] @(ii)
+ sbfx r9,lr,#0,#8
+
+ add r10,r8,r9 @(i row)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(i row)ref_main_idx
+ sbfx r9,lr,#8,#8
+
+ vld1.8 {d9},[r10] @(i row)ref_main_idx_1
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ sbfx r9,lr,#16,#8
+ vsub.u8 d30,d1,d31 @32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ sbfx r9,lr,#24,#8
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+
+ vdup.8 d31,d4[4] @(v)
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d5[1] @extract idx to the r register
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d10},[r2]! @(i row)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx r9,lr,#0,#8
+ vdup.8 d29,d4[5] @(vi)
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ sbfx r9,lr,#8,#8
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+ vdup.8 d27,d4[6] @(vii)
+
+ sbfx r9,lr,#16,#8
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d25,d4[7] @(viii)
+ sbfx r9,lr,#24,#8
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs r4,r4,#8
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ addgt r8,r8,#8
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subgt r7,r7,#8
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmull.s8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.s16 d3,q1,#5 @idx = pos >> 5
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ sbfx r9,lr,#0,#8
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+ vdup.8 d31,d4[0]
+ subs r4,r4,#8
+ sbfx r9,lr,#8,#8
+
+ vld1.8 {d8},[r10],r11 @(i)ref_main_idx
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+
+ addle r6,r6,#8 @increment the row value
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d9},[r10] @(i)ref_main_idx_1
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vdup.8 d29,d4[1] @(ii)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx r9,lr,#16,#8
+
+ vst1.8 {d14},[r0],r3 @(vi)
+ vsub.u8 d30,d1,d31 @(i)32-fract(dup_const_32_fract)
+
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx r9,lr,#24,#8
+ movle r4,r5 @reload nt
+
+ vmov.u32 lr,d3[1] @extract idx to the r register
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.s8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+
+ vst1.8 {d22},[r0] @(viii)
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx r9,lr,#0,#8
+ add r0,r2,r3
+
+ vdup.8 d31,d4[4] @(v)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+ sbfx r9,lr,#8,#8
+
+ vst1.8 {d10},[r2]! @(i)
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+
+ vdup.8 d29,d4[5] @(vi)
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vdup.8 d27,d4[6] @(vii)
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+ sbfx r9,lr,#16,#8
+
+ vdup.8 d25,d4[7] @(viii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vshrn.s16 d3,q1,#5 @idx = pos >> 5
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+ sbfx r9,lr,#24,#8
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+ movle r8,r1 @reload the source to pu1_src+2nt
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+
+ addgt r8,r8,#8 @increment the source next set 8 columns in same row
+ lslle r12,r3,#3
+ suble r12,r12,r5
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ addle r2,r2,r12 @increment the dst pointer to 8*dst_strd - nt
+ sbfx r9,lr,#0,#8
+
+ vmovn.i16 d4,q2
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ subs r7,r7,#8
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ vst1.8 {d14},[r0],r3 @(vi)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.8 {d22},[r0],r3 @(viii)
+ b end_loops
+
+core_loop_4:
+ add r6,r8,#1 @pu1_ref_main_idx += 1
+ mov r8,#0
+
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ mov lr,r5,asr #5 @if(fract_prev > fract)
+ and r5,r5,#31 @fract = pos & (31)
+ add r10,r6,lr @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+ vdup.8 d0,r5 @dup_const_fract
+ rsb r4,r5,#32
+ vdup.8 d1,r4 @dup_const_32_fract
+
+@inner_loop_4
+ vld1.32 {d2[0]},[r10] @ref_main_idx
+ add r8,r8,#1
+@ mov lr,r5 @fract_prev = fract
+
+ vld1.32 {d3[0]},[r11] @ref_main_idx_1
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ mov lr,r5,asr #5 @ pos >> 5
+ and r5,r5,#31 @fract = pos & (31)
+ add r10,r6,lr @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d6,r5 @dup_const_fract
+ vmull.u8 q2,d2,d1 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d7,r4 @dup_const_32_fract
+ vmlal.u8 q2,d3,d0 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.32 {d8[0]},[r10] @ref_main_idx
+ add r8,r8,#1
+
+ vld1.32 {d9[0]},[r11] @ref_main_idx_1
+ vrshrn.i16 d4,q2,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+@ mov lr,r5 @fract_prev = fract
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ mov lr,r5,asr #5 @if(fract_prev > fract)
+ and r5,r5,#31 @fract = pos & (31)
+ add r10,r6,lr @ref_main + idx
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d12,r5 @dup_const_fract
+ vmull.u8 q5,d8,d7 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d13,r4 @dup_const_32_fract
+ vmlal.u8 q5,d9,d6 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.32 {d14[0]},[r10] @ref_main_idx
+ add r8,r8,#1
+
+ vst1.32 {d4[0]},[r2],r3
+ vrshrn.i16 d10,q5,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.32 {d15[0]},[r11] @ref_main_idx_1
+@ mov lr,r5 @fract_prev = fract
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ mov lr,r5,asr #5 @if(fract_prev > fract)
+ and r5,r5,#31 @fract = pos & (31)
+ add r10,r6,lr @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d18,r5 @dup_const_fract
+ vmull.u8 q8,d14,d13 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d19,r4 @dup_const_32_fract
+ vmlal.u8 q8,d15,d12 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.32 {d20[0]},[r10] @ref_main_idx
+
+ vst1.32 {d10[0]},[r2],r3
+ vrshrn.i16 d16,q8,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+ vld1.32 {d21[0]},[r11] @ref_main_idx_1
+
+ vmull.u8 q11,d20,d19 @vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d18 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.32 {d16[0]},[r2],r3
+ vrshrn.i16 d22,q11,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.32 {d22[0]},[r2],r3
+
+end_loops:
+ add sp, sp, #132
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_filters_neon_intr.c b/common/arm/ihevc_intra_pred_filters_neon_intr.c
new file mode 100644
index 0000000..0e89de3
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_filters_neon_intr.c
@@ -0,0 +1,2920 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_intra_pred_filters_neon_intr.c
+*
+* @brief
+* Contains function Definition for intra prediction interpolation filters
+*
+*
+* @author
+* Yogeswaran RS
+*
+* @par List of Functions:
+* - ihevc_intra_pred_luma_ref_substitution()
+* - ihevc_intra_pred_ref_filtering()
+* - ihevc_intra_pred_luma_planar()
+* - ihevc_intra_pred_luma_dc()
+* - ihevc_intra_pred_luma_horz()
+* - ihevc_intra_pred_luma_ver()
+* - ihevc_intra_pred_luma_mode2()
+* - ihevc_intra_pred_luma_mode_18_34()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "arm_neon.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+
+/*****************************************************************************/
+/* Table Look-up */
+/*****************************************************************************/
+
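+/* GET_BITS(y, x): nonzero iff bit x of y is set, */
+/* e.g. GET_BITS(0x5, 2) is 1 and GET_BITS(0x5, 1) is 0 */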
+#define GET_BITS(y,x) (((y) & (1 << (x))) != 0)
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+ * Intra prediction interpolation filter for pu1_ref substitution
+ *
+ *
+ * @par Description:
+ * Reference substitution process for samples unavailable for prediction
+ * Refer to section 8.4.4.2.2
+ *
+ * @param[in] pu1_top_left
+ * UWORD8 pointer to the top-left
+ *
+ * @param[in] pu1_top
+ * UWORD8 pointer to the top
+ *
+ * @param[in] pu1_left
+ * UWORD8 pointer to the left
+ *
+ * @param[in] src_strd
+ * WORD32 Source stride
+ *
+ * @param[in] nbr_flags
+ * WORD32 neighbor availability flags
+ *
+ * @param[in] nt
+ * WORD32 transform Block size
+ *
+ * @param[in] dst_strd
+ * WORD32 Destination stride
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
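+
+/* Layout of the assembled (4 * nt + 1)-sample reference array written to
+ * pu1_dst (derived from the code below; for orientation only):
+ *   pu1_dst[0 .. 2*nt - 1]      left and bottom-left samples, bottom-up
+ *   pu1_dst[2*nt]               top-left sample
+ *   pu1_dst[2*nt + 1 .. 4*nt]   top and top-right samples, left to right
+ */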
+
+
+void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+ UWORD8 pu1_ref;
+ WORD32 dc_val, i;
+ WORD32 total_samples = (4 * nt) + 1;
+ WORD32 two_nt = 2 * nt;
+ WORD32 three_nt = 3 * nt;
+ WORD32 get_bits;
+ WORD32 next;
+ WORD32 bot_left, left, top, tp_right, tp_left;
+ WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+ UNUSED(dst_strd);
+ dc_val = 1 << (BIT_DEPTH - 1);
+
+ /* Neighbor Flag Structure*/
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ bits: 1 | 4 | 4 | 4 | 4 (MSB to LSB)
+ */
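+
+ /* For nt <= 8 each field is read as a single bit: e.g. nbr_flags == 0x11111 */
+ /* means all five neighbor blocks are available, while 0x00001 means only */
+ /* the bottom-left block is available (see the 1-bit extractions below) */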
+
+ /* If no neighbor flags are present, fill the neighbor samples with DC value */
+ if(nbr_flags == 0)
+ {
+ for(i = 0; i < total_samples; i++)
+ {
+ pu1_dst[i] = dc_val;
+ }
+ }
+ else
+ {
+ /* Else fill the corresponding samples */
+ pu1_dst[two_nt] = *pu1_top_left;
+ UWORD8 *pu1_dst_tmp2 = pu1_dst;
+ UWORD8 *pu1_top_tmp = pu1_top;
+ pu1_dst_tmp2 += two_nt + 1;
+
+ for(i = 0; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+ uint8x8_t src;
+ for(i = two_nt; i > 0; i -= 8)
+ {
+ src = vld1_u8(pu1_top_tmp);
+ pu1_top_tmp += 8;
+ vst1_u8(pu1_dst_tmp2, src);
+ pu1_dst_tmp2 += 8;
+ }
+
+ if(nt <= 8)
+ {
+ /* 1 bit extraction for all the neighboring blocks */
+ tp_left = (nbr_flags & 0x10000) >> 16;
+ bot_left = nbr_flags & 0x1;
+ left = (nbr_flags & 0x10) >> 4;
+ top = (nbr_flags & 0x100) >> 8;
+ tp_right = (nbr_flags & 0x1000) >> 12;
+
+ next = 1;
+
+ /* If bottom-left is not available, apply the reverse substitution process */
+ if(bot_left == 0)
+ {
+ WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
+
+ /* Check for the 1st available sample from bottom-left*/
+ while(!a_nbr_flag[next])
+ next++;
+
+ /* If Left, top-left are available*/
+ if(next <= 2)
+ {
+ idx = nt * next;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ else /* If top, top-right are available */
+ {
+ /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+ idx = (nt * (next - 1)) + 1;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* Forward Substitution Process */
+ /* If left is Unavailable, copy the last bottom-left value */
+
+ if(left == 0)
+ {
+ uint8x8_t dup_pu1_dst1;
+ UWORD8 *pu1_dst_const_nt = pu1_dst;
+ pu1_dst_const_nt += nt;
+
+ if(0 == (nt & 7))
+ {
+ dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
+ for(i = nt; i > 0; i -= 8)
+ {
+ vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
+ pu1_dst_const_nt += 8;
+
+ }
+ }
+ else
+ {
+ //uint32x2_t dup_pu1_dst4;
+ dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
+ //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]);
+ for(i = nt; i > 0; i -= 4)
+ {
+ vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
+ pu1_dst_const_nt += 4;
+
+ }
+
+ }
+
+ }
+ if(tp_left == 0)
+ pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+ if(top == 0)
+ {
+
+ if(0 == (nt & 7))
+ {
+ uint8x8_t dup_pu1_dst2;
+ UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
+ pu1_dst_const_two_nt_1 += (two_nt + 1);
+ dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
+ for(i = nt; i > 0; i -= 8)
+ {
+ vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
+ pu1_dst_const_two_nt_1 += 8;
+
+ }
+ }
+ else
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
+ }
+ }
+ if(tp_right == 0)
+ {
+ uint8x8_t dup_pu1_dst3;
+ UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
+ pu1_dst_const_three_nt_1 += (three_nt + 1);
+ dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
+ if(0 == (nt & 7))
+ {
+ for(i = nt; i > 0; i -= 8)
+ {
+ vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
+ pu1_dst_const_three_nt_1 += 8;
+
+ }
+ }
+ else
+ {
+ for(i = nt; i > 0; i -= 4)
+ {
+ vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
+ pu1_dst_const_three_nt_1 += 4;
+ }
+
+ }
+
+ }
+ }
+ if(nt == 16)
+ {
+ WORD32 nbr_flags_temp = 0;
+ nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
+ + ((nbr_flags & 0x300) >> 4)
+ + ((nbr_flags & 0x3000) >> 6)
+ + ((nbr_flags & 0x10000) >> 8);
+
+ /* compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
+ /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ nbr_id_from_bl = 32;
+
+ if(nbr_id_from_bl == 32)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags_temp >> 8) & 0x1))
+ {
+ nbr_id_from_bl++;
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ {
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T16_4NT) + 1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+ /* Divide by 8 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T16_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags_temp, 8);
+
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ for(i = 0; i < 8; i++)
+ pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+ }
+ }
+
+ if(nt == 32)
+ {
+ /* compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
+ /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags >> 16) & 0x1))
+ {
+ /* top left not available */
+ nbr_id_from_bl++;
+ /* top and top right; 8 pels per nbr bit */
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T32_4NT)+1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+ /* Divide by 8 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T32_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags, 16);
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ for(i = 0; i < 8; i++)
+ pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+ }
+ }
+
+ }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Intra prediction interpolation filter for ref_filtering
+ *
+ *
+ * @par Description:
+ * Reference DC filtering for neighboring samples dependent on TU size and
+ * mode Refer to section 8.4.4.2.3 in the standard
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] nt
+ * integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intra prediction mode
+ *
+ * @param[in] strong_intra_smoothing_enable_flag
+ *  integer flag enabling strong intra smoothing (applies when nt == 32)
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
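+
+/* Scalar sketch of the smoothing applied below (derived from the NEON code;
+ * for orientation only): when the mode/size dependent filter_flag is set,
+ * each interior reference sample is smoothed with a [1 2 1]/4 kernel,
+ *     pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2,
+ * while pu1_dst[0] and pu1_dst[4 * nt] are copied unfiltered. For nt == 32
+ * with strong smoothing enabled, bilinear interpolation between the corner
+ * samples is used instead, provided both flatness checks against dc_val pass.
+ */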
+
+
+void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
+ WORD32 nt,
+ UWORD8 *pu1_dst,
+ WORD32 mode,
+ WORD32 strong_intra_smoothing_enable_flag)
+{
+ WORD32 filter_flag;
+ WORD32 i = 0;
+ WORD32 four_nt = 4 * nt;
+
+ WORD32 src_4nt;
+
+ /* Naming follows the functionality: e.g. pu1_src_tmp_1 denotes pu1_src + 1, */
+ /* src_val_1 holds the value loaded from pu1_src_tmp_1, and add_res holds the result of adding 2 values */
+ UWORD8 *pu1_src_tmp_0 = pu1_src;
+ UWORD8 *pu1_src_tmp_1;
+ UWORD8 *pu1_src_tmp_2;
+ UWORD8 *pu1_dst_tmp_0 = pu1_dst;
+ UWORD8 *pu1_dst_tmp_1;
+
+ uint8x8_t src_val_0, src_val_2;
+ uint8x8_t src_val_1, shift_res;
+ uint8x8_t dup_const_2;
+ uint16x8_t mul_res, add_res;
+ WORD32 bi_linear_int_flag = 0;
+ WORD32 abs_cond_left_flag = 0;
+ WORD32 abs_cond_top_flag = 0;
+ WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+ shift_res = vdup_n_u8(0);
+
+ filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+
+ if(0 == filter_flag)
+ {
+ if(pu1_src == pu1_dst)
+ {
+ return;
+ }
+ else
+ {
+ for(i = four_nt; i > 0; i -= 8)
+ {
+ src_val_0 = vld1_u8(pu1_src_tmp_0);
+ pu1_src_tmp_0 += 8;
+ vst1_u8(pu1_dst_tmp_0, src_val_0);
+ pu1_dst_tmp_0 += 8;
+ }
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+ }
+
+ else
+ {
+ /* If strong intra smoothing is enabled and transform size is 32 */
+ if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+ {
+ /*Strong Intra Filtering*/
+ abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
+ - (2 * pu1_src[3 * nt]))) < dc_val;
+ abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
+ - (2 * pu1_src[nt]))) < dc_val;
+
+ bi_linear_int_flag = ((1 == abs_cond_left_flag)
+ && (1 == abs_cond_top_flag));
+ }
+
+ src_4nt = pu1_src[4 * nt];
+ /* Strong filtering of reference samples */
+ if(1 == bi_linear_int_flag)
+ {
+ WORD32 two_nt = four_nt >> 1;
+
+ WORD32 pu1_src_0_val = pu1_src[0];
+ WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
+ WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];
+
+ WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
+ uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);
+
+ WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
+ uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);
+
+ const UWORD8 *const_col_i;
+ uint8x8_t const_col_i_val;
+ uint16x8_t prod_val_1;
+ uint16x8_t prod_val_2;
+ uint16x8_t prod_val_3;
+ uint16x8_t prod_val_4;
+ uint8x8_t res_val_1;
+ uint8x8_t res_val_2;
+ uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
+ uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
+ uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
+ pu1_dst_tmp_0 = pu1_dst + 1;
+ pu1_dst_tmp_1 = pu1_dst + two_nt + 1;
+
+ const_col_i = gau1_ihevc_planar_factor + 1;
+
+ for(i = two_nt; i > 0; i -= 8)
+ {
+ const_col_i_val = vld1_u8(const_col_i);
+ const_col_i += 8;
+
+ prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
+ prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);
+
+ res_val_1 = vrshrn_n_u16(prod_val_2, 6);
+ prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);
+
+ vst1_u8(pu1_dst_tmp_0, res_val_1);
+ pu1_dst_tmp_0 += 8;
+ prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);
+
+ res_val_2 = vrshrn_n_u16(prod_val_4, 6);
+ vst1_u8(pu1_dst_tmp_1, res_val_2);
+ pu1_dst_tmp_1 += 8;
+ }
+ pu1_dst[2 * nt] = pu1_src[2 * nt];
+ }
+ else
+ {
+ pu1_src_tmp_1 = pu1_src + 1;
+ pu1_src_tmp_2 = pu1_src + 2;
+ pu1_dst_tmp_0 += 1;
+
+ dup_const_2 = vdup_n_u8(2);
+
+ /* Extremities Untouched*/
+ pu1_dst[0] = pu1_src[0];
+
+ /* To avoid aliasing when dest and src share the same pointer, this load is done
+ * outside the loop and the 2nd consecutive load is done before the store of the 1st */
+
+ /* Perform bilinear filtering of Reference Samples */
+ for(i = (four_nt - 1); i > 0; i -= 8)
+ {
+ src_val_0 = vld1_u8(pu1_src_tmp_0);
+ pu1_src_tmp_0 += 8;
+
+ src_val_2 = vld1_u8(pu1_src_tmp_2);
+ pu1_src_tmp_2 += 8;
+
+ src_val_1 = vld1_u8(pu1_src_tmp_1);
+ pu1_src_tmp_1 += 8;
+
+ if(i < four_nt - 1)
+ {
+ vst1_u8(pu1_dst_tmp_0, shift_res);
+ pu1_dst_tmp_0 += 8;
+ }
+
+ add_res = vaddl_u8(src_val_0, src_val_2);
+
+ mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
+ shift_res = vrshrn_n_u16(mul_res, 2);
+
+ }
+ vst1_u8(pu1_dst_tmp_0, shift_res);
+ pu1_dst_tmp_0 += 8;
+ }
+ pu1_dst[4 * nt] = src_4nt;
+
+ }
+
+}
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+* Intra prediction interpolation filter for luma planar
+*
+* @par Description:
+* Planar intra prediction with the neighboring reference samples located
+* at 'pu1_ref' predicting the TU block located at 'pu1_dst'
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intra prediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
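+
+/* Scalar form of the planar prediction vectorized below (derived from the
+ * NEON code; for orientation only):
+ *   pu1_dst[row * dst_strd + col] =
+ *       ( (nt - 1 - col) * pu1_ref[2 * nt - 1 - row]     left sample
+ *       + (col + 1)      * pu1_ref[3 * nt + 1]           top-right sample
+ *       + (nt - 1 - row) * pu1_ref[2 * nt + 1 + col]     top sample
+ *       + (row + 1)      * pu1_ref[nt - 1]               bottom-left sample
+ *       + nt ) >> (log2(nt) + 1)
+ */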
+
+void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ /* naming convention: (nt - 1 - col) --> const_nt_1_col (const refers to gau1_ihevc_planar_factor) */
+ /* const_nt_1_col values are loaded into a d register */
+ /* similarly pu1_ref[nt - 1] --> pu1_ref_nt_1 */
+ /* the value of pu1_ref_nt_1 is duplicated into a d register, hence pu1_ref_nt_1_dup */
+ /* log2nt + 1 is accounted for when the values are assigned */
+ /* in the width-multiple-of-4 case the row loop is also unrolled by 2, with stores handled accordingly */
+
+ WORD32 row, col = 0;
+ WORD32 log2nt_plus1 = 6;
+ WORD32 two_nt, three_nt;
+ UWORD8 *pu1_ref_two_nt_1;
+ UWORD8 *pu1_dst_tmp;
+ const UWORD8 *const_nt_1_col;
+ uint8x8_t const_nt_1_col_t;
+ const UWORD8 *const_col_1;
+ uint8x8_t const_col_1_t;
+ uint8_t const_nt_1_row;
+ uint8x8_t const_nt_1_row_dup;
+ uint8_t const_row_1;
+ uint8x8_t const_row_1_dup;
+ uint8_t const_nt = nt;
+ uint16x8_t const_nt_dup;
+ uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
+ uint8x8_t pu1_ref_nt_1_dup;
+ uint8_t pu1_ref_two_nt_1_row;
+ uint8_t pu1_ref_three_nt_1;
+ uint8x8_t pu1_ref_two_nt_1_row_dup;
+ uint8x8_t pu1_ref_two_nt_1_t;
+ uint8x8_t pu1_ref_three_nt_1_dup;
+ uint16x8_t prod_t1;
+ uint16x8_t prod_t2;
+ uint16x8_t sto_res_tmp;
+ uint8x8_t sto_res;
+ int16x8_t log2nt_dup;
+ UNUSED(src_strd);
+ UNUSED(mode);
+ log2nt_plus1 = 32 - CLZ(nt);
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+ /* loops have been unrolled considering the fact that the width is a multiple of 8 */
+ if(0 == (nt & 7))
+ {
+ pu1_dst_tmp = pu1_dst;
+ const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
+
+ const_col_1 = gau1_ihevc_planar_factor + 1;
+ pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
+
+ pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
+ const_nt_dup = vdupq_n_u16(const_nt);
+
+ log2nt_dup = vdupq_n_s16(log2nt_plus1);
+ log2nt_dup = vnegq_s16(log2nt_dup);
+
+ pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
+
+ for(row = 0; row < nt; row++)
+ {
+ pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
+ pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
+
+ const_nt_1_row = nt - 1 - row;
+ const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
+
+ const_row_1 = row + 1;
+ const_row_1_dup = vdup_n_u8(const_row_1);
+
+ const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
+
+ const_col_1 = gau1_ihevc_planar_factor + 1;
+ pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
+
+ for(col = nt; col > 0; col -= 8)
+ {
+ const_nt_1_col_t = vld1_u8(const_nt_1_col);
+ const_nt_1_col -= 8;
+ const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
+
+ const_col_1_t = vld1_u8(const_col_1);
+ const_col_1 += 8;
+ prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
+
+ pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
+ pu1_ref_two_nt_1 += 8;
+ prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
+
+ prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
+ prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
+ prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
+ prod_t1 = vaddq_u16(prod_t1, prod_t2);
+
+ sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
+ sto_res = vmovn_u16(sto_res_tmp);
+ vst1_u8(pu1_dst_tmp, sto_res);
+ pu1_dst_tmp += 8;
+ }
+ pu1_dst_tmp += dst_strd - nt;
+ }
+ }
+ /* loops have been unrolled considering the fact that the width is a multiple of 4 */
+ /* if the column count is a multiple of 4, the height must be a multiple of 2 */
+ else
+ {
+ uint8x8_t const_row_1_dup1;
+ uint8x8_t pu1_ref_two_nt_1_t1;
+ uint8x8_t const_nt_1_col_t1;
+ uint8x8_t const_col_1_t1;
+ uint8x8_t pu1_ref_two_nt_1_row_dup1;
+ uint8x8_t const_nt_1_row_dup1;
+
+ pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
+
+ pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
+ const_nt_dup = vdupq_n_u16(const_nt);
+
+ log2nt_dup = vdupq_n_s16(log2nt_plus1);
+ log2nt_dup = vnegq_s16(log2nt_dup);
+
+ pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
+
+ for(row = 0; row < nt; row += 2)
+ {
+ pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
+ pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
+ pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
+ pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
+ pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);
+
+ const_nt_1_row = nt - 1 - row;
+ const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
+ const_nt_1_row = nt - 2 - row;
+ const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
+ const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);
+
+ const_row_1 = row + 1;
+ const_row_1_dup = vdup_n_u8(const_row_1);
+ const_row_1 = row + 2;
+ const_row_1_dup1 = vdup_n_u8(const_row_1);
+ const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);
+
+ const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;
+
+ const_col_1 = gau1_ihevc_planar_factor + 1;
+
+ pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
+
+ for(col = nt; col > 0; col -= 4)
+ {
+ const_nt_1_col_t = vld1_u8(const_nt_1_col);
+ const_nt_1_col -= 4;
+ const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
+
+ const_col_1_t = vld1_u8(const_col_1);
+ const_col_1 += 4;
+ const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));
+
+ pu1_dst_tmp = pu1_dst;
+ const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);
+
+ const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
+ prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
+
+ pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
+ pu1_ref_two_nt_1 += 4;
+ const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);
+
+ pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
+ prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
+
+ pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
+ prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
+
+ prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
+ prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
+ prod_t1 = vaddq_u16(prod_t1, prod_t2);
+
+ sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
+ sto_res = vmovn_u16(sto_res_tmp);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst_tmp += dst_strd;
+
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ pu1_dst += 4;
+ }
+ pu1_dst += 2 * dst_strd - nt;
+ }
+ }
+
+}
+/* INTRA_PRED_LUMA_PLANAR */
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+* Intra prediction for DC mode with the neighboring reference samples located
+* at 'pu1_ref' predicting the TU block located at 'pu1_dst'
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intra prediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
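+
+/* Scalar form of the DC prediction vectorized below (derived from the NEON
+ * code; for orientation only):
+ *   dc_val = (sum of the nt left and nt top reference samples + nt)
+ *                >> (log2(nt) + 1)
+ * and, when nt != 32, the first row and column are additionally filtered:
+ *   pu1_dst[0]              = (pu1_ref[2*nt - 1] + 2*dc_val + pu1_ref[2*nt + 1] + 2) >> 2
+ *   pu1_dst[col]            = (pu1_ref[2*nt + 1 + col] + 3*dc_val + 2) >> 2
+ *   pu1_dst[row * dst_strd] = (pu1_ref[2*nt - 1 - row] + 3*dc_val + 2) >> 2
+ */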
+
+void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
+ WORD32 i = 0;
+ WORD32 row = 0, col = 0, col_count;
+ WORD32 log2nt_plus1 = 6;
+ WORD32 two_nt = 0;
+ uint16x8_t ref_load_q;
+ uint16x8_t three_dc_val_t;
+ uint8x8_t sto_res_tmp;
+ uint8x8_t sto_res_tmp1;
+ uint8x8_t sto_res_tmp2;
+ uint8x8_t sto_res_tmp3;
+ uint8x8_t sto_res_tmp4;
+ uint8x8_t dc_val_t;
+
+ UWORD8 *pu1_ref_tmp;
+ UWORD8 *pu1_ref_tmp1;
+ UWORD8 *pu1_dst_tmp;
+ UWORD8 *pu1_dst_tmp1;
+ UWORD8 *pu1_dst_tmp2;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ /* log2nt + 1 is accounted for when the values are assigned. */
+ log2nt_plus1 = 32 - CLZ(nt);
+
+ /* loops have been unrolled considering the fact that the width is a multiple of 8 */
+ if(0 == (nt & 7))
+ {
+ uint8x8_t ref_load1;
+ uint8x8_t ref_load2;
+ uint16x4_t acc_dc_pair1;
+ uint32x2_t acc_dc_pair2;
+ uint64x1_t acc_dc = vdup_n_u64(col);
+
+ two_nt = 2 * nt;
+ pu1_ref_tmp = pu1_ref + nt;
+ pu1_ref_tmp1 = pu1_ref + two_nt + 1;
+
+ for(i = two_nt; i > nt; i -= 8)
+ {
+ ref_load1 = vld1_u8(pu1_ref_tmp);
+ pu1_ref_tmp += 8;
+ acc_dc_pair1 = vpaddl_u8(ref_load1);
+
+ ref_load2 = vld1_u8(pu1_ref_tmp1);
+ pu1_ref_tmp1 += 8;
+
+ acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
+ acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
+
+ acc_dc_pair1 = vpaddl_u8(ref_load2);
+ acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
+ acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
+ }
+
+ dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
+ dc_val_t = vdup_n_u8(dc_val);
+ two_dc_val = 2 * dc_val;
+ three_dc_val = 3 * dc_val;
+ three_dc_val += 2;
+
+ three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
+ pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
+ pu1_dst_tmp = pu1_dst;
+
+
+ if(nt == 32)
+ {
+ for(row = 0; row < nt; row++)
+ {
+ for(col = nt; col > 0; col -= 8)
+ {
+ vst1_u8(pu1_dst_tmp, dc_val_t);
+ pu1_dst_tmp += 8;
+ }
+ pu1_dst_tmp += dst_strd - nt;
+ }
+ }
+ else
+
+ {
+ for(col = nt; col > 0; col -= 8)
+ {
+ ref_load1 = vld1_u8(pu1_ref_tmp);
+ pu1_ref_tmp += 8;
+ ref_load_q = vmovl_u8(ref_load1);
+ ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
+ ref_load_q = vshrq_n_u16(ref_load_q, 2);
+ sto_res_tmp = vmovn_u16(ref_load_q);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp);
+ pu1_dst_tmp += 8;
+ }
+
+ pu1_ref_tmp = pu1_ref + two_nt - 9;
+ pu1_dst_tmp = pu1_dst + dst_strd;
+ col_count = nt - 8;
+
+ /* All rows except the first are handled here */
+ /* Both rows and columns are unrolled by 8 */
+ /* Stores are arranged to match the unrolling */
+ /* Beyond the first column of these remaining rows the values are */
+ /* constant, so a duplicated constant value is stored */
+ /* When the width exceeds 8, the remaining constant columns are */
+ /* handled in the inner for loop */
+
+ for(row = nt; row > 0; row -= 8)
+ {
+ pu1_dst_tmp1 = pu1_dst_tmp + 8;
+ ref_load1 = vld1_u8(pu1_ref_tmp);
+ pu1_ref_tmp -= 8;
+ ref_load_q = vmovl_u8(ref_load1);
+ ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
+ ref_load_q = vshrq_n_u16(ref_load_q, 2);
+ sto_res_tmp = vmovn_u16(ref_load_q);
+
+ sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);
+
+ sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
+ sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp1);
+ pu1_dst_tmp += dst_strd;
+
+ sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
+ sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp2);
+ pu1_dst_tmp += dst_strd;
+
+ sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
+ sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp3);
+ pu1_dst_tmp += dst_strd;
+
+ sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
+ sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp4);
+ pu1_dst_tmp += dst_strd;
+
+ sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
+ sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp1);
+ pu1_dst_tmp += dst_strd;
+
+ sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
+ sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp2);
+ pu1_dst_tmp += dst_strd;
+
+ sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
+ sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
+ vst1_u8(pu1_dst_tmp, sto_res_tmp3);
+ pu1_dst_tmp += dst_strd;
+ /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
+ if(row != 8)
+ vst1_u8(pu1_dst_tmp, sto_res_tmp4);
+ pu1_dst_tmp += dst_strd;
+
+ for(col = col_count; col > 0; col -= 8)
+ {
+ pu1_dst_tmp2 = pu1_dst_tmp1;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 += dst_strd;
+
+ /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
+ if(row != 8)
+ vst1_u8(pu1_dst_tmp1, dc_val_t);
+ pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
+ }
+ }
+ pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
+ }
+ }
+ /* loops have been unrolled considering the fact that the width is a multiple of 4 */
+ else
+ {
+ WORD32 acc_dc;
+ two_nt = 2 * nt;
+
+ acc_dc = 0;
+ pu1_ref_tmp = pu1_ref + nt + 1;
+ for(i = nt; i < two_nt; i++)
+ {
+ acc_dc += pu1_ref[i];
+ acc_dc += pu1_ref_tmp[i];
+ }
+ dc_val = (acc_dc + nt) >> (log2nt_plus1);
+ two_dc_val = 2 * dc_val;
+ three_dc_val = 3 * dc_val;
+ three_dc_val = three_dc_val + 2;
+ dc_val_t = vdup_n_u8(dc_val);
+
+ if(nt == 32)
+ {
+ pu1_dst_tmp = pu1_dst;
+ for(row = 0; row < nt; row++)
+ {
+ for(col = nt; col > 0; col -= 4)
+ {
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
+ pu1_dst_tmp += 4;
+ }
+ pu1_dst_tmp += dst_strd - nt;
+ }
+ }
+ else
+
+ {
+ for(col = 1; col < nt; col++)
+ {
+ pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
+ }
+
+ pu1_dst_tmp = pu1_dst + dst_strd + 0;
+ /* Since first row is already updated before, loop count is nt-1 */
+ for(row = nt - 1; row > 0; row -= 1)
+ {
+ for(col = nt; col > 0; col -= 4)
+ {
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
+ pu1_dst_tmp += 4;
+ }
+ pu1_dst_tmp += dst_strd - nt;
+ }
+
+ for(row = 1; row < nt; row++)
+ {
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
+ }
+ pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
+ }
+ }
+}
+/* INTRA_PRED_LUMA_DC */
+
+/**
+*******************************************************************************
+*
+* @brief
+ * Intra prediction interpolation filter for the luma horizontal mode.
+ *
+ * @par Description:
+ * Horizontal intra prediction with the neighboring reference samples located
+ * at 'pu1_ref' predicting the TU block located at 'pu1_dst'
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] nt
+ * integer Transform Block size
+ *
+ * @param[in] mode
+ *  integer intra prediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
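+
+/* Scalar form of the horizontal prediction vectorized below (derived from
+ * the NEON code; for orientation only):
+ *   pu1_dst[row * dst_strd + col] = pu1_ref[2*nt - 1 - row]
+ * and, when nt != 32, the first row is gradient-filtered and saturated to
+ * the [0, 255] range:
+ *   pu1_dst[col] = pu1_ref[2*nt - 1]
+ *                  + ((pu1_ref[2*nt + 1 + col] - pu1_ref[2*nt]) >> 1)
+ */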
+
+void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 two_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ two_nt = 2 * nt;
+
+
+ UWORD8 *pu1_dst_tmp = pu1_dst;
+ UWORD32 pu1_val;
+ uint8x8_t pu1_val_two_nt_1_row;
+ if(nt == 32)
+ {
+ pu1_dst_tmp = pu1_dst;
+ for(row = 0; row < nt; row++)
+ {
+ pu1_val = pu1_ref[two_nt - 1 - row];
+ pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
+ for(col = nt; col > 0; col -= 8)
+ {
+ vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
+ pu1_dst_tmp += 8;
+ }
+ pu1_dst_tmp += dst_strd - nt;
+ }
+ }
+ else
+
+
+    /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */
+    /* Variables are named after the operation (instruction) they perform */
+    /* (e.g. shift_val contains the shifted value, */
+    /* add_sat the add-and-saturate result) */
+    /* Loops are unrolled by 4 and 8, since the input width is a multiple of either 4 or 8 */
+    /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
+ {
+ if(0 != (nt & 7)) /* cond for multiple of 4 */
+ {
+ UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
+ UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
+ UWORD8 *pu1_dst_4 = pu1_dst;
+ UWORD8 *pu1_dst_4_tmp = pu1_dst;
+
+ uint32x2_t pu1_ref_val1, pu1_ref_val2;
+ uint8x8_t dup_sub, round_val, dup_val;
+ uint16x8_t dup_add, sub_val;
+ int16x8_t shift_val, add_sat;
+
+ pu1_ref_val1 = vdup_n_u32(0);
+ pu1_ref_val2 = vdup_n_u32(0);
+
+ dup_sub = vdup_n_u8(pu1_ref[two_nt]);
+
+ dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
+
+ pu1_ref_4_two_nt_plus1 += (two_nt + 1);
+
+ pu1_ref_4_two_nt_minus_nt += (two_nt - nt);
+
+ for(row = nt; row > 0; row -= 4)
+ {
+ for(col = nt; col > 0; col -= 4)
+ {
+ pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
+ sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
+ shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
+
+ add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
+ round_val = vqmovun_s16(add_sat);
+ vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
+ pu1_dst_4 += dst_strd;
+
+ pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
+ dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
+ vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
+ pu1_dst_4 += dst_strd;
+
+ dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
+ vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
+ pu1_dst_4 += dst_strd;
+
+ dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
+ vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
+ pu1_dst_4 += dst_strd;
+
+
+ }
+            /* advance reference and destination pointers to the next set of 4 columns */
+ pu1_ref_4_two_nt_minus_nt += 3;
+ pu1_ref_4_two_nt_plus1 += 4;
+ pu1_dst_4 = (pu1_dst_4_tmp + 4);
+ }
+
+ }
+
+    /* dup_1 - dup_8 hold values duplicated from the loaded source */
+    /* Variables are named after the operation (instruction) they perform */
+    /* Loops are unrolled by 4 and 8, since the input width is a multiple of either 4 or 8 */
+    /* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
+
+ else
+ {
+ UWORD8 *pu1_ref_tmp_1 = pu1_ref;
+ UWORD8 *pu1_ref_tmp_2 = pu1_ref;
+
+ UWORD8 *pu1_dst_tmp_1 = pu1_dst;
+ UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
+ UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;
+
+ uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
+ uint16x8_t sub_res, dup_add;
+ int16x8_t shift_res, add_res;
+
+ dup_sub = vdup_n_u8(pu1_ref[two_nt]);
+ dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
+
+ pu1_ref_tmp_1 += (two_nt + 1);
+ pu1_ref_tmp_2 += (two_nt - 1);
+
+ for(col = nt; col > 0; col -= 8)
+ {
+ src_tmp = vld1_u8(pu1_ref_tmp_1);
+ pu1_ref_tmp_1 += 8;
+
+ sub_res = vsubl_u8(src_tmp, dup_sub);
+ shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
+ add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
+ round_val = vqmovun_s16(add_res);
+ vst1_u8(pu1_dst_tmp_1, round_val);
+ pu1_dst_tmp_1 += 8;
+ }
+
+ for(row = nt; row > 0; row -= 8)
+ {
+ pu1_ref_tmp_2 -= 8;
+
+ src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
+ rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
+
+ dup_1 = vdup_lane_u8(rev_res, 0);
+ dup_2 = vdup_lane_u8(rev_res, 1);
+ dup_3 = vdup_lane_u8(rev_res, 2);
+ dup_4 = vdup_lane_u8(rev_res, 3);
+ dup_5 = vdup_lane_u8(rev_res, 4);
+ dup_6 = vdup_lane_u8(rev_res, 5);
+ dup_7 = vdup_lane_u8(rev_res, 6);
+ dup_8 = vdup_lane_u8(rev_res, 7);
+
+ for(col = nt; col > 0; col -= 8)
+ {
+ pu1_dst_tmp_2 = pu1_dst_tmp_3;
+
+ vst1_u8(pu1_dst_tmp_2, dup_1);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, dup_2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, dup_3);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, dup_4);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, dup_5);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, dup_6);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, dup_7);
+ pu1_dst_tmp_2 += dst_strd;
+
+                /* For the last set of 8 rows, only 7 rows need to be updated, since the first row has already been written */
+ if(row != 8)
+ vst1_u8(pu1_dst_tmp_2, dup_8);
+ pu1_dst_tmp_2 += dst_strd;
+
+ pu1_dst_tmp_3 += 8;
+ }
+ pu1_dst_tmp_2 -= (nt - 8);
+ pu1_dst_tmp_3 = pu1_dst_tmp_2;
+ }
+ }
+ }
+}
+/* INTRA_PRED_LUMA_HORZ */
+
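+/* Editorial sketch, not decoder code: the scalar behaviour of the horizontal
+ * kernel above for nt < 32 (for nt == 32 every row is a plain copy of one
+ * left-reference sample). The row-0 gradient filter mirrors the
+ * dup_sub/dup_add setup in the intrinsics. */
+static void intra_horz_sketch(const UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                              WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col, s, two_nt = 2 * nt;
+    for(col = 0; col < nt; col++)        /* gradient-filtered first row */
+    {
+        s = pu1_ref[two_nt - 1]
+          + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
+        pu1_dst[col] = (UWORD8)(s < 0 ? 0 : (s > 255 ? 255 : s));
+    }
+    for(row = 1; row < nt; row++)        /* other rows replicate the left ref */
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
+}
+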
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for the vertical luma mode.
+*
+* @par Description:
+* Vertical intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ two_nt = 2 * nt;
+
+ UWORD8 *pu1_dst_tmp = pu1_dst;
+ UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
+ uint8x8_t pu1_val_two_nt_1_col;
+ if(nt == 32)
+ {
+ pu1_dst_tmp = pu1_dst;
+ for(row = 0; row < nt; row++)
+ {
+ for(col = nt; col > 0; col -= 8)
+ {
+ pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
+ pu1_ref_tmp_1 += 8;
+ vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
+ pu1_dst_tmp += 8;
+ }
+ pu1_ref_tmp_1 -= nt;
+ pu1_dst_tmp += dst_strd - nt;
+ }
+ }
+ else
+
+ {
+        /* Variables are named after the operation (instruction) they perform */
+        /* (e.g. shift_val contains the shifted value, */
+        /* add_sat the add-and-saturate result) */
+        /* Loops are unrolled by 4 and 8, since the input width is a multiple of either 4 or 8 */
+        /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
+
+ if(0 != (nt & 7))
+ {
+ WORD32 cond_4 = 0;
+ UWORD8 *pu1_ref_val1 = pu1_ref;
+ UWORD8 *pu1_ref_val2 = pu1_ref;
+ UWORD8 *pu1_ref_val3 = pu1_ref;
+
+ UWORD8 *pu1_dst_val1 = pu1_dst;
+ UWORD8 *pu1_dst_val2 = pu1_dst;
+ UWORD8 *pu1_dst_val3 = pu1_dst;
+
+ uint8x8_t dup_2_sub, round_val, vext_val;
+ uint16x8_t dup_2_add;
+ uint32x2_t src_val1, src_val2, src_val3;
+ uint16x8_t sub_val;
+ int16x8_t shift_val1, add_sat;
+ uint64x1_t shift_val2;
+
+ src_val1 = vdup_n_u32(0);
+ src_val2 = vdup_n_u32(0);
+ src_val3 = vdup_n_u32(0);
+ pu1_ref_val1 += (two_nt - nt);
+ pu1_ref_val3 += (two_nt + 2);
+ pu1_ref_val2 += (two_nt + 1);
+
+ dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
+ dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
+
+ /* loops to store the first nt sets of values in the destination */
+
+ for(row = nt; row > 0; row -= 4)
+ {
+ for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
+ {
+ /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
+ src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
+ sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
+ shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
+ add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
+ round_val = vqmovun_s16(add_sat);
+
+ /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
+ src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
+ vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
+ vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+ pu1_dst_val1 += dst_strd;
+
+ shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
+
+ vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
+ vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+ pu1_dst_val1 += dst_strd;
+
+ shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
+
+ vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
+ vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+ pu1_dst_val1 += dst_strd;
+
+ shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
+
+ vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
+ vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
+ pu1_dst_val1 += dst_strd;
+
+ pu1_ref_val1 -= 4;
+ }
+
+ /* loop to store next sets of eight values in the destination */
+
+ for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
+ {
+ src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
+
+ vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+ pu1_dst_val2 += dst_strd;
+
+ vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+ pu1_dst_val2 += dst_strd;
+
+ vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+ pu1_dst_val2 += dst_strd;
+
+ vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
+ pu1_dst_val2 += dst_strd;
+ }
+ pu1_ref_val2 += 4;
+ pu1_dst_val3 += 4;
+ pu1_dst_val2 = pu1_dst_val3;
+ cond_4 = 1;
+ }
+ }
+
+ /* rows and columns are unrolled by 8, when the width is multiple of 8 */
+ else
+ {
+ WORD32 cond = 0, col_1;
+ UWORD8 *pu1_dst_tmp_1 = pu1_dst;
+ UWORD8 *pu1_dst_tmp_2 = pu1_dst;
+ UWORD8 *pu1_dst_tmp_3 = pu1_dst;
+
+ UWORD8 *pu1_ref_tmp_1 = pu1_ref;
+ UWORD8 *pu1_ref_tmp_2 = pu1_ref;
+ UWORD8 *pu1_ref_tmp_3 = pu1_ref;
+
+ uint8x8_t pu1_src_tmp1;
+ uint8x8_t pu1_src_tmp2;
+
+ uint8x8_t dup_sub;
+ uint16x8_t dup_add;
+ int16x8_t subsh_val;
+ int16x8_t addsat_val;
+ uint16x8_t sub_val;
+ uint8x8_t round_val;
+ uint8x8_t vext_t;
+ uint64x1_t shift_64;
+
+ dup_sub = vdup_n_u8(pu1_ref[two_nt]);
+ dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
+
+ pu1_ref_tmp_1 += (two_nt);
+ pu1_ref_tmp_1 -= 8;
+ pu1_ref_tmp_2 += (two_nt + 2);
+ pu1_ref_tmp_3 += (two_nt + 1);
+
+ /* loops to store the first nt sets of values in the destination */
+
+ for(row = nt; row > 0; row -= 8)
+ {
+ for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
+ {
+ pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
+
+ sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
+ subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
+ addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
+ round_val = vqmovun_s16(addsat_val);
+
+ /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
+
+ pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
+ vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
+
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
+ vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
+ vst1_u8(pu1_dst_tmp_1, vext_t);
+ pu1_dst_tmp_1 += dst_strd;
+
+ pu1_ref_tmp_1 -= 8;
+ }
+
+ /* loop to store next sets of eight values in the destination */
+
+ for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
+ {
+ pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+
+ vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
+ pu1_dst_tmp_2 += dst_strd;
+ }
+ pu1_ref_tmp_3 += 8;
+ pu1_dst_tmp_3 += 8;
+ pu1_dst_tmp_2 = pu1_dst_tmp_3;
+ cond = 1;
+ }
+ }
+ }
+}
+/* INTRA_PRED_LUMA_VER */
+
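+/* Editorial sketch, not decoder code: the scalar behaviour of the vertical
+ * kernel above for nt < 32 (for nt == 32 every row is a plain copy of the
+ * top reference). Column 0 gets the gradient filter built from the
+ * dup_2_sub/dup_2_add (or dup_sub/dup_add) pairs. */
+static void intra_ver_sketch(const UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                             WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col, s, two_nt = 2 * nt;
+    for(row = 0; row < nt; row++)
+    {
+        s = pu1_ref[two_nt + 1]          /* gradient-filtered first column */
+          + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+        pu1_dst[row * dst_strd] = (UWORD8)(s < 0 ? 0 : (s > 255 ? 255 : s));
+        for(col = 1; col < nt; col++)    /* remaining columns copy the top ref */
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
+    }
+}
+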
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode2.
+*
+* @par Description:
+* Intraprediction for mode 2 (sw angle) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 two_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+    /* rev_res holds the reversed result value */
+    /* Loops are unrolled by 4 and 8, since the input width is a multiple of either 4 or 8 */
+    /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
+
+ if(0 != (nt & 7))
+ {
+ UWORD8 *pu1_ref_tmp = pu1_ref;
+ UWORD8 *pu1_dst_tmp = pu1_dst;
+ uint8x8_t pu1_src_val, rev_res;
+ uint64x1_t shift_res;
+
+ for(col = nt; col > 0; col -= 4)
+ {
+ for(row = nt; row > 0; row -= 4)
+ {
+ /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
+
+ pu1_src_val = vld1_u8(pu1_ref_tmp);
+ shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
+ rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
+
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
+ pu1_dst_tmp += dst_strd;
+
+ shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
+ pu1_dst_tmp += dst_strd;
+
+ shift_res = vshr_n_u64(shift_res, 8);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
+ pu1_dst_tmp += dst_strd;
+
+ shift_res = vshr_n_u64(shift_res, 8);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
+ pu1_dst_tmp += dst_strd;
+ }
+ }
+ }
+
+    /* rev_val_second and rev_val_first reverse the loaded values into the required order */
+    /* shift_64 shifts the reversed second set of values to obtain the value we need */
+    /* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
+
+ else
+ {
+ UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
+ UWORD8 *pu1_dst_tmp = pu1_dst;
+ UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
+
+ uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
+ uint64x1_t shift_val;
+
+ two_nt = 2 * nt;
+ pu1_ref_two_nt_minus2 += (two_nt);
+ pu1_ref_two_nt_minus2 -= 8;
+
+ for(col = nt; col > 0; col -= 8)
+ {
+ for(row = nt; row > 0; row -= 8)
+ {
+ pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
+ rev_val_first = vrev64_u8(pu1_src_val2);
+
+ pu1_ref_two_nt_minus2 -= 8;
+ pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
+ rev_val_second = vrev64_u8(pu1_src_val1);
+
+ vext_t = vext_u8(rev_val_first, rev_val_second, 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+
+ shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
+ vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
+ vst1_u8(pu1_dst_tmp, vext_t);
+ pu1_dst_tmp += dst_strd;
+ }
+ pu1_dst_tmp_plus8 += 8;
+ pu1_dst_tmp = pu1_dst_tmp_plus8;
+ pu1_ref_two_nt_minus2 += (nt - 8);
+ }
+ }
+}
+/* INTRA_PRED_LUMA_MODE2 */
+
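+/* Editorial sketch, not decoder code: mode 2 is the 45-degree angle with
+ * intra_pred_ang == 32, so idx == row + 1 and every output degenerates to a
+ * pure copy from the left reference; the kernel above realizes this with
+ * reversed 8-byte loads plus vext. */
+static void intra_mode2_sketch(const UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                               WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col];
+}
+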
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 18 & mode 34.
+*
+* @par Description:
+* Intraprediction for mode 18 (sw angle) and mode 34 (ne angle) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col, idx;
+ WORD32 intraPredAngle = 32;
+ WORD32 two_nt;
+ UNUSED(src_strd);
+ two_nt = 2 * nt;
+
+ UWORD8 *pu1_ref_tmp = pu1_ref;
+ UWORD8 *pu1_ref_tmp1 = pu1_ref;
+ UWORD8 *pu1_dst_tmp = pu1_dst;
+ UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
+
+ uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
+
+    /* src_tmp_1st and src_tmp_2nd hold the first and the next eight values loaded from the source (pu1_ref) */
+    /* vext1 - vext7 hold the vext results of the two loaded vectors, arranged to help dual issue */
+    /* Loops are unrolled by 4 and 8, since the input width is a multiple of either 4 or 8 */
+    /* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
+    /* Separate loops are maintained for mode 18 and mode 34 */
+
+    /* condition to select the multiple-of-8 path */
+ if(0 == (nt & 7))
+ {
+ if(mode == 34)
+ {
+ pu1_ref_tmp += (two_nt + 2);
+
+ for(row = nt; row > 0; row -= 8)
+ {
+ for(col = nt; col > 0; col -= 8)
+ {
+ /* Loading 1st eight values */
+ src_tmp_1st = vld1_u8(pu1_ref_tmp);
+ pu1_ref_tmp += 8;
+
+ /* Loading next eight values */
+ src_tmp_2nd = vld1_u8(pu1_ref_tmp);
+
+ /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
+ vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
+ vst1_u8(pu1_dst_tmp, src_tmp_1st);
+ pu1_dst_tmp += dst_strd;
+
+ vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
+ vst1_u8(pu1_dst_tmp, vext1);
+ pu1_dst_tmp += dst_strd;
+
+ vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
+ vst1_u8(pu1_dst_tmp, vext2);
+ pu1_dst_tmp += dst_strd;
+
+ vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
+ vst1_u8(pu1_dst_tmp, vext3);
+ pu1_dst_tmp += dst_strd;
+
+ vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
+ vst1_u8(pu1_dst_tmp, vext4);
+ pu1_dst_tmp += dst_strd;
+
+ vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
+ vst1_u8(pu1_dst_tmp, vext5);
+ pu1_dst_tmp += dst_strd;
+
+ vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
+ vst1_u8(pu1_dst_tmp, vext6);
+ pu1_dst_tmp += dst_strd;
+
+ vst1_u8(pu1_dst_tmp, vext7);
+ pu1_dst_tmp += dst_strd;
+ }
+
+ pu1_dst_tmp_plus8 += 8;
+ pu1_dst_tmp = pu1_dst_tmp_plus8;
+ pu1_ref_tmp -= (nt - 8);
+ }
+ }
+ else /* Loop for mode 18 */
+ {
+ pu1_ref_tmp += (two_nt);
+
+ for(row = nt; row > 0; row -= 8)
+ {
+ for(col = nt; col > 0; col -= 8)
+ {
+ /* Loading 1st eight values */
+ src_tmp_1st = vld1_u8(pu1_ref_tmp);
+ pu1_ref_tmp -= 8;
+
+ /* Loading next eight values */
+ src_tmp_2nd = vld1_u8(pu1_ref_tmp);
+
+ /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
+ vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
+ vst1_u8(pu1_dst_tmp, src_tmp_1st);
+ pu1_dst_tmp += dst_strd;
+
+ vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
+ vst1_u8(pu1_dst_tmp, vext1);
+ pu1_dst_tmp += dst_strd;
+
+ vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
+ vst1_u8(pu1_dst_tmp, vext2);
+ pu1_dst_tmp += dst_strd;
+
+ vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
+ vst1_u8(pu1_dst_tmp, vext3);
+ pu1_dst_tmp += dst_strd;
+
+ vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
+ vst1_u8(pu1_dst_tmp, vext4);
+ pu1_dst_tmp += dst_strd;
+
+ vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
+ vst1_u8(pu1_dst_tmp, vext5);
+ pu1_dst_tmp += dst_strd;
+
+ vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
+ vst1_u8(pu1_dst_tmp, vext6);
+ pu1_dst_tmp += dst_strd;
+
+ vst1_u8(pu1_dst_tmp, vext7);
+ pu1_dst_tmp += dst_strd;
+ }
+ pu1_dst_tmp_plus8 += 8;
+ pu1_dst_tmp = pu1_dst_tmp_plus8;
+ pu1_ref_tmp += (nt + 8);
+ }
+ }
+ }
+
+    /* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
+
+ else /* loop for multiples of 4 */
+ {
+ uint8x8_t src_val1;
+ uint8x8_t src_val2;
+
+ if(mode == 18)
+ intraPredAngle = -32;
+ else if(mode == 34)
+ intraPredAngle = 32;
+
+ for(row = 0; row < nt; row += 2)
+ {
+ /* unrolling 2 rows */
+ idx = ((row + 1) * intraPredAngle) >> 5;
+ pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
+ src_val1 = vld1_u8(pu1_ref_tmp);
+
+ idx = ((row + 2) * intraPredAngle) >> 5;
+ pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
+ src_val2 = vld1_u8(pu1_ref_tmp1);
+
+ /* unrolling 4 col */
+ for(col = nt; col > 0; col -= 4)
+ {
+ pu1_dst_tmp = pu1_dst;
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
+ pu1_dst_tmp += dst_strd;
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
+ pu1_dst += 4;
+ }
+ pu1_dst += 2 * dst_strd - nt;
+ }
+ }
+}
+/* INTRA_PRED_LUMA_MODE_18_34 */
+
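+/* Editorial sketch, not decoder code: modes 18 and 34 are the pure diagonal
+ * angles (-32 and +32), so idx steps by exactly -1 or +1 per row, fract is
+ * always 0 and no interpolation is needed; the kernel above expresses the
+ * per-row shift with vext. */
+static void intra_mode_18_34_sketch(const UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                    WORD32 dst_strd, WORD32 nt, WORD32 mode)
+{
+    WORD32 row, col, two_nt = 2 * nt;
+    WORD32 step = (mode == 18) ? -1 : 1; /* per-row idx increment */
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] =
+                pu1_ref[two_nt + 1 + col + step * (row + 1)];
+}
+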
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Intra prediction interpolation filter for luma mode 3 to mode 9
+ *
+ * @par Description:
+ * Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
+ * reference neighboring samples location pointed by 'pu1_ref' to the TU
+ * block location pointed by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] nt
+ * integer Transform Block size
+ *
+ * @param[in] mode
+ * integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 intra_pred_ang;
+ WORD32 pos, fract = 100, fract_prev;
+ UNUSED(src_strd);
+ if(0 == (nt & 7))
+ {
+
+ UWORD8 *pu1_ref_main_idx = pu1_ref;
+ UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
+
+ UWORD8 *pu1_dst_tmp1 = pu1_dst;
+ UWORD8 *pu1_dst_tmp2 = pu1_dst;
+
+ WORD32 two_nt = 2 * nt;
+
+ pu1_ref_main_idx += two_nt;
+ pu1_ref_main_idx_1 += two_nt - 1;
+
+ uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
+ uint8x8_t shift_res;
+ uint16x8_t mul_res1, mul_res2, add_res;
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ pu1_ref_main_idx -= 8;
+ pu1_ref_main_idx_1 -= 8;
+
+ for(col = 0; col < nt; col++)
+ {
+ fract_prev = fract;
+
+ pos = ((col + 1) * intra_pred_ang);
+ fract = pos & (31);
+
+ if(fract_prev < fract)
+ {
+ pu1_ref_main_idx += 1;
+ pu1_ref_main_idx_1 += 1;
+ }
+
+ dup_const_fract = vdup_n_u8((uint8_t)fract);
+ dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ for(row = nt; row > 0; row -= 8)
+ {
+ ref_main_idx = vld1_u8(pu1_ref_main_idx);
+ ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
+
+ mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+ mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+ pu1_dst_tmp1 += dst_strd;
+
+ pu1_ref_main_idx -= 8;
+ pu1_ref_main_idx_1 -= 8;
+
+ }
+ pu1_dst_tmp2 += 1;
+ pu1_dst_tmp1 = pu1_dst_tmp2;
+
+ pu1_ref_main_idx += nt;
+ pu1_ref_main_idx_1 += nt;
+
+ pu1_ref_main_idx -= 1;
+ pu1_ref_main_idx_1 -= 1;
+
+ }
+ }
+ else
+ {
+ UWORD8 *pu1_ref_tmp1 = pu1_ref;
+ UWORD8 *pu1_ref_tmp2 = pu1_ref;
+ UWORD8 *pu1_dst_tmp1 = pu1_dst;
+ UWORD8 *pu1_dst_tmp2 = pu1_dst;
+
+ pu1_ref_tmp1 += nt;
+ pu1_ref_tmp2 += (nt - 1);
+
+ uint8x8_t dup_fract, dup_32_fract, shift_res;
+ uint16x8_t mul_res1, mul_res2, add_res;
+ uint32x2_t pu1_ref_val1, pu1_ref_val2;
+
+ pu1_ref_val1 = vdup_n_u32(0);
+ pu1_ref_val2 = vdup_n_u32(0);
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+
+ for(col = 0; col < nt; col++)
+ {
+ fract_prev = fract;
+ pos = ((col + 1) * intra_pred_ang);
+ fract = pos & (31);
+ if(fract_prev < fract)
+ {
+ pu1_ref_tmp1 += 1;
+ pu1_ref_tmp2 += 1;
+ }
+ dup_fract = vdup_n_u8((uint8_t)fract);
+ dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ for(row = nt; row > 0; row -= 4)
+ {
+ pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
+ pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
+
+ mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
+ mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+
+ }
+ pu1_ref_tmp1 -= 1;
+ pu1_ref_tmp2 -= 1;
+
+ pu1_dst_tmp2 += 1;
+ pu1_dst_tmp1 = pu1_dst_tmp2;
+
+ }
+
+
+ }
+
+}
+
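+/* Editorial sketch, not decoder code: per output sample, the angular kernels
+ * (modes 3-9 above and 11-17, 19-25, 27-33 below) reduce to the standard HEVC
+ * two-tap interpolation sketched here; ref_main and idx are illustrative
+ * names. vmull_u8/vaddq_u16 build the weighted sum and vrshrn_n_u16(sum, 5)
+ * supplies the "+16 >> 5" rounding in one instruction. The fract sentinels
+ * (100 above, 1000 in later modes) only guarantee that the very first
+ * iteration never takes the pointer-advance branch. */
+static UWORD8 angular_two_tap_sketch(const UWORD8 *ref_main, WORD32 idx,
+                                     WORD32 fract)
+{
+    return (UWORD8)(((32 - fract) * ref_main[idx]
+                     + fract * ref_main[idx + 1] + 16) >> 5);
+}
+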
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Intra prediction interpolation filter for luma mode 11 to mode 17
+ *
+ * @par Description:
+ * Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
+ * with reference neighboring samples location pointed by 'pu1_ref' to the
+ * TU block location pointed by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] nt
+ * integer Transform Block size
+ *
+ * @param[in] mode
+ * integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col, k;
+ WORD32 two_nt;
+ WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+ WORD32 pos, fract = 1000, fract_prev;
+ WORD32 ref_idx;
+
+ UWORD8 *ref_main;
+ UWORD8 *ref_main_tmp;
+
+ UWORD8 *pu1_ref_tmp1 = pu1_ref;
+ UWORD8 *pu1_ref_tmp2 = pu1_ref;
+ UWORD8 *pu1_dst_tmp1 = pu1_dst;
+ UWORD8 *pu1_dst_tmp2 = pu1_dst;
+
+ UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
+
+ uint16x8_t mul_res1, mul_res2, add_res;
+ uint8x8_t dup_const_fract, dup_const_32_fract;
+ uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
+ uint8x8_t ref_left_t;
+ uint32x2_t ref_left_tmp;
+ UNUSED(src_strd);
+ ref_left_tmp = vdup_n_u32(0);
+
+ inv_ang_sum = 128;
+ two_nt = 2 * nt;
+
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+
+ pu1_ref_tmp1 += two_nt;
+
+ ref_main = ref_temp + (nt - 1);
+ ref_main_tmp = ref_main;
+
+ if(0 == (nt & 7))
+ {
+ pu1_ref_tmp2 += (two_nt - 7);
+
+ for(k = nt - 1; k >= 0; k -= 8)
+ {
+
+ ref_left_t = vld1_u8(pu1_ref_tmp2);
+
+ ref_left_t = vrev64_u8(ref_left_t);
+ vst1_u8(ref_main_tmp, ref_left_t);
+ ref_main_tmp += 8;
+ pu1_ref_tmp2 -= 8;
+
+ }
+
+ }
+ else
+ {
+ uint8x8_t rev_val;
+ pu1_ref_tmp2 += (two_nt - (nt - 1));
+
+ for(k = nt - 1; k >= 0; k -= 8)
+ {
+
+ ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
+
+ rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
+ vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
+
+ }
+
+ }
+
+ ref_main[nt] = pu1_ref[two_nt - nt];
+
+ /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+ ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD optimization of this loop is possible using a look-up table */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+ }
+
+ UWORD8 *ref_main_tmp1 = ref_main;
+ UWORD8 *ref_main_tmp2 = ref_main;
+
+ ref_main_tmp2 += 1;
+
+ if(0 == (nt & 7))
+ {
+        /* For angles other than 45 degrees, interpolate between 2 neighboring */
+        /* samples, weighted by distance, to obtain the destination sample */
+ for(col = 0; col < nt; col++)
+ {
+
+ fract_prev = fract;
+ pos = ((col + 1) * intra_pred_ang);
+ fract = pos & (31);
+
+ if(fract_prev < fract)
+ {
+ ref_main_tmp1 -= 1;
+ ref_main_tmp2 -= 1;
+ }
+
+ dup_const_fract = vdup_n_u8((uint8_t)fract);
+ dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ // Do linear filtering
+ for(row = nt; row > 0; row -= 8)
+ {
+ ref_main_idx = vld1_u8(ref_main_tmp1);
+
+ ref_main_idx_1 = vld1_u8(ref_main_tmp2);
+
+ mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+ mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
+ pu1_dst_tmp1 += dst_strd;
+
+ ref_main_tmp1 += 8;
+ ref_main_tmp2 += 8;
+ }
+
+ ref_main_tmp1 -= nt;
+ ref_main_tmp2 -= nt;
+
+ pu1_dst_tmp2 += 1;
+ pu1_dst_tmp1 = pu1_dst_tmp2;
+ }
+ }
+ else
+ {
+ uint32x2_t ref_main_idx1, ref_main_idx2;
+
+ ref_main_idx1 = vdup_n_u32(0);
+ ref_main_idx2 = vdup_n_u32(0);
+
+ for(col = 0; col < nt; col++)
+ {
+ fract_prev = fract;
+ pos = ((col + 1) * intra_pred_ang);
+ fract = pos & (31);
+
+ if(fract_prev < fract)
+ {
+ ref_main_tmp1 -= 1;
+ ref_main_tmp2 -= 1;
+ }
+
+ dup_const_fract = vdup_n_u8((uint8_t)fract);
+ dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ for(row = nt; row > 0; row -= 4)
+ {
+
+ ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
+ ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
+
+ mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
+ mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
+ pu1_dst_tmp1 += dst_strd;
+
+ vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
+ pu1_dst_tmp1 += dst_strd;
+
+ }
+
+ pu1_dst_tmp2 += 1;
+ pu1_dst_tmp1 = pu1_dst_tmp2;
+
+ }
+
+ }
+}
+
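+/* Editorial note (worked example, assuming gai4_ihevc_inv_ang_table carries
+ * the HM invAngle values, e.g. 4096 for mode 11 / angle -2): the side-to-main
+ * projection loop in the function above starts from inv_ang_sum = 128, so for
+ * mode 11 the first extended sample is
+ * ref_main[-1] = pu1_ref[two_nt + ((128 + 4096) >> 8)] = pu1_ref[two_nt + 16],
+ * i.e. the 16th sample along the top reference. */
+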
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Intra prediction interpolation filter for luma mode 19 to mode 25
+ *
+ * @par Description:
+ * Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
+ * reference neighboring samples location pointed by 'pu1_ref' to the TU
+ * block location pointed by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] nt
+ * integer Transform Block size
+ *
+ * @param[in] mode
+ * integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col, k;
+ WORD32 two_nt, intra_pred_ang;
+    WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
+ WORD32 ref_idx;
+ UWORD8 *ref_main;
+ UWORD8 *ref_main_tmp;
+ UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
+
+ UWORD8 *pu1_ref_tmp1 = pu1_ref;
+ UWORD8 *pu1_ref_tmp2 = pu1_ref;
+ UWORD8 *pu1_dst_tmp1 = pu1_dst;
+
+ uint16x8_t mul_res1, mul_res2, add_res;
+ uint8x8_t dup_const_fract, dup_const_32_fract;
+ uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
+ uint8x8_t ref_above_t;
+ uint32x2_t ref_above_tmp;
+ UNUSED(src_strd);
+ ref_above_tmp = vdup_n_u32(0);
+
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+ /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+ pu1_ref_tmp1 += two_nt;
+
+
+ ref_main = ref_temp + (nt - 1);
+ ref_main_tmp = ref_main;
+
+ if(0 == (nt & 7))
+ {
+ pu1_ref_tmp2 += (two_nt - 7);
+ for(k = nt - 1; k >= 0; k -= 8)
+ {
+
+ ref_above_t = vld1_u8(pu1_ref_tmp1);
+ vst1_u8(ref_main_tmp, ref_above_t);
+ ref_main_tmp += 8;
+ pu1_ref_tmp1 += 8;
+
+ }
+
+ }
+ else
+ {
+ pu1_ref_tmp2 += (two_nt - (nt - 1));
+
+ for(k = nt - 1; k >= 0; k -= 4)
+ {
+
+ ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
+ vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
+
+ }
+
+ }
+
+ ref_main[nt] = pu1_ref[two_nt + nt];
+
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ inv_ang_sum = 128;
+
+    /* SIMD optimization of this loop is possible using a look-up table */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+ }
+
+ UWORD8 *ref_main_tmp1 = ref_main;
+ UWORD8 *ref_main_tmp2 = ref_main;
+
+ ref_main_tmp2 += 1;
+
+ if(0 == (nt & 7))
+ {
+        /* For angles other than 45 degrees, interpolate between 2 neighboring */
+        /* samples, weighted by distance, to obtain the destination sample */
+ for(row = 0; row < nt; row++)
+ {
+
+ fract_prev = fract;
+ pos = ((row + 1) * intra_pred_ang);
+ fract = pos & (31);
+
+ if(fract_prev < fract)
+ {
+ ref_main_tmp1 -= 1;
+ ref_main_tmp2 -= 1;
+ }
+
+ dup_const_fract = vdup_n_u8((uint8_t)fract);
+ dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ // Do linear filtering
+ for(col = nt; col > 0; col -= 8)
+ {
+ ref_main_idx = vld1_u8(ref_main_tmp1);
+
+ ref_main_idx_1 = vld1_u8(ref_main_tmp2);
+
+ mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+ mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_u8(pu1_dst_tmp1, shift_res);
+ pu1_dst_tmp1 += 8;
+
+ ref_main_tmp1 += 8;
+ ref_main_tmp2 += 8;
+ }
+
+ ref_main_tmp1 -= nt;
+ ref_main_tmp2 -= nt;
+
+ pu1_dst_tmp1 += (dst_strd - nt);
+ }
+ }
+ else
+ {
+ uint32x2_t ref_main_idx1, ref_main_idx2;
+
+ ref_main_idx1 = vdup_n_u32(0);
+ ref_main_idx2 = vdup_n_u32(0);
+
+ for(row = 0; row < nt; row++)
+ {
+ fract_prev = fract;
+ pos = ((row + 1) * intra_pred_ang);
+ fract = pos & (31);
+
+ if(fract_prev < fract)
+ {
+ ref_main_tmp1 -= 1;
+ ref_main_tmp2 -= 1;
+ }
+
+ dup_const_fract = vdup_n_u8((uint8_t)fract);
+ dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ for(col = nt; col > 0; col -= 4)
+ {
+
+ ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
+ ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
+
+ mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
+ mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
+ pu1_dst_tmp1 += 4;
+
+ }
+ pu1_dst_tmp1 += (dst_strd - nt);
+ }
+
+ }
+
+}
+
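+/* Editorial note: the projection in the function above is the mirror image of
+ * the one in modes 11 to 17 -- here the main reference is the top row, so the
+ * extension indexes backwards into the left column via
+ * ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)]. Assuming the HM invAngle
+ * value 4096 for mode 25 (angle -2), ref_main[-1] = pu1_ref[two_nt - 16]. */
+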
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Intra prediction interpolation filter for luma mode 27 to mode 33
+ *
+ * @par Description:
+ * Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
+ * reference neighboring samples location pointed by 'pu1_ref' to the TU
+ * block location pointed by 'pu1_dst'
+ *
+ * @param[in] pu1_ref
+ * UWORD8 pointer to the source
+ *
+ * @param[out] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] src_strd
+ * integer source stride
+ *
+ * @param[in] dst_strd
+ * integer destination stride
+ *
+ * @param[in] nt
+ * integer Transform Block size
+ *
+ * @param[in] mode
+ * integer intraprediction mode
+ *
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 intra_pred_ang;
+ WORD32 pos, fract = 0, fract_prev;
+
+ WORD32 two_nt = 2 * nt;
+ UNUSED(src_strd);
+ if(0 == (nt & 7))
+ {
+
+ UWORD8 *pu1_ref_main_idx = pu1_ref;
+ UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
+
+ UWORD8 *pu1_dst_tmp1 = pu1_dst;
+ pu1_ref_main_idx += (two_nt + 1);
+ pu1_ref_main_idx_1 += (two_nt + 2);
+
+ uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
+ uint8x8_t shift_res;
+ uint16x8_t mul_res1, mul_res2, add_res;
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ for(row = 0; row < nt; row++)
+ {
+ fract_prev = fract;
+
+ pos = ((row + 1) * intra_pred_ang);
+ fract = pos & (31);
+
+ if(fract_prev > fract)
+ {
+ pu1_ref_main_idx += 1;
+ pu1_ref_main_idx_1 += 1;
+ }
+
+ dup_const_fract = vdup_n_u8((uint8_t)fract);
+ dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ for(col = nt; col > 0; col -= 8)
+ {
+ ref_main_idx = vld1_u8(pu1_ref_main_idx);
+ ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
+
+ mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
+ mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_u8(pu1_dst_tmp1, shift_res);
+ pu1_dst_tmp1 += 8;
+
+ pu1_ref_main_idx += 8;
+ pu1_ref_main_idx_1 += 8;
+ }
+
+ pu1_ref_main_idx -= nt;
+ pu1_ref_main_idx_1 -= nt;
+
+ pu1_dst_tmp1 += (dst_strd - nt);
+ }
+
+ }
+ else
+ {
+ UWORD8 *pu1_ref_tmp1 = pu1_ref;
+ UWORD8 *pu1_ref_tmp2 = pu1_ref;
+ UWORD8 *pu1_dst_tmp1 = pu1_dst;
+
+        pu1_ref_tmp1 += (two_nt + 1);
+        pu1_ref_tmp2 += (two_nt + 2);
+
+ uint8x8_t dup_fract, dup_32_fract, shift_res;
+ uint16x8_t mul_res1, mul_res2, add_res;
+ uint32x2_t pu1_ref_val1, pu1_ref_val2;
+
+ pu1_ref_val1 = vdup_n_u32(0);
+ pu1_ref_val2 = vdup_n_u32(0);
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ for(row = 0; row < nt; row++)
+ {
+ fract_prev = fract;
+ pos = ((row + 1) * intra_pred_ang);
+ fract = pos & (31);
+ if(fract_prev > fract)
+ {
+ pu1_ref_tmp1 += 1;
+ pu1_ref_tmp2 += 1;
+ }
+ dup_fract = vdup_n_u8((uint8_t)fract);
+ dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
+
+ for(col = nt; col > 0; col -= 4)
+ {
+ pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
+ pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
+
+ mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
+ mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
+
+ add_res = vaddq_u16(mul_res1, mul_res2);
+
+ shift_res = vrshrn_n_u16(add_res, 5);
+
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
+ pu1_dst_tmp1 += 4;
+
+ }
+
+ pu1_dst_tmp1 += (dst_strd - nt);
+
+ }
+
+
+ }
+
+}
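+
+/* Editorial sketch, not decoder code: the scalar behaviour of the
+ * positive-angle vertical kernel above. The intrinsics advance
+ * pu1_ref_main_idx by one whenever fract wraps around, which is exactly when
+ * idx below increments (all angles here are below 32). */
+static void intra_mode_27_33_sketch(const UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                    WORD32 dst_strd, WORD32 nt,
+                                    WORD32 intra_pred_ang)
+{
+    WORD32 row, col, two_nt = 2 * nt;
+    for(row = 0; row < nt; row++)
+    {
+        WORD32 pos = (row + 1) * intra_pred_ang;
+        WORD32 idx = pos >> 5, fract = pos & 31;
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = (UWORD8)(
+                ((32 - fract) * pu1_ref[two_nt + 1 + col + idx]
+                 + fract * pu1_ref[two_nt + 2 + col + idx] + 16) >> 5);
+    }
+}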
diff --git a/common/arm/ihevc_intra_pred_luma_dc.s b/common/arm/ihevc_intra_pred_luma_dc.s
new file mode 100644
index 0000000..f380d94
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_dc.s
@@ -0,0 +1,508 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_dc.s
+@*
+@* @brief
+@* contains function definitions for intra prediction dc filtering.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@*
+@* @author
+@* akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intraprediction filter for dc input
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+@ pi1_coeff
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_dc_a9q
+
+.type ihevc_intra_pred_luma_dc_a9q, %function
+
+ihevc_intra_pred_luma_dc_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+
+@********** testing
+ @mov r6, #128
+ @b prologue_cpy_32
+@********** testing
+
+ mov r11, #2 @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
+ mov r9, #0
+ vmov d17, r11, r9
+
+ clz r5, r4
+
+ add r6, r0, r4 @&src[nt]
+ rsb r5, r5, #32 @log2nt
+ add r7, r0, r4, lsl #1 @&src[2nt]
+
+ add r8, r7, #1 @&src[2nt+1]
+ mvn r5, r5
+ add r5, r5, #1
+ vdup.32 d8, r5
+
+ ldrb r14, [r8]
+ vshl.i64 d8, d8, #32
+
+ sub r9, r7, #1 @&src[2nt-1]
+ vshr.s64 d8, d8, #32
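+@ editor's note: r5 = 32 - clz(nt) = log2(nt) + 1; the mvn/add pair negates
+@ it, and the vshl.i64/vshr.s64 pair sign-extends that 32-bit value to 64
+@ bits in d8, so the later vshl.s64 d9, d6, d8 (negative shift count) is an
+@ arithmetic right shift by log2(nt) + 1, i.e. the divide-by-2*nt of the
+@ accumulated dc sum.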
+
+ mov r7, r8 @r7 also stores 2nt+1
+
+ ldrb r12, [r9]
+ add r14, r14, r12 @src[2nt+1] + src[2nt-1]
+ add r14, r14, r11 @src[2nt+1] + src[2nt-1] + 2
+
+ cmp r4, #4
+ beq dc_4
+
+ mov r10, r4 @nt
+
+add_loop:
+ vld1.s8 d0, [r6]! @load from src[nt]
+ mov r5, #0 @
+ vld1.s8 d1, [r8]! @load from src[2nt+1]
+
+ vpaddl.u8 d2, d0
+
+ vmov d6, r4, r5 @store nt to accumulate
+ vpaddl.u8 d3, d1
+
+ vld1.s8 d0, [r6]! @load from src[nt] (extra load for 8)
+
+ vld1.s8 d1, [r8]! @load from src[2nt+1] (extra load for 8)
+ vadd.u16 d4, d2, d3
+
+
+ vpaddl.u16 d5, d4
+
+
+ vpadal.u32 d6, d5 @accumulate all inp into d6 (end for nt==8)
+
+ subs r10, #8
+ beq epil_add_loop
+
+core_loop_add:
+ vpaddl.u8 d2, d0
+ subs r10, #8
+ vpaddl.u8 d3, d1
+
+
+
+ vadd.u16 d4, d2, d3
+ vld1.s8 d0, [r6]! @load from src[nt] (extra load for 16)
+
+ vpaddl.u16 d5, d4
+ vld1.s8 d1, [r8]! @load from src[2nt+1] (extra load for 16)
+
+ vpadal.u32 d6, d5 @accumulate all inp into d6
+ bne core_loop_add
+
+epil_add_loop:
+
+ vshl.s64 d9, d6, d8 @(dc_val) shr by log2nt+1
+ cmp r4, #32
+
+ vmov d28, r14, r5 @src[2nt+1]+2+src[2nt-1] moved to d28
+ moveq r6, #128
+
+ vdup.8 d16, d9[0] @dc_val
+ vshl.s64 d13, d9, #1 @2*dc
+
+ beq prologue_cpy_32
+
+ vadd.i64 d14, d13, d28 @src[2nt+1]+2+src[2nt-1]+2dc_val
+ movne r6, #0 @nt
+
+ vshr.u16 d15, d14, #2 @final dst[0]'s value in d15[0]
+ movne r10, r4
+
+ vadd.i64 d11, d13, d9 @3*dc
+ sub r12, r3, r3, lsl #3 @-7*strd
+
+ vadd.i64 d11, d11, d17 @3*dc + 2
+ add r12, r12, #8 @offset after one 8x8 block (-7*strd + 8)
+
+ vdup.16 q12, d11[0] @3*dc + 2 (moved to all lanes)
+ sub r0, r3, r4 @strd - nt
+
+prologue_col:
+ @0th column and 0-7 rows done here
+ @r8 and r9 (2nt+1+col 2nt-1-row)
+
+ mov r8, r7 @&src[2nt+1]
+
+ add r0, r0, #8 @strd - nt + 8
+ vld1.s8 d0, [r8]! @col 1::7 load (prol)
+ sub r9, r9, #7 @&src[2nt-1-row]
+
+ vld1.s8 d1, [r9] @row 7::1 (0 also) load (prol)
+ sub r9, r9, #8
+
+ vmovl.u8 q10, d0
+
+ vld1.s8 d6, [r8] @col 8::15 load (prol extra)
+ vadd.i16 q10, q10, q12 @col 1::7 add 3dc+2 (prol)
+
+ vmovl.u8 q11, d1
+ vqshrun.s16 d2, q10, #2 @columns shr2 movn (prol)
+
+ vmovl.u8 q13, d6
+ vadd.i16 q11, q11, q12 @row 1::7 add 3dc+2 (prol)
+
+ vmov.i64 d19, #0x00000000000000ff @
+ vqshrun.s16 d3, q11, #2 @rows shr2 movn (prol)
+
+ vbsl d19, d15, d2 @first row with dst[0]
+ vadd.i16 q13, q13, q12 @col 8::15 add 3dc+2 (prol extra)
+
+ vrev64.8 d3, d3
+
+ vst1.8 d19, [r2], r3 @store row 0 (prol)
+ vshr.s64 d3, d3, #8 @row 0 shift (prol) (first value to be ignored)
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 1 (prol)
+
+loop_again_col_row:
+
+ vbsl d20, d3, d16 @row 1 (prol)
+
+ vmov.i64 d21, #0x00000000000000ff @byte mask row 2 (prol)
+ vshr.s64 d3, d3, #8 @row 1 shift (prol)
+
+ vst1.8 d20, [r2], r3 @store row 1 (prol)
+ vqshrun.s16 d4, q13, #2 @columns shr2 movn (prol extra)
+
+
+ vbsl d21, d3, d16 @row 2 (prol)
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 3 (prol)
+ vshr.s64 d3, d3, #8 @row 2 shift (prol)
+
+ vst1.8 d21, [r2], r3 @store row 2 (prol)
+
+
+ vbsl d20, d3, d16 @row 3 (prol)
+
+ vmov.i64 d21, #0x00000000000000ff @byte mask row 4 (prol)
+ vshr.s64 d3, d3, #8 @row 3 shift (prol)
+
+ vst1.8 d20, [r2], r3 @store row 3 (prol)
+
+
+ vbsl d21, d3, d16 @row 4 (prol)
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 5 (prol)
+ vshr.s64 d3, d3, #8 @row 4 shift (prol)
+
+ vst1.8 d21, [r2], r3 @store row 4 (prol)
+
+
+ vbsl d20, d3, d16 @row 5 (prol)
+
+ vmov.i64 d21, #0x00000000000000ff @byte mask row 6 (prol)
+ vshr.s64 d3, d3, #8 @row 5 shift (prol)
+
+ vst1.8 d20, [r2], r3 @store row 5 (prol)
+
+ vld1.s8 d1, [r9] @row 8::15 load (prol extra)
+
+ vbsl d21, d3, d16 @row 6 (prol)
+
+ vmovl.u8 q11, d1
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 7 (prol)
+ vshr.s64 d3, d3, #8 @row 6 shift (prol)
+
+ vst1.8 d21, [r2], r3 @store row 6 (prol)
+
+ vbsl d20, d3, d16 @row 7 (prol)
+ vadd.i16 q11, q11, q12 @row 8::15 add 3dc+2 (prol extra)
+
+ vshr.s64 d3, d3, #8 @row 7 shift (prol)
+ vst1.8 d20, [r2], r12 @store row 7 (prol)
+
+ subs r10, r10, #8 @counter for cols
+
+ beq end_func
+ blt copy_16
+
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 9 (prol)
+ vqshrun.s16 d3, q11, #2 @rows shr2 movn (prol)
+
+ vrev64.8 d3, d3
+
+ vst1.8 d4, [r2], r3 @store 2nd col (for 16x16)
+
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r0 @go to next row for 16
+
+
+ vbsl d20, d3, d16 @row 9 (prol)
+ subs r10, r10, #8
+
+ vst1.8 d20, [r2], r3 @store row 9 (prol)
+ vshr.s64 d3, d3, #8 @row 9 shift (prol)
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 9 (prol)
+
+ b loop_again_col_row
+
+
+copy_16:
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2], r3
+ vst1.8 d16, [r2]
+
+ b end_func
+
+prologue_cpy_32:
+ mov r9, #128
+ @sub r7, r3, #-24
+ add r5, r2, r3
+ add r8, r5, r3
+ add r10, r8, r3
+ vdup.8 q10, d16[0]
+ lsl r6, r3, #2
+ add r6, r6, #0xfffffff0
+
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d20,d21}, [r2], r6
+ vst1.8 {d20,d21}, [r5], r6
+ vst1.8 {d20,d21}, [r8], r6
+ vst1.8 {d20,d21}, [r10], r6
+
+ sub r9, r9, #32 @32x32 prol/epil counter dec
+
+kernel_copy:
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d20,d21}, [r2], r6
+ vst1.8 {d20,d21}, [r5], r6
+ vst1.8 {d20,d21}, [r8], r6
+ vst1.8 {d20,d21}, [r10], r6
+
+ subs r9, r9, #32
+
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d20,d21}, [r2], r6
+ vst1.8 {d20,d21}, [r5], r6
+ vst1.8 {d20,d21}, [r8], r6
+ vst1.8 {d20,d21}, [r10], r6
+
+ bne kernel_copy
+
+epilogue_copy:
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d20,d21}, [r2]
+ vst1.8 {d20,d21}, [r5]
+ vst1.8 {d20,d21}, [r8]
+ vst1.8 {d20,d21}, [r10]
+
+ b end_func
+
+
+dc_4:
+ vld1.s8 d0, [r6]! @load from src[nt]
+ vld1.s8 d1, [r8]! @load from src[2nt+1]
+
+ vpaddl.u8 d2, d0
+ mov r5, #0 @
+ vmov d6, r4, r5 @store nt to accumulate
+ vpaddl.u8 d3, d1
+
+ vadd.u16 d4, d2, d3
+
+
+ vpaddl.u16 d5, d4
+ vmov.i64 d30, #0x00000000ffffffff
+
+ vand d5, d5, d30
+
+ vmov d28, r14, r5 @src[2nt+1]+2+src[2nt-1] moved to d28
+ vadd.i64 d6, d6, d5 @accumulate all inp into d6 (end for nt==8)
+
+ vshl.s64 d9, d6, d8 @(dc_val) shr by log2nt+1
+ mov r8, r7 @&src[2nt+1]
+
+ vshl.s64 d13, d9, #1 @2*dc
+ sub r9, r9, #3 @&src[2nt-1-row]
+
+ vdup.8 d16, d9[0] @dc_val
+ vadd.i64 d14, d13, d28 @src[2nt+1]+2+src[2nt-1]+2dc_val
+
+ vshr.u16 d15, d14, #2 @final dst[0]'s value in d15[0]
+ sub r12, r3, r3, lsl #2 @-3*strd
+ vadd.i64 d11, d13, d9 @3*dc
+
+ vadd.i64 d11, d11, d17 @3*dc + 2
+ add r12, r12, #4 @offset after one 4x4 block (-3*strd + 4)
+
+ vdup.16 q12, d11[0] @3*dc + 2 (moved to all lanes)
+ sub r0, r3, r4 @strd - nt
+
+
+ vld1.s8 d0, [r8] @col 1::3 load (prol)
+ vld1.s8 d1, [r9] @row 3::1 (0 also) load (prol)
+
+ vmovl.u8 q10, d0
+
+ vmovl.u8 q11, d1
+ vadd.i16 q10, q10, q12 @col 1::7 add 3dc+2 (prol)
+
+ vadd.i16 q11, q11, q12 @row 1::7 add 3dc+2 (prol)
+
+ vmov.i64 d19, #0x00000000000000ff @
+ vqshrun.s16 d2, q10, #2 @columns shr2 movn (prol)
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 1 (prol)
+ vqshrun.s16 d3, q11, #2 @rows shr2 movn (prol)
+
+
+ vbsl d19, d15, d2 @first row with dst[0]
+
+ vrev64.8 d3, d3
+
+ vst1.32 d19[0], [r2], r3 @store row 0 (prol)
+ vshr.s64 d3, d3, #40 @row 0 shift (prol) (first value to be ignored)
+
+ vmov.i64 d21, #0x00000000000000ff @byte mask row 2 (prol)
+
+ vbsl d20, d3, d16 @row 1 (prol)
+ vshr.s64 d3, d3, #8 @row 1 shift (prol)
+
+ vst1.32 d20[0], [r2], r3 @store row 1 (prol)
+
+ vbsl d21, d3, d16 @row 2 (prol)
+
+ vmov.i64 d20, #0x00000000000000ff @byte mask row 3 (prol)
+
+ vshr.s64 d3, d3, #8 @row 2 shift (prol)
+ vst1.32 d21[0], [r2], r3 @store row 2 (prol)
+
+ vbsl d20, d3, d16 @row 3 (prol)
+ vst1.32 d20[0], [r2] @store row 3 (prol)
+
+epilogue_end:
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_horz.s b/common/arm/ihevc_intra_pred_luma_horz.s
new file mode 100644
index 0000000..581b673
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_horz.s
@@ -0,0 +1,339 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_horz.s
+@*
+@* @brief
+@* contains function definition for intra prediction interpolation filters
+@*
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_intra_pred_luma_horz()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* intra prediction interpolation filter for the horizontal luma mode.
+@*
+@* @par description:
+@* horizontal intraprediction (mode 10) with reference samples, location
+@* pointed by 'pu1_ref', to the tu block location pointed by 'pu1_dst'; refer
+@* to section 8.4.4.2.6 in the standard (special case)
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* integer transform block size
+@*
+@* @param[in] mode
+@* integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
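+@
+@ a rough per-pixel reference for this routine (a sketch inferred from the
+@ stores below, not the normative text; CLIP_U8 clamps to [0,255]):
+@     for(row = 0; row < nt; row++)
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
+@     if(nt <= 16)  /* gradient filter on the first row only */
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[col] = CLIP_U8(pu1_ref[two_nt - 1]
+@                     + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1));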
+@void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_horz_a9q
+
+.type ihevc_intra_pred_luma_horz_a9q, %function
+
+ihevc_intra_pred_luma_horz_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ @ldr r5,[sp,#44] @loads mode
+
+ lsl r6,r4,#1 @two_nt
+
+ add r12,r0,r6 @*pu1_ref[two_nt]
+ cmp r4,#4 @if nt == 4
+ beq core_loop_4
+
+ cmp r4,#8 @if nt == 8
+ beq core_loop_8
+
+ cmp r4,#16 @if nt == 16
+ beq core_loop_16
+ sub r12,r12,#16 @move to 16th value pointer
+ add r9,r2,#16
+
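+@ 32x32 path: each left-neighbour byte is duplicated across a q register and
+@ stored twice per row (columns 0-15 through r2, columns 16-31 through r9),
+@ sixteen rows per pass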
+core_loop_32:
+ vld1.8 {q0},[r12] @load 16 values. d1[7] will have the 1st value.
+
+ vdup.8 q1,d1[7] @duplicate the i value.
+
+ vdup.8 q2,d1[6] @duplicate the ii value.
+ vdup.8 q3,d1[5] @duplicate the iii value.
+ vst1.8 {q1},[r2],r3 @store in 1st row 0-16 columns
+ vst1.8 {q1},[r9],r3 @store in 1st row 16-32 columns
+
+ vdup.8 q4,d1[4]
+ vst1.8 {q2},[r2],r3
+ vst1.8 {q2},[r9],r3
+
+ vdup.8 q1,d1[3]
+ vst1.8 {q3},[r2],r3
+ vst1.8 {q3},[r9],r3
+
+ vdup.8 q2,d1[2]
+ vst1.8 {q4},[r2],r3
+ vst1.8 {q4},[r9],r3
+
+ vdup.8 q3,d1[1]
+ vst1.8 {q1},[r2],r3
+ vst1.8 {q1},[r9],r3
+
+ vdup.8 q4,d1[0]
+ vst1.8 {q2},[r2],r3
+ vst1.8 {q2},[r9],r3
+
+ vdup.8 q1,d0[7]
+ vst1.8 {q3},[r2],r3
+ vst1.8 {q3},[r9],r3
+
+ vdup.8 q2,d0[6]
+ vst1.8 {q4},[r2],r3
+ vst1.8 {q4},[r9],r3
+
+ vdup.8 q3,d0[5]
+ vst1.8 {q1},[r2],r3
+ vst1.8 {q1},[r9],r3
+
+ vdup.8 q4,d0[4]
+ vst1.8 {q2},[r2],r3
+ vst1.8 {q2},[r9],r3
+
+ vdup.8 q1,d0[3]
+ vst1.8 {q3},[r2],r3
+ vst1.8 {q3},[r9],r3
+
+ vdup.8 q2,d0[2]
+ vst1.8 {q4},[r2],r3
+ vst1.8 {q4},[r9],r3
+
+ vdup.8 q3,d0[1]
+ vst1.8 {q1},[r2],r3
+ vst1.8 {q1},[r9],r3
+ sub r12,r12,#16 @move to 16th value pointer
+
+ vdup.8 q4,d0[0]
+ vst1.8 {q2},[r2],r3
+ vst1.8 {q2},[r9],r3
+
+ subs r4,r4,#16 @decrement the loop count by 16
+ vst1.8 {q3},[r2],r3
+ vst1.8 {q3},[r9],r3
+
+ vst1.8 {q4},[r2],r3
+ vst1.8 {q4},[r9],r3
+ bgt core_loop_32
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+ b end_func
+
+core_loop_16:
+ ldrb lr,[r12],#1 @pu1_ref[two_nt]
+ vld1.8 {q15},[r12] @pu1_ref[two_nt + 1 + col]
+
+ vdup.8 d28,lr
+ sub r12,r12,#17
+ vld1.8 {q0},[r12]
+ vdup.8 d26,d1[7]
+ vmovl.u8 q13,d26
+
+ vdup.8 q1,d1[6]
+ vsubl.u8 q12,d30,d28
+
+ vdup.8 q2,d1[5]
+ vshr.s16 q12,q12,#1
+
+ vdup.8 q3,d1[4]
+ vqadd.s16 q11,q13,q12
+
+ vdup.8 q4,d1[3]
+ vqmovun.s16 d22,q11
+
+ vst1.8 {d22},[r2]!
+
+ vdup.8 q5,d1[2]
+ vsubl.u8 q12,d31,d28
+
+ vdup.8 q6,d1[1]
+ vshr.s16 q12,q12,#1
+
+ vdup.8 q7,d1[0]
+ vqadd.s16 q11,q13,q12
+
+ vdup.8 q8,d0[7]
+ vqmovun.s16 d22,q11
+
+ vst1.8 {d22},[r2],r3
+ sub r2,r2,#8
+
+ vst1.8 {q1},[r2],r3
+
+ vst1.8 {q2},[r2],r3
+ vst1.8 {q3},[r2],r3
+ vst1.8 {q4},[r2],r3
+
+ vdup.8 q1,d0[6]
+ vst1.8 {q5},[r2],r3
+
+ vdup.8 q2,d0[5]
+ vst1.8 {q6},[r2],r3
+
+ vdup.8 q3,d0[4]
+ vst1.8 {q7},[r2],r3
+
+ vdup.8 q4,d0[3]
+ vst1.8 {q8},[r2],r3
+
+ vdup.8 q5,d0[2]
+ vst1.8 {q1},[r2],r3
+
+ vdup.8 q6,d0[1]
+ vst1.8 {q2},[r2],r3
+
+ vdup.8 q7,d0[0]
+ vst1.8 {q3},[r2],r3
+
+ vst1.8 {q4},[r2],r3
+ vst1.8 {q5},[r2],r3
+ vst1.8 {q6},[r2],r3
+ vst1.8 {q7},[r2],r3
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+ b end_func
+
+
+core_loop_8:
+ ldrb lr,[r12] @pu1_ref[two_nt]
+ add r12,r12,#1 @pu1_ref[two_nt + 1]
+ vld1.8 {d30},[r12] @pu1_ref[two_nt + 1 + col]
+
+ sub r12,r12,#9
+ vld1.8 {d0},[r12]
+ vdup.8 d26,d0[7]
+ vdup.8 d28,lr
+
+ vdup.8 d3,d0[6]
+ vmovl.u8 q13,d26
+
+ vdup.8 d4,d0[5]
+ vsubl.u8 q12,d30,d28
+
+ vdup.8 d5,d0[4]
+ vshr.s16 q12,q12,#1
+
+ vdup.8 d6,d0[3]
+ vqadd.s16 q11,q13,q12
+
+ vdup.8 d7,d0[2]
+ vqmovun.s16 d22,q11
+
+ vst1.8 {d22},[r2],r3
+ vst1.8 {d3},[r2],r3
+
+ vdup.8 d8,d0[1]
+ vst1.8 {d4},[r2],r3
+ vst1.8 {d5},[r2],r3
+
+ vdup.8 d9,d0[0]
+ vst1.8 {d6},[r2],r3
+ vst1.8 {d7},[r2],r3
+
+ vst1.8 {d8},[r2],r3
+ vst1.8 {d9},[r2],r3
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+ b end_func
+
+
+core_loop_4:
+ ldrb lr,[r12] @pu1_ref[two_nt]
+ add r12,r12,#1 @pu1_ref[two_nt + 1]
+ vld1.8 {d30},[r12] @pu1_ref[two_nt + 1 + col]
+
+ sub r12,r12,#5
+ vld1.8 {d0},[r12]
+ vdup.8 d28,lr
+ vdup.8 d26,d0[3]
+ vmovl.u8 q13,d26
+
+ vdup.8 d3,d0[2]
+ vsubl.u8 q12,d30,d28
+
+ vdup.8 d4,d0[1]
+ vshr.s16 q12,q12,#1
+
+ vdup.8 d5,d0[0]
+ vqadd.s16 q11,q13,q12
+
+ vqmovun.s16 d22,q11
+
+ vst1.32 {d22[0]},[r2],r3
+ vst1.32 {d3[0]},[r2],r3
+ vst1.32 {d4[0]},[r2],r3
+ vst1.32 {d5[0]},[r2],r3
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+end_func:
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode2.s b/common/arm/ihevc_intra_pred_luma_mode2.s
new file mode 100644
index 0000000..cf7999b
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode2.s
@@ -0,0 +1,270 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_mode2.s
+@*
+@* @brief
+@* contains function definitions for intra prediction mode 2.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intra prediction for mode 2
+@*
+@* @par description:
+@* copies the left reference samples along the bottom-left 45 degree
+@* diagonal (mode 2, intra_pred_ang = 32); see the reference sketch below
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
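+@
+@ a rough per-pixel reference (a sketch inferred from the reversed loads and
+@ stores below; mode 2 is a pure copy along the bottom-left diagonal, its
+@ angle of 32 makes the fractional part zero for every row):
+@     for(row = 0; row < nt; row++)
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 2 - row - col];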
+
+@void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode2_a9q
+
+.type ihevc_intra_pred_luma_mode2_a9q, %function
+
+ihevc_intra_pred_luma_mode2_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ mov r8,#-2
+
+ cmp r4,#4
+ beq mode2_4
+
+ add r0,r0,r4,lsl #1
+
+ sub r0,r0,#9 @src[1]
+ add r10,r0,#-1
+
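+@ r0 and r10 sit one byte apart and each load steps back by two (r8 = -2),
+@ so d0-d7 pick up consecutive descending 8-byte runs of the left reference;
+@ vrev64 then flips each run into raster order so a straight 8-byte store
+@ emits one predicted row per register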
+prologue_cpy_32:
+
+ vld1.8 {d0},[r0],r8
+ mov r11,r4
+
+ vld1.8 {d1},[r10],r8
+ mov r6, r2
+
+ vld1.8 {d2},[r0],r8
+ vld1.8 {d3},[r10],r8
+ lsr r1, r4, #3
+
+ vld1.8 {d4},[r0],r8
+ vld1.8 {d5},[r10],r8
+ vld1.8 {d6},[r0],r8
+ mul r1, r4, r1
+
+ vld1.8 {d7},[r10],r8
+ add r7,r6,r3
+
+ vrev64.8 d8,d0
+ vrev64.8 d9,d1
+ lsl r5, r3, #2
+
+ vrev64.8 d10,d2
+ vrev64.8 d11,d3
+ add r9,r7,r3
+
+ vrev64.8 d12,d4
+ subs r1,r1,#8
+
+ vrev64.8 d13,d5
+ vrev64.8 d14,d6
+ vrev64.8 d15,d7
+ add r14,r9,r3
+
+ beq epilogue_mode2
+
+ sub r12,r4,#8
+
+kernel_mode2:
+
+ vst1.8 {d8},[r6],r5
+ vst1.8 {d9},[r7],r5
+ subs r11,r11,#8
+
+ vst1.8 {d10},[r9],r5
+ addgt r2,r2,#8
+
+ vst1.8 {d11},[r14],r5
+ vst1.8 {d12},[r6],r5
+ movle r11,r4
+
+ vst1.8 {d13},[r7],r5
+ vst1.8 {d14},[r9],r5
+ addle r2, r2, r3, lsl #2
+
+ vst1.8 {d15},[r14],r5
+ vld1.8 {d0},[r0],r8
+ sub r14,r4,#8
+
+ vld1.8 {d1},[r10],r8
+ vld1.8 {d2},[r0],r8
+ addle r2, r2, #8
+
+ vld1.8 {d3},[r10],r8
+ vld1.8 {d4},[r0],r8
+ suble r2, r6, r14
+
+ vld1.8 {d5},[r10],r8
+ subs r12,r12,#8
+
+ vld1.8 {d6},[r0],r8
+ mov r6, r2
+
+ vld1.8 {d7},[r10],r8
+ addle r0, r0, r4
+
+ vrev64.8 d8,d0
+ add r7, r6, r3
+
+ vrev64.8 d9,d1
+ suble r0, r0, #8
+
+ vrev64.8 d10,d2
+ movle r12,r4
+
+ vrev64.8 d11,d3
+ add r9, r7, r3
+
+ vrev64.8 d12,d4
+ add r10,r0,#-1
+
+ vrev64.8 d13,d5
+ subs r1, r1, #8
+
+ vrev64.8 d14,d6
+ add r14, r9, r3
+
+ vrev64.8 d15,d7
+
+ bne kernel_mode2
+
+epilogue_mode2:
+
+ vst1.8 {d8},[r6],r5
+ vst1.8 {d9},[r7],r5
+ vst1.8 {d10},[r9],r5
+ vst1.8 {d11},[r14],r5
+ vst1.8 {d12},[r6],r5
+ vst1.8 {d13},[r7],r5
+ vst1.8 {d14},[r9],r5
+ vst1.8 {d15},[r14],r5
+
+ b end_func
+
+mode2_4:
+
+ mov r8,#-2
+ sub r0,r0,#1
+ add r10,r0,#-1
+
+ vld1.8 {d0},[r0],r8
+ add r5,r2,r3
+ vld1.8 {d2},[r10],r8
+ add r6,r5,r3
+ vld1.8 {d4},[r0]
+ add r7,r6,r3
+ vld1.8 {d6},[r10]
+
+ vrev64.8 d1,d0
+ vrev64.8 d3,d2
+
+
+
+ vst1.32 {d1[0]},[r2]
+ vrev64.8 d5,d4
+ vst1.32 {d3[0]},[r5]
+ vrev64.8 d7,d6
+ vst1.32 {d5[0]},[r6]
+ vst1.32 {d7[0]},[r7]
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode_18_34.s b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
new file mode 100644
index 0000000..438c0f5
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode_18_34.s
@@ -0,0 +1,273 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_mode_18_34.s
+@*
+@* @brief
+@* contains function definitions for intra prediction modes 18 and 34.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* yogeswaran rs
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intra prediction for modes 18 and 34
+@*
+@* @par description:
+@* pure diagonal copy for modes 18 and 34 (angle +/-32, fract is always
+@* zero so no interpolation is needed); see the reference sketch below
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
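+@
+@ a rough per-pixel reference (a sketch inferred from the copy loops below;
+@ both modes use an angle of +/-32, so there is no interpolation):
+@     for(row = 0; row < nt; row++)
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] = (mode == 34)
+@                 ? pu1_ref[two_nt + 2 + row + col]
+@                 : pu1_ref[two_nt - row + col];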
+
+@void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_18_34_a9q
+
+.type ihevc_intra_pred_luma_mode_18_34_a9q, %function
+
+ihevc_intra_pred_luma_mode_18_34_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+
+ ldr r4,[sp,#40]
+ ldr r5,[sp,#44]
+
+ cmp r4,#4
+ beq mode2_4
+
+ mov r11,r4
+ mov r12,r4
+ sub r14,r4,#8
+
+ add r0,r0,r4,lsl #1
+
+    cmp r5,#0x22                @is mode == 34?
+    mov r10,r2
+
+    add r0,r0,#2                @assume mode 34: start at ref + two_nt + 2
+    subne r0,r0,#2              @mode 18: start at ref + two_nt
+    moveq r6,#1                 @step +1 along the reference for mode 34
+    movne r6,#-1                @step -1 for mode 18
+    mov r8,r0
+
+prologue_cpy_32:
+
+ vld1.8 {d0},[r8],r6
+ lsr r1, r4, #3
+ vld1.8 {d1},[r8],r6
+ mul r1, r4, r1
+ vld1.8 {d2},[r8],r6
+ vld1.8 {d3},[r8],r6
+ subs r1,r1,#8
+ vld1.8 {d4},[r8],r6
+ vld1.8 {d5},[r8],r6
+ vld1.8 {d6},[r8],r6
+
+ vld1.8 {d7},[r8],r6
+
+
+ beq epilogue_mode2
+ sub r11,r11,#8
+
+ cmp r5,#0x22
+ addne r0,r0,#8
+ movne r8,r0
+ bne kernel_mode18
+ @add r8,r0,#8
+
+kernel_mode2:
+ vst1.8 {d0},[r10],r3
+ vst1.8 {d1},[r10],r3
+ subs r12,r12,#8
+ vst1.8 {d2},[r10],r3
+ addne r2,r2,#8
+ vst1.8 {d3},[r10],r3
+
+ vld1.8 {d0},[r8],r6
+ vst1.8 {d4},[r10],r3
+
+ vst1.8 {d5},[r10],r3
+ vld1.8 {d1},[r8],r6
+ vst1.8 {d6},[r10],r3
+ vld1.8 {d2},[r8],r6
+ vst1.8 {d7},[r10],r3
+
+ vld1.8 {d3},[r8],r6
+ subeq r2,r10,r14
+ vld1.8 {d4},[r8],r6
+ mov r10,r2
+ vld1.8 {d5},[r8],r6
+ moveq r12,r4
+ vld1.8 {d6},[r8],r6
+ subs r11,r11,#8
+
+ vld1.8 {d7},[r8],r6
+
+ addeq r0,r0,#8
+ moveq r11,r4
+ moveq r8,r0
+
+ subs r1, r1, #8
+
+ bne kernel_mode2
+
+ b epilogue_mode2
+
+kernel_mode18:
+ vst1.8 {d0},[r10],r3
+ vst1.8 {d1},[r10],r3
+ subs r12,r12,#8
+ vst1.8 {d2},[r10],r3
+ addne r2,r2,#8
+ vst1.8 {d3},[r10],r3
+
+ vld1.8 {d0},[r8],r6
+ vst1.8 {d4},[r10],r3
+
+ vst1.8 {d5},[r10],r3
+ vld1.8 {d1},[r8],r6
+
+ vst1.8 {d6},[r10],r3
+ vld1.8 {d2},[r8],r6
+ vst1.8 {d7},[r10],r3
+
+ vld1.8 {d3},[r8],r6
+ subeq r2,r10,r14
+ vld1.8 {d4},[r8],r6
+ mov r10,r2
+ vld1.8 {d5},[r8],r6
+ moveq r12,r4
+ vld1.8 {d6},[r8],r6
+ subs r11,r11,#8
+ vld1.8 {d7},[r8],r6
+
+ addne r0,r0,#8
+ moveq r11,r4
+ subeq r0,r8,r14
+ subs r1, r1, #8
+ mov r8,r0
+
+ bne kernel_mode18
+
+
+epilogue_mode2:
+
+ vst1.8 {d0},[r10],r3
+ vst1.8 {d1},[r10],r3
+ vst1.8 {d2},[r10],r3
+ vst1.8 {d3},[r10],r3
+ vst1.8 {d4},[r10],r3
+ vst1.8 {d5},[r10],r3
+ vst1.8 {d6},[r10],r3
+ vst1.8 {d7},[r10],r3
+
+ b end_func
+
+mode2_4:
+
+ add r0,r0,#10
+ cmp r5,#0x22
+ subne r0,r0,#2
+
+ moveq r8,#1
+ movne r8,#-1
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0[0]},[r2],r3
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0[0]},[r2],r3
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0[0]},[r2],r3
+
+ vld1.8 {d0},[r0],r8
+ vst1.32 {d0[0]},[r2],r3
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
new file mode 100644
index 0000000..595d82a
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -0,0 +1,540 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_mode_27_to_33.s
+@*
+@* @brief
+@* contains function definitions for intra prediction interpolation filters
+@*
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_intra_pred_luma_mode_27_to_33()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* intra prediction interpolation filter for luma mode 27 to mode 33
+@*
+@* @par description:
+@* intra prediction for modes 27 to 33 (positive angle, vertical modes)
+@* from the reference neighboring samples pointed to by 'pu1_ref' into the
+@* tu block pointed to by 'pu1_dst'
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* integer transform block size
+@*
+@* @param[in] mode
+@* integer intraprediction mode
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
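+@
+@ per-pixel reference for the two-tap interpolation (a sketch; pos, idx and
+@ fract follow the names used in the comments below):
+@     for(row = 0; row < nt; row++)
+@     {
+@         pos   = (row + 1) * intra_pred_ang;
+@         idx   = pos >> 5;
+@         fract = pos & 31;
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] =
+@                 ((32 - fract) * pu1_ref[two_nt + 1 + idx + col]
+@                + fract * pu1_ref[two_nt + 2 + idx + col] + 16) >> 5;
+@     }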
+
+@void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
+@ word32 src_strd,
+@ uword8 *pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_27_to_33_a9q
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl2 - 8
+
+
+.type ihevc_intra_pred_luma_mode_27_to_33_a9q, %function
+
+ihevc_intra_pred_luma_mode_27_to_33_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r5,[sp,#44] @loads mode
+ ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
+ulbl1:
+ add r6,r6,pc
+
+ lsl r7,r4,#1 @two_nt
+
+ add r8,r6,r5,lsl #2 @*gai4_ihevc_ang_table[mode]
+ ldr r9,[r8] @intra_pred_ang = gai4_ihevc_ang_table[mode]
+ ldr r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
+ulbl2:
+ add r1,r1,pc
+ add r6,r1,#1
+
+ tst r4,#7
+ add r8,r0,r7 @pu1_ref + two_nt
+ mov lr,#0 @row
+ mov r12,r4
+ bne core_loop_4
+
+core_loop_8:
+ add r8,r8,#1 @pu1_ref_main_idx += (two_nt + 1)
+ vdup.8 d0,r9 @intra_pred_ang
+ mov r12,r4,lsr #3 @divide by 8
+
+ vmov.i8 d1,#32
+ mul r7,r4,r12
+
+ vmov.i16 q3,#31
+ @lsl r12,r3,#3
+
+ mov r1,r8
+ @sub r12,r12,r4
+ mov r5,r4
+ mov r11,#1
+
+prologue:
+ vld1.8 {d3},[r6] @loads the row value
+ vmull.u8 q1,d3,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.u16 d5,q1,#5 @idx = pos >> 5
+
+ vdup.8 d31,d4[0]
+ add r0,r2,r3
+
+ vmov.u32 lr,d5[0] @(i row)extract idx to the r register
+
+ vdup.8 d29,d4[1] @(ii)
+ and r9,lr,#0xff @(i row) get the last byte
+
+ add r10,r8,r9 @(i row)*pu1_ref[ref_main_idx]
+
+ asr lr,lr,#8 @(ii)shift by 8
+ vld1.8 {d8},[r10],r11 @(i row)ref_main_idx
+ and r9,lr,#0xff @(ii)get the last byte
+
+ asr lr,lr,#8 @(iii)
+ vld1.8 {d9},[r10] @(i row)ref_main_idx_1
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ and r9,lr,#0xff @(iii)
+ vsub.u8 d30,d1,d31 @32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr lr,lr,#8 @(iv)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ and r9,lr,#0xff @(iv)
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+
+ vdup.8 d31,d4[4] @(v)
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d5[1] @extract idx to the r register
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d10},[r2]! @(i row)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ and r9,lr,#0xff @(v)
+ vdup.8 d29,d4[5] @(vi)
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+
+ asr lr,lr,#8 @(vi)
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and r9,lr,#0xff @(vi)
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+ vdup.8 d27,d4[6] @(vii)
+ asr lr,lr,#8 @(vii)
+
+ and r9,lr,#0xff @(vii)
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ asr lr,lr,#8 @(viii)
+ vdup.8 d25,d4[7] @(viii)
+ and r9,lr,#0xff @(viii)
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs r4,r4,#8
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ addgt r8,r8,#8
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subgt r7,r7,#8
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmull.u8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ vmovn.i16 d4,q2
+ vshrn.u16 d3,q1,#5 @idx = pos >> 5
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ and r9,lr,#0xff @(i)
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
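+@ the kernel below is software pipelined eight rows deep: loads, multiplies
+@ and stores of adjacent iterations (tagged (i) to (viii)) are interleaved,
+@ so stages (vi)-(viii) of one pass complete at the top of the next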
+kernel_8_rows:
+ asr lr,lr,#8 @(ii)
+ vdup.8 d31,d4[0]
+ subs r4,r4,#8
+
+ vld1.8 {d8},[r10],r11 @(i)ref_main_idx
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+ and r9,lr,#0xff @(ii)
+ addle r6,r6,#8 @increment the row value
+
+ vld1.8 {d9},[r10] @(i)ref_main_idx_1
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(ii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d5},[r6] @loads the row value
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr lr,lr,#8 @(iii)
+
+ vdup.8 d29,d4[1] @(ii)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+ and r9,lr,#0xff @(iii)
+
+ vst1.8 {d14},[r0],r3 @(vi)
+ vsub.u8 d30,d1,d31 @(i)32-fract(dup_const_32_fract)
+ add r10,r8,r9 @(iii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(ii)ref_main_idx
+ vmull.u8 q5,d8,d30 @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ asr lr,lr,#8 @(iv)
+
+ vld1.8 {d13},[r12] @(ii)ref_main_idx_1
+ vmlal.u8 q5,d9,d31 @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and r9,lr,#0xff @(iv)
+
+ vmov.u32 lr,d3[1] @extract idx to the r register
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d27,d4[2] @(iii)
+ vsub.u8 d28,d1,d29 @(ii)32-fract(dup_const_32_fract)
+ movle r4,r5 @reload nt
+
+ vld1.8 {d16},[r10],r11 @(iii)ref_main_idx
+ vmull.u8 q7,d12,d28 @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r12,r8,r9 @(iv)*pu1_ref[ref_main_idx]
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vmlal.u8 q7,d13,d29 @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.8 {d17},[r10] @(iii)ref_main_idx_1
+ vrshrn.i16 d10,q5,#5 @(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vdup.8 d25,d4[3] @(iv)
+ vmull.u8 q1,d5,d0 @pos = ((row + 1) * intra_pred_ang)
+
+ vst1.8 {d22},[r0] @(viii)
+ vsub.u8 d26,d1,d27 @(iii)32-fract(dup_const_32_fract)
+
+ vld1.8 {d20},[r12],r11 @(iv)ref_main_idx
+ vmull.u8 q9,d16,d26 @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add r0,r2,r3
+
+ vld1.8 {d21},[r12] @(iv)ref_main_idx_1
+ vmlal.u8 q9,d17,d27 @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and r9,lr,#0xff @(v)
+
+ vdup.8 d31,d4[4] @(v)
+ vrshrn.i16 d14,q7,#5 @(ii)shift_res = vrshrn_n_u16(add_res, 5)
+ add r10,r8,r9 @(v)*pu1_ref[ref_main_idx]
+
+ vst1.8 {d10},[r2]! @(i)
+ vsub.u8 d24,d1,d25 @(iv)32-fract(dup_const_32_fract)
+ asr lr,lr,#8 @(vi)
+
+ vdup.8 d29,d4[5] @(vi)
+ vmull.u8 q11,d20,d24 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and r9,lr,#0xff @(vi)
+
+ vdup.8 d27,d4[6] @(vii)
+ vmlal.u8 q11,d21,d25 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add r12,r8,r9 @(vi)*pu1_ref[ref_main_idx]
+
+ vdup.8 d25,d4[7] @(viii)
+ vrshrn.i16 d18,q9,#5 @(iii)shift_res = vrshrn_n_u16(add_res, 5)
+ asr lr,lr,#8 @(vii)
+
+ vld1.8 {d8},[r10],r11 @(v)ref_main_idx
+ vand q2,q1,q3 @dup_const_fract(fract = pos & (31))
+ and r9,lr,#0xff @(vii)
+
+ vld1.8 {d9},[r10] @(v)ref_main_idx_1
+ vshrn.u16 d3,q1,#5 @idx = pos >> 5
+ asr lr,lr,#8 @(viii)
+
+ vst1.8 {d14},[r0],r3 @(ii)
+ vrshrn.i16 d22,q11,#5 @(iv)shift_res = vrshrn_n_u16(add_res, 5)
+ add r10,r8,r9 @(vii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d12},[r12],r11 @(vi)ref_main_idx
+ vsub.u8 d30,d1,d31 @(v)32-fract(dup_const_32_fract)
+ and r9,lr,#0xff @(viii)
+
+ vld1.8 {d13},[r12] @(vi)ref_main_idx_1
+ vmull.u8 q5,d8,d30 @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ vmov.u32 lr,d3[0] @(i)extract idx to the r register
+ vmlal.u8 q5,d9,d31 @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add r12,r8,r9 @(viii)*pu1_ref[ref_main_idx]
+
+ vld1.8 {d16},[r10],r11 @(vii)ref_main_idx
+ vsub.u8 d28,d1,d29 @(vi)32-fract(dup_const_32_fract)
+
+ vst1.8 {d18},[r0],r3 @(iii)
+ vmull.u8 q7,d12,d28 @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+ movle r8,r1 @reload the source to pu1_src+2nt
+
+ vld1.8 {d17},[r10] @(vii)ref_main_idx_1
+ vmlal.u8 q7,d13,d29 @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ addgt r8,r8,#8 @increment the source next set 8 columns in same row
+
+ vld1.8 {d20},[r12],r11 @(viii)ref_main_idx
+ vrshrn.i16 d10,q5,#5 @(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.8 {d21},[r12] @(viii)ref_main_idx_1
+ vsub.u8 d26,d1,d27 @(vii)32-fract(dup_const_32_fract)
+ lslle r12,r3,#3
+
+ vst1.8 {d22},[r0],r3 @(iv)
+ vmull.u8 q9,d16,d26 @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ suble r12,r12,r5
+
+ vst1.8 {d10},[r0],r3 @(v)
+ vmlal.u8 q9,d17,d27 @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ addle r2,r2,r12 @increment the dst pointer to 8*dst_strd - nt
+
+ vmovn.i16 d4,q2
+ vrshrn.i16 d14,q7,#5 @(vi)shift_res = vrshrn_n_u16(add_res, 5)
+ and r9,lr,#0xff @(i)
+
+ subs r7,r7,#8
+ add r10,r8,r9 @(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ vst1.8 {d14},[r0],r3 @(vi)
+ vrshrn.i16 d18,q9,#5 @(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vsub.u8 d24,d1,d25 @(viii)32-fract(dup_const_32_fract)
+ vmull.u8 q11,d20,d24 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d25 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.8 {d18},[r0],r3 @(vii)
+ vrshrn.i16 d22,q11,#5 @(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.8 {d22},[r0],r3 @(viii)
+ b end_loops
+
+core_loop_4:
+ add r10,r8,#1 @pu1_ref_main_idx += (two_nt + 1)
+ add r11,r8,#2 @pu1_ref_main_idx_1 += (two_nt + 2)
+ mov r8,#0
+
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#1 @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+ vdup.8 d0,r5 @dup_const_fract
+ rsb r4,r5,#32
+ vdup.8 d1,r4 @dup_const_32_fract
+
+@inner_loop_4
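+@ when fract_prev > fract the integer part idx has advanced by one, so the
+@ two reference pointers are bumped instead of recomputing ref_main_idx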
+ vld1.32 {d2[0]},[r10] @ref_main_idx
+ add r8,r8,#1
+ mov lr,r5 @fract_prev = fract
+
+ vld1.32 {d3[0]},[r11] @ref_main_idx_1
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#1 @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d6,r5 @dup_const_fract
+ vmull.u8 q2,d2,d1 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d7,r4 @dup_const_32_fract
+ vmlal.u8 q2,d3,d0 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.32 {d8[0]},[r10] @ref_main_idx
+ add r8,r8,#1
+
+ vld1.32 {d9[0]},[r11] @ref_main_idx_1
+ vrshrn.i16 d4,q2,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ mov lr,r5 @fract_prev = fract
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#1 @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d12,r5 @dup_const_fract
+ vmull.u8 q5,d8,d7 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d13,r4 @dup_const_32_fract
+ vmlal.u8 q5,d9,d6 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.32 {d14[0]},[r10] @ref_main_idx
+ add r8,r8,#1
+
+ vst1.32 {d4[0]},[r2],r3
+ vrshrn.i16 d10,q5,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ vld1.32 {d15[0]},[r11] @ref_main_idx_1
+ mov lr,r5 @fract_prev = fract
+ add r5,r8,#1 @row + 1
+ mul r5,r5,r9 @pos = ((row + 1) * intra_pred_ang)
+ and r5,r5,#31 @fract = pos & (31)
+ cmp lr,r5 @if(fract_prev > fract)
+ addgt r10,r10,#1 @pu1_ref_main_idx += 1
+ add r11,r10,#1 @pu1_ref_main_idx_1 += 1
+
+ vdup.8 d18,r5 @dup_const_fract
+ vmull.u8 q8,d14,d13 @vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ rsb r4,r5,#32
+ vdup.8 d19,r4 @dup_const_32_fract
+ vmlal.u8 q8,d15,d12 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vld1.32 {d20[0]},[r10] @ref_main_idx
+
+ vst1.32 {d10[0]},[r2],r3
+ vrshrn.i16 d16,q8,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+ vld1.32 {d21[0]},[r11] @ref_main_idx_1
+
+ vmull.u8 q11,d20,d19 @vmull_u8(ref_main_idx, dup_const_32_fract)
+ vmlal.u8 q11,d21,d18 @vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ vst1.32 {d16[0]},[r2],r3
+ vrshrn.i16 d22,q11,#5 @shift_res = vrshrn_n_u16(add_res, 5)
+
+ vst1.32 {d22[0]},[r2],r3
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
new file mode 100644
index 0000000..a8e93c8
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -0,0 +1,573 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_mode_3_to_9.s
+@*
+@* @brief
+@* contains function definitions for intra prediction modes 3 to 9.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intra prediction for modes 3 to 9
+@*
+@* @par description:
+@* angular prediction for the horizontal-family modes 3 to 9: two-tap
+@* interpolation along the left reference; see the reference sketch below
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
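+@
+@ per-pixel reference (a sketch, assumed; modes 3 to 9 are horizontal-family
+@ angles, so pos depends on the column and the two taps walk down the left
+@ reference, i.e. towards lower pu1_ref indices):
+@     for(col = 0; col < nt; col++)
+@     {
+@         pos   = (col + 1) * intra_pred_ang;
+@         idx   = pos >> 5;
+@         fract = pos & 31;
+@         for(row = 0; row < nt; row++)
+@             pu1_dst[row * dst_strd + col] =
+@                 ((32 - fract) * pu1_ref[two_nt - 1 - (row + idx)]
+@                + fract * pu1_ref[two_nt - 2 - (row + idx)] + 16) >> 5;
+@     }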
+
+@void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_mode_3_to_9_a9q
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_neg_idx_3_9
+
+gai4_ihevc_ang_table_addr:
+.long gai4_ihevc_ang_table - ulbl1 - 8
+
+gai4_ihevc_inv_ang_table_addr:
+.long gai4_ihevc_inv_ang_table - ulbl2 - 8
+
+idx_neg_idx_3_9_addr_1:
+.long idx_neg_idx_3_9 - ulbl3_1 - 8
+
+idx_neg_idx_3_9_addr_2:
+.long idx_neg_idx_3_9 - ulbl3_2 - 8
+
+col_for_intra_luma_addr_1:
+.long col_for_intra_luma - ulbl4_1 - 8
+
+col_for_intra_luma_addr_2:
+.long col_for_intra_luma - ulbl4_2 - 8
+
+col_for_intra_luma_addr_3:
+.long col_for_intra_luma - ulbl4_3 - 8
+
+.type ihevc_intra_pred_luma_mode_3_to_9_a9q, %function
+
+ihevc_intra_pred_luma_mode_3_to_9_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r7, gai4_ihevc_ang_table_addr
+ulbl1:
+ add r7,r7,pc
+
+ ldr r5,[sp,#44] @mode (3 to 9)
+ ldr r8, gai4_ihevc_inv_ang_table_addr
+ulbl2:
+ add r8,r8,pc
+
+ add r7, r7, r5, lsl #2 @gai4_ihevc_ang_table[mode]
+ ldr r7, [r7] @intra_pred_ang
+ vdup.8 d30, r7 @intra_pred_ang
+
+ ldr r14, col_for_intra_luma_addr_1
+ulbl4_1:
+ add r14,r14,pc
+ cmp r4, #4
+
+ beq sz_4_proc
+ b prologue_8_16_32
+
+prologue_8_16_32:
+ lsr r10, r4, #3
+ vld1.8 d31, [r14]!
+ mul r10, r4, r10 @block counter (dec by #8)
+
+ mov r11, r4 @col counter to be inc/dec by #8
+ vmull.s8 q11, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+
+ sub r7, r5, #3
+ vmov.i8 d2,#1 @contains #1 for adding to get ref_main_idx + 1
+ ldr r12, idx_neg_idx_3_9_addr_1 @load least idx table
+ulbl3_1:
+ add r12,r12,pc
+
+ vmov.i8 d3, #2
+
+ add r12, r12, r7, lsl #4
+ mov r8, r12
+
+ mov r7, #8
+ sub r7, r7, r3, lsl #3 @r7 = 8-8r3
+
+ ldr r9, [r8]
+    add r1, r0, r4, lsl #1      @pu1_ref + two_nt
+
+ vmovn.s16 d6, q11
+ vdup.8 d26, r9 @least idx added to final idx values
+    sub r1, r1, #9              @table base = pu1_ref + two_nt - (8 + 1), so the (two_nt - idx - row) lookups land in the 16-byte vtbl window
+
+ sub r6, r1, r9
+
+ vld1.8 {d0,d1}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+ vshr.s16 q11, q11, #5
+
+ vmov.i8 d29, #31 @contains #31 for vand operation
+
+ vmov.i8 d28, #32
+
+ vqmovn.s16 d8, q11
+
+ vand d6, d6, d29 @fract values in d1/ idx values in d0
+
+ mov r0, #1
+
+ vmov.i8 d27, #7 @row 0 to 7
+
+ vsub.s8 d8, d8, d2 @ref_main_idx (sub row)
+ vsub.s8 d8, d26, d8 @ref_main_idx (row 0)
+ vadd.s8 d8, d8, d27 @t0 compensate the pu1_src idx incremented by 8
+ vsub.s8 d9, d8, d2 @ref_main_idx + 1 (row 0)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 0)
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 0)
+ vsub.s8 d4, d8, d2 @ref_main_idx (row 1)
+ vsub.s8 d5, d9, d2 @ref_main_idx + 1 (row 1)
+
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 1)
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 1)
+ vsub.s8 d8, d8, d3 @ref_main_idx (row 2)
+ vsub.s8 d9, d9, d3 @ref_main_idx + 1 (row 2)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 2)
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 2)
+ vsub.s8 d4, d4, d3 @ref_main_idx (row 3)
+ vsub.s8 d5, d5, d3 @ref_main_idx + 1 (row 3)
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 3)
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 3)
+ vsub.s8 d8, d8, d3 @ref_main_idx (row 4)
+ vsub.s8 d9, d9, d3 @ref_main_idx + 1 (row 4)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 4)
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 4)
+ vsub.s8 d4, d4, d3 @ref_main_idx (row 5)
+ vsub.s8 d5, d5, d3 @ref_main_idx + 1 (row 5)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 5)
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 5)
+ vsub.s8 d8, d8, d3 @ref_main_idx (row 6)
+ vsub.s8 d9, d9, d3 @ref_main_idx + 1 (row 6)
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 6)
+ vmull.u8 q11, d16, d7 @mul (row 5)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 6)
+ vsub.s8 d4, d4, d3 @ref_main_idx (row 7)
+ vsub.s8 d5, d5, d3 @ref_main_idx + 1 (row 7)
+
+ vst1.8 d24, [r2], r3 @st (row 4)
+ vrshrn.i16 d22, q11, #5 @round shft (row 5)
+
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 7)
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 7)
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d22, [r2], r3 @st (row 5)
+ vrshrn.i16 d20, q10, #5 @round shft (row 6)
+ vrshrn.i16 d18, q9, #5 @round shft (row 7)
+
+ vst1.8 d20, [r2], r3 @st (row 6)
+
+ subs r10, r10, #8 @subtract 8 and go to end if 8x8
+
+ vst1.8 d18, [r2], r3 @st (row 7)
+
+ beq end_func
+
+ subs r11, r11, #8
+ addgt r8, r8, #4
+ addgt r2, r2, r7
+ movle r8, r12
+ suble r2, r2, r4
+ addle r2, r2, #8
+ movle r11, r4
+ ldrle r14, col_for_intra_luma_addr_2
+ulbl4_2:
+ addle r14,r14,pc
+ addle r0, r0, #8
+
+ mov r5,r2
+ vld1.8 d31, [r14]!
+ vmull.s8 q6, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ vmovn.s16 d10, q6
+ vshr.s16 q6, q6, #5
+ vqmovn.s16 d11, q6
+ ldr r9, [r8]
+ add r9, r0, r9
+ sub r9, r9, #1
+ vdup.8 d26, r9
+ vmov.i8 d16,#8
+
+ sub r4,r4,#8
+
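+@ the kernel below overlaps the tail of one 8x8 block (rows 4-7 stores) with
+@ the head of the next (vtbl lookups and per-column pos/idx/fract updates)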
+kernel_8_16_32:
+
+ vsub.s8 d8, d26, d11 @ref_main_idx
+ vmov d26,d10
+
+ subs r11, r11, #8
+ sub r6, r1, r9
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 7)
+ vadd.s8 d8, d8, d16 @to compensate the pu1_src idx incremented by 8
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx - 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vsub.s8 d9, d8, d2 @ref_main_idx - 1
+ addle r0, r0, #8
+ addgt r8, r8, #4
+ vld1.8 {d0,d1}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d22, q11, #5 @round shft (row 5)
+
+ ldrle r14, col_for_intra_luma_addr_3
+ulbl4_3:
+ addle r14,r14,pc
+
+ movle r8, r12
+ vdup.8 d27, r0 @row value inc or reset accordingly
+
+ vsub.s8 d4, d8, d2 @ref_main_idx (row 1)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 0)
+ vsub.s8 d5, d9, d2 @ref_main_idx - 1 (row 1)
+
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 0)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vld1.8 d31, [r14]!
+ vand d6, d29, d26 @fract values in d1/ idx values in d0
+
+ vst1.8 d22, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vsub.s8 d8, d8, d3 @ref_main_idx (row 2)
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 1)
+ vsub.s8 d9, d9, d3 @ref_main_idx - 1 (row 2)
+
+ addle r11, r4, #8
+ ldr r9, [r8]
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 1)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vsub.s8 d4, d4, d3 @ref_main_idx (row 3)
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 2)
+ vsub.s8 d5, d5, d3 @ref_main_idx - 1 (row 3)
+
+ vmull.u8 q11, d10, d7 @mul (row 1)
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 2)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vrshrn.i16 d24, q12, #5 @round shft (row 0)
+ vst1.8 d18, [r5], r3 @(from previous loop)st (row 7)
+
+ vsub.s8 d8, d8, d3 @ref_main_idx (row 4)
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 3)
+ vsub.s8 d9, d9, d3 @ref_main_idx - 1 (row 4)
+
+ vmull.u8 q10, d14, d7 @mul (row 2)
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 3)
+ vmlal.u8 q10, d15, d6 @mul (row 2)
+
+ vmull.s8 q7, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ add r5,r2,r3,lsl#2
+ add r9, r0, r9
+
+ vst1.8 d24, [r2], r3 @st (row 0)
+ vrshrn.i16 d22, q11, #5 @round shft (row 1)
+
+ vsub.s8 d4, d4, d3 @ref_main_idx (row 5)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 4)
+ vsub.s8 d5, d5, d3 @ref_main_idx - 1 (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 3)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 4)
+ vmlal.u8 q9, d11, d6 @mul (row 3)
+
+ vst1.8 d22, [r2], r3 @st (row 1)
+ vrshrn.i16 d20, q10, #5 @round shft (row 2)
+
+ vmovn.s16 d10, q7
+ vshr.s16 q7, q7, #5
+
+ vsub.s8 d8, d8, d3 @ref_main_idx (row 6)
+ vtbl.8 d21, {d0,d1}, d4 @load from ref_main_idx (row 5)
+ vsub.s8 d9, d9, d3 @ref_main_idx - 1 (row 6)
+
+ vmull.u8 q12, d12, d7 @mul (row 4)
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 5)
+ vmlal.u8 q12, d13, d6 @mul (row 4)
+
+ vst1.8 d20, [r2], r3 @st (row 2)
+ vrshrn.i16 d18, q9, #5 @round shft (row 3)
+
+ sub r9, r9, #1
+ vqmovn.s16 d11, q7
+
+ vsub.s8 d4, d4, d3 @ref_main_idx (row 7)
+ vtbl.8 d14, {d0,d1}, d8 @load from ref_main_idx (row 6)
+ vsub.s8 d5, d5, d3 @ref_main_idx - 1 (row 7)
+
+ vmull.u8 q11, d21, d7 @mul (row 5)
+ vtbl.8 d15, {d0,d1}, d9 @load from ref_main_idx + 1 (row 6)
+ vmlal.u8 q11, d17, d6 @mul (row 5)
+
+ vadd.s8 d11, d27, d11 @ref_main_idx (add row)
+ vdup.8 d26, r9
+
+ vst1.8 d18, [r2], r3 @st (row 3)
+ vrshrn.i16 d24, q12, #5 @round shft (row 4)
+
+ add r2,r3, lsl #2
+ vsub.s8 d11, d11, d2 @ref_main_idx -1 (sub 1)
+ addgt r2, r7, r2
+
+ suble r2, r2, r4
+
+ subs r10, r10, #8 @subtract 8 and go to end if 8x8
+
+ bne kernel_8_16_32
+
+epil_8_16_32:
+ vtbl.8 d10, {d0,d1}, d4 @load from ref_main_idx (row 7)
+
+ vmull.u8 q10, d14, d7 @mul (row 6)
+ vtbl.8 d11, {d0,d1}, d5 @load from ref_main_idx + 1 (row 7)
+ vmlal.u8 q10, d15, d6 @mul (row 6)
+
+ vst1.8 d24, [r5], r3 @st (row 4)
+ vrshrn.i16 d24, q11, #5 @round shft (row 5)
+
+ vmull.u8 q9, d10, d7 @mul (row 7)
+ vmlal.u8 q9, d11, d6 @mul (row 7)
+
+ vst1.8 d24, [r5], r3 @(from previous loop)st (row 5)
+ vrshrn.i16 d20, q10, #5 @(from previous loop)round shft (row 6)
+
+ vst1.8 d20, [r5], r3 @(from previous loop)st (row 6)
+ vrshrn.i16 d18, q9, #5 @(from previous loop)round shft (row 7)
+
+ vst1.8 d18, [r5], r3 @st (row 7)
+
+ b end_func
+
+sz_4_proc:
+ vld1.8 d31, [r14]
+ vmov.i8 d2, #1 @contains #1 for adding to get ref_main_idx - 1
+
+ vmov.i8 d3, #2
+ ldr r12, idx_neg_idx_3_9_addr_2 @load least idx table
+ulbl3_2:
+ add r12,r12,pc
+
+ vmull.s8 q11, d30, d31 @(col+1)*intra_pred_angle [0:7](col)
+ sub r7, r5, #3
+
+ add r12, r12, r7, lsl #4
+ mov r8, r12
+
+ ldr r9, [r8]
+
+ vdup.8 d26, r9 @least idx added to final idx values
+ add r6, r0, r4, lsl #1 @pu1_ref + 2nt
+
+ vmovn.s16 d6, q11
+    sub r6, r6, #9              @table base = pu1_ref + two_nt - (8 + 1) for the vtbl window
+ sub r6, r6, r9
+
+ vld1.8 {d0,d1}, [r6] @stores the 32 values reqd based on indices values (from least idx)
+
+ vmov.i8 d29, #31 @contains #31 for vand operation
+
+ vmov.i8 d28, #32
+
+ vshr.s16 q11, q11, #5
+ vqmovn.s16 d8, q11
+
+ vand d6, d6, d29 @fract values in d1/ idx values in d0
+ vsub.s8 d7, d28, d6 @32-fract
+
+ vmov.i8 d27, #7 @row 0 to 7(row-1)
+ vsub.s8 d8, d8, d2 @ref_main_idx (add 1)
+ vsub.s8 d8, d26, d8 @ref_main_idx
+ vadd.s8 d8, d8, d27 @t0 compensate the pu1_src idx incremented by 8
+ vsub.s8 d9, d8, d2 @ref_main_idx - 1
+
+ vsub.s8 d4, d8, d2 @row 1 ref_main_idx
+ vsub.s8 d5, d9, d2
+
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 0)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 0)
+
+
+ vmull.u8 q12, d12, d7 @mul (row 0)
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 1)
+ vmlal.u8 q12, d13, d6 @mul (row 0)
+
+ vsub.s8 d8, d8, d3 @idx (row 2)
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 1)
+ vsub.s8 d9, d9, d3 @idx+1 (row 2)
+
+ vmull.u8 q11, d16, d7 @mul (row 1)
+ vtbl.8 d12, {d0,d1}, d8 @load from ref_main_idx (row 2)
+ vmlal.u8 q11, d17, d6 @mul (row 1)
+
+ vrshrn.i16 d24, q12, #5 @round shift (row 0)
+
+ vsub.s8 d4, d4, d3 @idx (row 3)
+ vtbl.8 d13, {d0,d1}, d9 @load from ref_main_idx + 1 (row 2)
+ vsub.s8 d5, d5, d3 @idx+1 (row 3)
+
+ vmull.u8 q10, d12, d7 @mul (row 2)
+ vtbl.8 d16, {d0,d1}, d4 @load from ref_main_idx (row 3)
+ vmlal.u8 q10, d13, d6 @mul (row 2)
+
+ vst1.32 d24[0], [r2], r3 @st row 0
+ vrshrn.i16 d22, q11, #5 @round shift (row 1)
+
+ vtbl.8 d17, {d0,d1}, d5 @load from ref_main_idx + 1 (row 3)
+
+ vmull.u8 q9, d16, d7 @mul (row 3)
+ vmlal.u8 q9, d17, d6 @mul (row 3)
+
+ vst1.32 d22[0], [r2], r3 @st row 1
+ vrshrn.i16 d20, q10, #5 @round shift (row 2)
+
+ vst1.32 d20[0], [r2], r3 @st row 2
+
+ vrshrn.i16 d18, q9, #5 @round shift (row 3)
+
+ vst1.32 d18[0], [r2], r3 @st (row 3)
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_planar.s b/common/arm/ihevc_intra_pred_luma_planar.s
new file mode 100644
index 0000000..666798e
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_planar.s
@@ -0,0 +1,557 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_planar.s
+@*
+@* @brief
+@* contains function definitions for luma planar intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intraprediction filter for planar input
+@*
+@* @par description:
+@* planar prediction: each sample blends the left and top references with
+@* the bottom-left and top-right corner samples; see the reference sketch
+@* below
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] pi1_coeff
+@* word8 pointer to the planar coefficients
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
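+@
+@ per-pixel reference for the planar blend (the standard hevc planar
+@ equation; the four multiply-accumulate terms appear verbatim in the
+@ comments below, and the shift is log2(nt) + 1):
+@     for(row = 0; row < nt; row++)
+@         for(col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] =
+@                 ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]
+@                + (col + 1)      * pu1_ref[3 * nt + 1]
+@                + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
+@                + (row + 1)      * pu1_ref[nt - 1]
+@                + nt) >> (log2(nt) + 1);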
+
+@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode,
+@ word32 pi1_coeff)
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+@ pi1_coeff
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_planar_a9q
+.extern gau1_ihevc_planar_factor
+.extern gau1_ihevc_planar_factor_1
+
+gau1_ihevc_planar_factor_addr:
+.long gau1_ihevc_planar_factor - ulbl1 - 8
+
+gau1_ihevc_planar_factor_1_addr:
+.long gau1_ihevc_planar_factor_1 - ulbl2 - 8
+
+
+.type ihevc_intra_pred_luma_planar_a9q, %function
+
+ihevc_intra_pred_luma_planar_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+ ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
+ulbl1:
+ add r11,r11,pc
+
+ clz r5, r4
+ rsb r5, r5, #32
+ vdup.16 q7, r5
+ vneg.s16 q7, q7 @shr value (so vneg)
+ vdup.8 d2, r4 @nt
+ vdup.s16 q8, r4 @nt
+
+ sub r6, r4, #1 @nt-1
+ add r6, r6, r0
+ ldr r7, [r6]
+ vdup.s8 d0, r7 @src[nt-1]
+
+ add r6, r4, r4,lsl #1 @3nt
+ add r6, r6, #1 @3nt + 1
+ add r6, r6, r0
+ ldr r7, [r6]
+ vdup.s8 d1, r7 @src[3nt+1]
+
+ add r6, r4, r4 @2nt
+ add r14, r6, #1 @2nt+1
+ sub r6, r6, #1 @2nt-1
+ add r6, r6, r0 @&src[2nt-1]
+ add r14, r14, r0 @&src[2nt+1]
+
+ mov r8, #1 @row+1 (row is first 0)
+ sub r9, r4, r8 @nt-1-row (row is first 0)
+
+ vdup.s8 d5, r8 @row + 1
+ vdup.s8 d6, r9 @nt - 1 - row
+ vmov d7, d5 @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+ add r12, r11, #1 @coeffs (to be reloaded after every row)
+ mov r1, r4 @nt (row counter) (dec after every row)
+ mov r5, r2 @dst (to be reloaded after every row and inc by dst_strd)
+ mov r10, #8 @increment for the coeffs
+ mov r0, r14 @&src[2nt+1] (to be reloaded after every row)
+
+ cmp r4, #4
+ beq tf_sz_4
+
+@@ ========== ***************** =====================
+prolog:
+tf_sz_8_16_32:
+
+ mov r7, r4 @column counter (set to no of cols)
+ mov r9, r4, lsr #3 @divide nt by 8
+ mul r7, r7, r9 @multiply width * height
+ ldr r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
+ulbl2:
+ add r5,r5,pc
+ sub r6, r6, #7
+ mov r8, r2
+ lsl r9, r3, #3 @4*stride
+ rsb r9, r9, #8 @8-4*stride
+ mov r10, r4 @nt
+ sub r10, r10, #8 @nt - 8
+
+col_loop_8_16_32:
+
+ vld1.s8 d8, [r12] @(1-8)load 8 coeffs [col+1]
+ vdup.16 q6, r4 @(1)
+ vld1.s8 d4, [r6] @(1-8)src[2nt-1-row]
+ vsub.s8 d9, d2, d8 @(1-8)[nt-1-col]
+
+
+ vmlal.u8 q6, d5, d0 @(1)(row+1) * src[nt-1]
+
+ vld1.s8 d3, [r14] @(1-8)load 8 src[2nt+1+col]
+ vmlal.u8 q6, d8, d1 @(1)(col+1) * src[3nt+1]
+
+ vdup.s8 d20, d4[7] @(1)
+ vmlal.u8 q6, d6, d3 @(1)(nt-1-row) * src[2nt+1+col]
+
+ vdup.s8 d21, d4[6] @(2)
+ vmlal.u8 q6, d9, d20 @(1)(nt-1-col) * src[2nt-1-row]
+
+ vdup.16 q15, r4 @(2)
+ vadd.s8 d5, d5, d7 @(1)
+
+ vsub.s8 d6, d6, d7 @(1)
+
+ vdup.s8 d22, d4[5] @(3)
+ vmlal.u8 q15, d5, d0 @(2)
+
+ vdup.16 q14, r4 @(3)
+ vmlal.u8 q15, d8, d1 @(2)
+
+ vmlal.u8 q15, d6, d3 @(2)
+ vmlal.u8 q15, d9, d21 @(2)
+
+ vshl.s16 q6, q6, q7 @(1)shr
+
+ vadd.s8 d5, d5, d7 @(2)
+ vsub.s8 d6, d6, d7 @(2)
+
+ vmovn.i16 d12, q6 @(1)
+ vmlal.u8 q14, d5, d0 @(3)
+
+ vdup.8 d23, d4[4] @(4)
+ vmlal.u8 q14, d8, d1 @(3)
+
+ vdup.16 q5, r4 @(4)
+ vmlal.u8 q14, d6, d3 @(3)
+
+ vst1.s8 d12, [r2], r3 @(1)str 8 values
+ vmlal.u8 q14, d9, d22 @(3)
+
+ vshl.s16 q15, q15, q7 @(2)shr
+
+ vadd.s8 d5, d5, d7 @(3)
+ vsub.s8 d6, d6, d7 @(3)
+
+ vmovn.i16 d30, q15 @(2)
+ vmlal.u8 q5, d5, d0 @(4)
+
+ vdup.8 d20, d4[3] @(5)
+ vmlal.u8 q5, d8, d1 @(4)
+
+ vdup.16 q8, r4 @(5)
+ vmlal.u8 q5, d6, d3 @(4)
+
+ vst1.s8 d30, [r2], r3 @(2)str 8 values
+ vmlal.u8 q5, d9, d23 @(4)
+
+ vshl.s16 q14, q14, q7 @(3)shr
+
+ vadd.s8 d5, d5, d7 @(4)
+ vsub.s8 d6, d6, d7 @(4)
+
+ vmovn.i16 d28, q14 @(3)
+ vmlal.u8 q8, d5, d0 @(5)
+
+ vdup.8 d21, d4[2] @(6)
+ vmlal.u8 q8, d8, d1 @(5)
+
+ vdup.16 q9, r4 @(6)
+ vmlal.u8 q8, d6, d3 @(5)
+
+ vst1.s8 d28, [r2], r3 @(3)str 8 values
+ vmlal.u8 q8, d9, d20 @(5)
+
+ vshl.s16 q5, q5, q7 @(4)shr
+ vadd.s8 d5, d5, d7 @(5)
+ vsub.s8 d6, d6, d7 @(5)
+
+ vmovn.i16 d10, q5 @(4)
+ vmlal.u8 q9, d5, d0 @(6)
+
+ vdup.8 d22, d4[1] @(7)
+ vmlal.u8 q9, d8, d1 @(6)
+
+ vdup.16 q13, r4 @(7)
+ vmlal.u8 q9, d6, d3 @(6)
+
+ vst1.s8 d10, [r2], r3 @(4)str 8 values
+ vmlal.u8 q9, d9, d21 @(6)
+
+ vshl.s16 q8, q8, q7 @(5)shr
+
+ vadd.s8 d5, d5, d7 @(6)
+ vsub.s8 d6, d6, d7 @(6)
+
+ vmovn.i16 d16, q8 @(5)
+ vmlal.u8 q13, d5, d0 @(7)
+
+ vdup.8 d23, d4[0] @(8)
+ vmlal.u8 q13, d8, d1 @(7)
+
+ vdup.16 q12, r4 @(8)
+ vmlal.u8 q13, d6, d3 @(7)
+
+ vst1.s8 d16, [r2], r3 @(5)str 8 values
+ vmlal.u8 q13, d9, d22 @(7)
+
+ vshl.s16 q9, q9, q7 @(6)shr
+
+ vadd.s8 d5, d5, d7 @(7)
+ vsub.s8 d6, d6, d7 @(7)
+
+ vmovn.i16 d18, q9 @(6)
+ vmlal.u8 q12, d5, d0 @(8)
+
+
+ vmlal.u8 q12, d8, d1 @(8)
+
+ vmlal.u8 q12, d6, d3 @(8)
+
+ vst1.s8 d18, [r2], r3 @(6)str 8 values
+ vmlal.u8 q12, d9, d23 @(8)
+
+ vshl.s16 q13, q13, q7 @(7)shr
+
+ subs r7, r7, #8
+
+ beq epilog
+
+ subs r1, r1, #8 @row counter
+ addgt r12, r12, #8 @col inc
+ addgt r14, r14, #8 @also for col inc
+ movle r1, r4 @nt reloaded (refresh the value)
+ addle r12, r11, #1 @r12 reset
+
+ movle r14, r0 @r14 reset
+ vld1.s8 d8, [r12] @(1n)(1-8)load 8 coeffs [col+1]
+
+ suble r6, r6, #8 @for next set of rows
+ vld1.s8 d3, [r14] @(1n)(1-8)load 8 src[2nt+1+col]
+
+ addle r5, r5, #8
+ vdup.16 q6, r4 @(1n)(1)
+
+ vld1.s8 d5, [r5]
+
+ vld1.s8 d4, [r6] @(1n)(1-8)src[2nt-1-row]
+ vsub.s8 d9, d2, d8 @(1n)(1-8)[nt-1-col]
+
+ vdup.s8 d20, d4[7] @(1n)(1)
+ vsub.s8 d6, d2, d5
+
+ beq epilog
+
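+@ the kernel below is software pipelined: the (1)-(8) tags mark which of the
+@ eight in-flight rows each instruction belongs to, and the (1n) loads
+@ prefetch the next block's first row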
+kernel_plnr:
+
+ cmp r1, #0 @ (cond loop)
+ vshl.s16 q12, q12, q7 @(8)shr
+
+ vmovn.i16 d26, q13 @(7)
+ vmlal.u8 q6, d5, d0 @(1)(row+1) * src[nt-1]
+
+ vmovn.i16 d24, q12 @(8)
+ vmlal.u8 q6, d8, d1 @(1)(col+1) * src[3nt+1]
+
+ vdup.s8 d21, d4[6] @(2)
+ vmlal.u8 q6, d6, d3 @(1)(nt-1-row) * src[2nt+1+col]
+
+ vdup.16 q15, r4 @(2)
+ vmlal.u8 q6, d9, d20 @(1)(nt-1-col) * src[2nt-1-row]
+
+ vst1.s8 d26, [r2], r3 @(7)str 8 values
+ vadd.s8 d5, d5, d7 @(1)
+
+ vst1.s8 d24, [r2], r3 @(8)str 8 values
+ vsub.s8 d6, d6, d7 @(1)
+
+ addgt r2, r2, r9 @since more cols to fill, dst + 8 - 6*strd (cond loop)
+ vmlal.u8 q15, d5, d0 @(2)
+
+ suble r2, r2, r10 @else go to next set of rows, dst - (nt-8) (cond loop)
+ vmlal.u8 q15, d8, d1 @(2)
+
+ vdup.s8 d22, d4[5] @(3)
+ vmlal.u8 q15, d6, d3 @(2)
+
+ vdup.16 q14, r4 @(3)
+ vmlal.u8 q15, d9, d21 @(2)
+
+ vshl.s16 q6, q6, q7 @(1)shr
+
+ vadd.s8 d5, d5, d7 @(2)
+ movle r1, r4 @nt reloaded (refresh the value) (cond loop)
+
+ vsub.s8 d6, d6, d7 @(2)
+ subs r1, r1, #8 @row counter (loop)
+
+ vmovn.i16 d12, q6 @(1)
+ vmlal.u8 q14, d5, d0 @(3)
+
+ vdup.8 d23, d4[4] @(4)
+ vmlal.u8 q14, d8, d1 @(3)
+
+ vdup.16 q5, r4 @(4)
+ vmlal.u8 q14, d6, d3 @(3)
+
+ vst1.s8 d12, [r2], r3 @(1)str 8 values
+ vmlal.u8 q14, d9, d22 @(3)
+
+ vshl.s16 q15, q15, q7 @(2)shr
+
+ vadd.s8 d5, d5, d7 @(3)
+
+ vsub.s8 d6, d6, d7 @(3)
+
+ vmovn.i16 d30, q15 @(2)
+ vmlal.u8 q5, d5, d0 @(4)
+
+ vdup.8 d20, d4[3] @(5)
+ vmlal.u8 q5, d8, d1 @(4)
+
+ vdup.16 q8, r4 @(5)
+ vmlal.u8 q5, d6, d3 @(4)
+
+ vst1.s8 d30, [r2], r3 @(2)str 8 values
+ vmlal.u8 q5, d9, d23 @(4)
+
+ vshl.s16 q14, q14, q7 @(3)shr
+
+ vadd.s8 d5, d5, d7 @(4)
+
+ vsub.s8 d6, d6, d7 @(4)
+
+ vmovn.i16 d28, q14 @(3)
+ vmlal.u8 q8, d5, d0 @(5)
+
+ vdup.8 d21, d4[2] @(6)
+ vmlal.u8 q8, d8, d1 @(5)
+
+ vdup.16 q9, r4 @(6)
+ vmlal.u8 q8, d6, d3 @(5)
+
+ vst1.s8 d28, [r2], r3 @(3)str 8 values
+ vmlal.u8 q8, d9, d20 @(5)
+
+ addle r12, r11, #1 @r12 reset (cond loop)
+ vshl.s16 q5, q5, q7 @(4)shr
+
+ addgt r12, r12, #8 @col inc (cond loop)
+ vadd.s8 d5, d5, d7 @(5)
+
+ addgt r14, r14, #8 @also for col inc (cond loop)
+ vsub.s8 d6, d6, d7 @(5)
+
+ vmovn.i16 d10, q5 @(4)
+ vmlal.u8 q9, d5, d0 @(6)
+
+ vdup.8 d22, d4[1] @(7)
+ vmlal.u8 q9, d8, d1 @(6)
+
+ vdup.16 q13, r4 @(7)
+ vmlal.u8 q9, d6, d3 @(6)
+
+ vst1.s8 d10, [r2], r3 @(4)str 8 values
+ vmlal.u8 q9, d9, d21 @(6)
+
+ movle r14, r0 @r14 reset (cond loop)
+ vshl.s16 q8, q8, q7 @(5)shr
+
+ suble r6, r6, #8 @for next set of rows (cond loop)
+ vadd.s8 d5, d5, d7 @(6)
+
+ addle r5, r5, #8 @ (cond loop)
+ vsub.s8 d6, d6, d7 @(6)
+
+ vmovn.i16 d16, q8 @(5)
+ vmlal.u8 q13, d5, d0 @(7)
+
+ vdup.8 d23, d4[0] @(8)
+ vmlal.u8 q13, d8, d1 @(7)
+
+ vdup.16 q12, r4 @(8)
+ vmlal.u8 q13, d6, d3 @(7)
+
+ vst1.s8 d16, [r2], r3 @(5)str 8 values
+ vmlal.u8 q13, d9, d22 @(7)
+
+ vld1.s8 d4, [r6] @(1n)(1-8)src[2nt-1-row]
+ vshl.s16 q9, q9, q7 @(6)shr
+
+ vadd.s8 d5, d5, d7 @(7)
+
+ vsub.s8 d6, d6, d7 @(7)
+
+ vmovn.i16 d18, q9 @(6)
+ vmlal.u8 q12, d5, d0 @(8)
+
+ vld1.s8 d5, [r5] @(row+1 value)
+ vmlal.u8 q12, d8, d1 @(8)
+
+ vdup.s8 d20, d4[7] @(1n)(1)
+ vmlal.u8 q12, d6, d3 @(8)
+
+ vst1.s8 d18, [r2], r3 @(6)str 8 values
+ vmlal.u8 q12, d9, d23 @(8)
+
+ vld1.s8 d8, [r12] @(1n)(1-8)load 8 coeffs [col+1]
+ vsub.s8 d6, d2, d5 @(nt-1-row) value
+
+ subs r7, r7, #8 @col counter
+
+ vld1.s8 d3, [r14] @(1n)(1-8)load 8 src[2nt+1+col]
+ vshl.s16 q13, q13, q7 @(7)shr
+
+ vdup.16 q6, r4 @(1n)(1)
+ vsub.s8 d9, d2, d8 @(1n)(1-8)[nt-1-col]
+
+ bne kernel_plnr
+
+epilog:
+
+ vmovn.i16 d26, q13 @(7)
+ vst1.s8 d26, [r2], r3 @(7)str 8 values
+
+ vshl.s16 q12, q12, q7 @(8)shr
+ vmovn.i16 d24, q12 @(8)
+ vst1.s8 d24, [r2], r3 @(8)str 8 values
+
+@@ ========== ***************** =====================
+
+ beq end_loop
+
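+@ For reference, the scalar form of the planar prediction computed above and
+@ in the 4x4 path below (an illustrative C sketch mirroring the register
+@ comments, not part of the build; log2_nt is assumed):
+@
+@     for (row = 0; row < nt; row++)
+@         for (col = 0; col < nt; col++)
+@             dst[row * dst_strd + col] =
+@                 ((nt - 1 - col) * src[2 * nt - 1 - row]  /* left      */
+@                + (col + 1)      * src[3 * nt + 1]        /* top-right */
+@                + (nt - 1 - row) * src[2 * nt + 1 + col]  /* top       */
+@                + (row + 1)      * src[nt - 1]            /* bot-left  */
+@                + nt) >> (log2_nt + 1);
+@
+@ The "+ nt" rounding term is pre-loaded into the accumulators (vdup.16 qX, r4)
+@ in the 8-wide kernel and folded into vrshrn's rounding in the 4x4 loop below.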
+tf_sz_4:
+ vld1.s8 d10, [r14] @load src[2nt+1+col]
+ vld1.s8 d8, [r12], r10 @load 8 coeffs [col+1]
+loop_sz_4:
+ mov r10, #4 @reduce inc to #4 for 4x4
+ ldr r7, [r6], #-1 @src[2nt-1-row] (dec to take into account row)
+ vdup.s8 d4, r7 @src[2nt-1-row]
+
+ vsub.s8 d9, d2, d8 @[nt-1-col]
+
+ vmull.u8 q6, d5, d0 @(row+1) * src[nt-1]
+ vmlal.u8 q6, d6, d10 @(nt-1-row) * src[2nt+1+col]
+ vmlal.u8 q6, d8, d1 @(col+1) * src[3nt+1]
+ vmlal.u8 q6, d9, d4 @(nt-1-col) * src[2nt-1-row]
+@ vadd.i16 q6, q6, q8 @add (nt)
+@ vshl.s16 q6, q6, q7 @shr
+@ vmovn.i16 d12, q6
+ vrshrn.s16 d12,q6,#3
+ vst1.s32 {d12[0]}, [r2], r3
+
+ vadd.s8 d5, d5, d7 @row++ [(row+1)++]
+ vsub.s8 d6, d6, d7 @[nt-1-row]--
+ subs r1, r1, #1
+
+ bne loop_sz_4
+
+end_loop:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_intra_pred_luma_vert.s b/common/arm/ihevc_intra_pred_luma_vert.s
new file mode 100644
index 0000000..5eeaeb3
--- /dev/null
+++ b/common/arm/ihevc_intra_pred_luma_vert.s
@@ -0,0 +1,421 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_intra_pred_luma_vert.s
+@*
+@* @brief
+@* contains function definitions for vertical intra prediction.
+@* functions are coded using neon intrinsics and can be compiled using
+@* rvct
+@*
+@* @author
+@* akshaya mukund
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* luma intra prediction for the vertical mode
+@*
+@* @par description:
+@*
+@* @param[in] pu1_ref
+@* uword8 pointer to the source
+@*
+@* @param[out] pu1_dst
+@* uword8 pointer to the destination
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] dst_strd
+@* integer destination stride
+@*
+@* @param[in] nt
+@* size of transform block
+@*
+@* @param[in] mode
+@* type of filtering
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_intra_pred_luma_ver(uword8* pu1_ref,
+@ word32 src_strd,
+@ uword8* pu1_dst,
+@ word32 dst_strd,
+@ word32 nt,
+@ word32 mode)
+@
+@**************variables vs registers*****************************************
+@r0 => *pu1_ref
+@r1 => src_strd
+@r2 => *pu1_dst
+@r3 => dst_strd
+
+@stack contents from #40
+@ nt
+@ mode
+
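+@ For orientation, an illustrative C sketch of vertical prediction as
+@ implemented below (not part of the build; CLIP_U8 is shorthand for
+@ clipping to [0,255]):
+@
+@     /* all sizes: replicate the row above */
+@     for (row = 0; row < nt; row++)
+@         for (col = 0; col < nt; col++)
+@             pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt + 1 + col];
+@
+@     /* nt < 32 additionally boundary-filters column 0, which is what the
+@        vhsub/vqadd/vqmovun sequence below computes with saturation: */
+@     for (row = 0; row < nt; row++)
+@         pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[2 * nt + 1] +
+@             ((pu1_ref[2 * nt - 1 - row] - pu1_ref[2 * nt]) >> 1));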
+.text
+.align 4
+
+
+
+
+.globl ihevc_intra_pred_luma_ver_a9q
+
+.type ihevc_intra_pred_luma_ver_a9q, %function
+
+ihevc_intra_pred_luma_ver_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @loads nt
+
+ lsl r5, r4, #1 @2nt
+
+ cmp r4, #16
+ beq blk_16
+ blt blk_4_8
+
+ add r5, r5, #1 @2nt+1
+ add r6, r0, r5 @&src[2nt+1]
+
+copy_32:
+ add r5, r2, r3
+ vld1.8 {d20,d21}, [r6]! @16 loads (col 0:15)
+ add r8, r5, r3
+
+ add r10, r8, r3
+ vld1.8 {d22,d23}, [r6] @16 loads (col 16:31)
+    lsl r11, r3, #2                 @4 * dst_strd
+
+    add r11, r11, #0xfffffff0       @4 * dst_strd - 16 (row advance after two 16-byte stores)
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d22,d23}, [r2], r11
+ vst1.8 {d22,d23}, [r5], r11
+ vst1.8 {d22,d23}, [r8], r11
+ vst1.8 {d22,d23}, [r10], r11
+
+ subs r4, r4, #8
+
+kernel_copy_32:
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d22,d23}, [r2], r11
+ vst1.8 {d22,d23}, [r5], r11
+ vst1.8 {d22,d23}, [r8], r11
+ vst1.8 {d22,d23}, [r10], r11
+
+ subs r4, r4, #8
+
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d22,d23}, [r2], r11
+ vst1.8 {d22,d23}, [r5], r11
+ vst1.8 {d22,d23}, [r8], r11
+ vst1.8 {d22,d23}, [r10], r11
+
+ bne kernel_copy_32
+
+ vst1.8 {d20,d21}, [r2]!
+ vst1.8 {d20,d21}, [r5]!
+ vst1.8 {d20,d21}, [r8]!
+ vst1.8 {d20,d21}, [r10]!
+
+ vst1.8 {d22,d23}, [r2], r11
+ vst1.8 {d22,d23}, [r5], r11
+ vst1.8 {d22,d23}, [r8], r11
+ vst1.8 {d22,d23}, [r10], r11
+
+ b end_func
+
+blk_16:
+ add r6, r0, r5 @&src[2nt]
+
+ ldrb r11, [r6], #1 @src[2nt]
+
+ vdup.8 q11, r11 @src[2nt]
+ ldrb r12, [r6] @src[2nt+1]
+
+ vld1.8 {d16,d17}, [r6] @ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
+    add r6, r6, #0xffffffef         @subtract 17 to take it to src[2nt-1-row(15)]
+
+ vdup.8 q12, r12 @src[2nt+1]
+ vdup.16 q15, r12
+ lsl r5, r3, #3 @8*stride
+
+ vld1.8 {d26,d27}, [r6]! @load src[2nt-1-row](rows 0:15)
+    add r5, r2, r5                  @r5 => pu1_dst + 8 * dst_strd
+
+ vmov.i64 d18, #0x00000000000000ff
+ vhsub.u8 q13, q13, q11 @(src[2nt-1-row] - src[2nt])>>1
+ @vsubl.u8 q0, d26, d22
+ @vsubl.u8 q14, d27, d22
+
+ @vshr.s16 q0, q0, #1
+ @vshr.s16 q14, q14, #1
+
+ vmov.i64 d19, d17
+ @vaddl.s8 q0, d24, d26
+ vmovl.s8 q0, d26
+ vmovl.s8 q14, d27
+ vqadd.s16 q0, q0, q15
+ vqadd.s16 q14, q14, q15
+
+ vmov.i64 d10, #0x00000000000000ff
+ @vaddl.s8 q1, d25, d27
+
+ vqmovun.s16 d25, q0
+ vqmovun.s16 d24, q14
+ @vmovn.u16 d25, q0
+ @vmovn.u16 d24, q1
+
+
+ vrev64.8 q12, q12
+
+ vmov.i64 d11, d17
+
+ vbsl d18, d24, d16 @only select row values from q12(predpixel)
+ vbsl d10, d25, d16
+
+ vmov.i64 d8, #0x00000000000000ff
+ vmov.i64 d9, d17
+
+ vmov.i64 d6, #0x00000000000000ff
+ vmov.i64 d7, d17
+
+ vst1.8 {d18,d19}, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vst1.8 {d10,d11}, [r5], r3
+ vshr.s64 d25, d25, #8
+
+
+ vbsl d8, d24, d16
+ vbsl d6, d25, d16
+
+ vst1.8 {d8,d9}, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vst1.8 {d6,d7}, [r5], r3
+ vshr.s64 d25, d25, #8
+
+ subs r4, #8
+
+ vmov.i64 d18, #0x00000000000000ff
+ @vmov.i64 d19, d17
+
+ vmov.i64 d10, #0x00000000000000ff
+ @vmov.i64 d11, d17
+
+
+loop_16:
+
+
+ vmov.i64 d8, #0x00000000000000ff
+
+ vmov.i64 d6, #0x00000000000000ff
+
+ vbsl d18, d24, d16 @only select row values from q12(predpixel)
+ vbsl d10, d25, d16
+
+ vst1.8 {d18,d19}, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vst1.8 {d10,d11}, [r5], r3
+ vshr.s64 d25, d25, #8
+
+ vmov.i64 d18, #0x00000000000000ff
+
+ vmov.i64 d10, #0x00000000000000ff
+
+ vbsl d8, d24, d16
+ vbsl d6, d25, d16
+
+ vst1.8 {d8,d9}, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vst1.8 {d6,d7}, [r5], r3
+ vshr.s64 d25, d25, #8
+
+ subs r4, r4, #4
+
+ bne loop_16
+
+ vmov.i64 d8, #0x00000000000000ff
+
+ vmov.i64 d6, #0x00000000000000ff
+
+ vbsl d18, d24, d16 @only select row values from q12(predpixel)
+ vbsl d10, d25, d16
+
+ vst1.8 {d18,d19}, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vst1.8 {d10,d11}, [r5], r3
+ vshr.s64 d25, d25, #8
+
+ vbsl d8, d24, d16
+ vbsl d6, d25, d16
+
+ vst1.8 {d8,d9}, [r2], r3
+
+ vst1.8 {d6,d7}, [r5], r3
+
+ b end_func
+
+
+blk_4_8:
+ vmov.i64 d11, #0x00000000000000ff
+ add r6, r0, r5 @&src[2nt]
+
+ vmov.i64 d10, #0x00000000000000ff
+ ldrb r11, [r6], #1 @src[2nt]
+
+ vdup.8 d22, r11 @src[2nt]
+ ldrb r12, [r6] @src[2nt+1]
+
+ vld1.8 d16, [r6] @ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
+    add r6, r6, #0xfffffff7         @subtract 9 to take it to src[2nt-1-row(7)]
+
+ vdup.8 d24, r12 @src[2nt+1]
+ vdup.16 q15, r12
+
+    vld1.8 d26, [r6]!               @load src[2nt-1-row](rows 0:7)
+
+ vmov.i64 d18, #0x00000000000000ff
+ vhsub.u8 d26, d26, d22 @(src[2nt-1-row] - src[2nt])>>1
+ @vsubl.u8 q13, d26, d22
+
+ @vshr.s16 q13, q13, #1
+
+ vmov.i64 d19, #0x00000000000000ff
+ vmovl.s8 q13, d26
+ @vaddl.s8 q0, d24, d26
+ vqadd.s16 q0, q13, q15
+
+ vqmovun.s16 d24, q0
+ @vmovn.s16 d24, q0
+
+ vrev64.8 d24, d24
+
+ cmp r4, #4
+ beq blk_4
+
+ vbsl d18, d24, d16 @only select row values from q12(predpixel)
+
+ vst1.8 d18, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vmov.i64 d18, #0x00000000000000ff
+
+ vbsl d19, d24, d16
+
+ vst1.8 d19, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vmov.i64 d19, #0x00000000000000ff
+
+ vbsl d10, d24, d16
+
+ vst1.8 d10, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vmov.i64 d10, #0x00000000000000ff
+
+ vbsl d11, d24, d16
+
+ vst1.8 d11, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vmov.i64 d11, #0x00000000000000ff
+
+ vbsl d18, d24, d16 @only select row values from q12(predpixel)
+
+ vst1.8 d18, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vbsl d19, d24, d16
+
+ vst1.8 d19, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vbsl d10, d24, d16
+
+ vst1.8 d10, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vbsl d11, d24, d16
+
+ vst1.8 d11, [r2], r3
+ vshr.s64 d24, d24, #8
+
+ b end_func
+
+
+blk_4:
+ vbsl d18, d24, d16 @only select row values from q12(predpixel)
+
+ vst1.32 d18[0], [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vbsl d19, d24, d16
+
+ vst1.32 d19[0], [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vbsl d10, d24, d16
+
+ vst1.32 d10[0], [r2], r3
+ vshr.s64 d24, d24, #8
+
+ vbsl d11, d24, d16
+ vst1.32 d11[0], [r2], r3
+
+
+end_func:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
diff --git a/common/arm/ihevc_intra_ref_substitution_a9q.c b/common/arm/ihevc_intra_ref_substitution_a9q.c
new file mode 100644
index 0000000..e100893
--- /dev/null
+++ b/common/arm/ihevc_intra_ref_substitution_a9q.c
@@ -0,0 +1,777 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_intra_ref_substitution_a9q.c
+*
+* @brief
+* Contains ref substitution functions
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_macros.h"
+
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+
+/* Extract bit x of y as 0 or 1; GET_BITS is retained as an alias of GET_BIT */
+#define GET_BIT(y,x)    (((y) & (1 << (x))) != 0)
+#define GET_BITS(y,x)   GET_BIT(y,x)
+/**
+*******************************************************************************
+*
+* @brief
+* Reference substitution process for samples unavailable for prediction
+* Refer to section 8.4.4.2.2
+*
+* @par Description:
+*
+*
+* @param[in] pu1_top_left
+* UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+* UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+* UWORD8 pointer to the left
+*
+* @param[in] src_strd
+* WORD32 Source stride
+*
+* @param[in] nbr_flags
+* WORD32 neighbor availability flags
+*
+* @param[in] nt
+* WORD32 transform Block size
+*
+* @param[in] dst_strd
+* WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+ UWORD8 pu1_ref_u, pu1_ref_v;
+ WORD32 dc_val, i, j;
+ WORD32 total_samples = (4 * nt) + 1;
+ WORD32 get_bits;
+ WORD32 next;
+ WORD32 bot_left, left, top, tp_right, tp_left;
+ WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+ WORD32 a_nbr_flag[5];
+ UNUSED(dst_strd);
+ /* Neighbor Flag Structure*/
+ /* WORD32 nbr_flags MSB-->LSB TOP LEFT | TOP-RIGHT | TOP | LEFT | BOTTOM LEFT*/
+ /* (1 bit) (4 bits) (4 bits) (4 bits) (4 bits) */
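+    /* For example, nbr_flags == 0x1FFFF means every neighbour is
+       available, while nbr_flags == 0x00F00 means only the top row is.
+       A minimal unpack of the fields, consistent with the masks used
+       below (illustrative sketch only, not used by this function):
+
+           tl  = (nbr_flags >> 16) & 0x1;
+           tr  = (nbr_flags >> 12) & 0xF;
+           top = (nbr_flags >>  8) & 0xF;
+           lt  = (nbr_flags >>  4) & 0xF;
+           bl  =  nbr_flags        & 0xF;
+    */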
+
+ if(nbr_flags == 0)
+ {
+        /* If no neighbor flags are present, fill the neighbor samples with DC value */
+ /*dc_val = 1 << (BIT_DEPTH - 1);*/
+ dc_val = 1 << (8 - 1);
+ for(i = 0; i < (2 * total_samples); i++)
+ {
+ pu1_dst[i] = dc_val;
+ }
+ }
+ else
+ {
+ /* Else fill the corresponding samples */
+
+        /* Check for the neighbors' availability */
+ tp_left = (nbr_flags & 0x10000);
+ tp_right = (nbr_flags & 0x0f000);
+ top = (nbr_flags & 0x00f00);
+ left = (nbr_flags & 0x000f0);
+ bot_left = (nbr_flags & 0x0000f);
+
+        /* Fill nbrs depending on availability */
+        /* Top-left nbrs */
+ if(0 != tp_left)
+ {
+ pu1_dst[(4 * nt)] = *pu1_top_left; // U top-left sample
+ pu1_dst[(4 * nt) + 1] = *(pu1_top_left + 1); // V top-left sample
+ }
+ /* Left nbrs */
+ if(0 != left)
+ {
+ for(i = 0, j = 0; i < (2 * nt); i += 2)
+ {
+ pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+ pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+ j++;
+ }
+ }
+ /* Bottom - Left nbrs */
+ if(0 != bot_left)
+ {
+ for(i = (2 * nt), j = nt; i < (4 * nt); i += 2)
+ {
+ pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+ pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+ j++;
+ }
+ }
+ /* Top nbrs */
+ if(0 != top)
+ {
+ ihevc_memcpy_mul_8_a9q(&pu1_dst[(4 * nt) + 2], pu1_top, 2 * nt);
+ // U-V interleaved Top-top right samples
+ }
+
+ /* Top - Right nbrs */
+ if(0 != tp_right)
+ {
+ ihevc_memcpy_mul_8_a9q(&pu1_dst[(4 * nt) + 2 + 2 * nt], pu1_top + 2 * nt, 2 * nt);
+ // U-V interleaved Top-top right samples
+ }
+
+ if(nt == 4)
+ {
+ /* 1 bit extraction for all the neighboring blocks */
+ tp_left = (nbr_flags & 0x10000) >> 16;
+ bot_left = (nbr_flags & 0x8) >> 3;
+ left = (nbr_flags & 0x80) >> 7;
+ top = (nbr_flags & 0x100) >> 8;
+ tp_right = (nbr_flags & 0x1000) >> 12;
+
+ next = 1;
+ a_nbr_flag[0] = bot_left;
+ a_nbr_flag[1] = left;
+ a_nbr_flag[2] = tp_left;
+ a_nbr_flag[3] = top;
+ a_nbr_flag[4] = tp_right;
+
+            /* If bottom-left is not available, do the reverse substitution process */
+ if(bot_left == 0)
+ {
+ /* Check for the 1st available sample from bottom-left*/
+ while(!a_nbr_flag[next])
+ next++;
+
+ /* If Left, top-left are available*/
+ if(next <= 2)
+ {
+ UWORD16 *pu2_dst;
+ idx = (nt * next);
+ pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+ ihevc_memset_16bit_a9q((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+ }
+ else /* If top, top-right are available */
+ {
+ UWORD16 *pu2_dst;
+                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+ idx = (nt * (next - 1)) + 1;
+ pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+ ihevc_memset_16bit_a9q((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+ }
+ }
+
+ if(left == 0)
+ {
+ UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(2 * nt) - 2];
+ ihevc_memset_16bit_a9q((UWORD16 *)&pu1_dst[(2 * nt)], pu2_dst[0], nt);
+
+
+ }
+ if(tp_left == 0)
+ {
+ pu1_dst[4 * nt] = pu1_dst[(4 * nt) - 2];
+ pu1_dst[(4 * nt) + 1] = pu1_dst[(4 * nt) - 1];
+ }
+ if(top == 0)
+ {
+ UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(4 * nt)];
+ ihevc_memset_16bit_a9q((UWORD16 *)&pu1_dst[(4 * nt) + 2], pu2_dst[0], nt);
+
+
+ }
+ if(tp_right == 0)
+ {
+ UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(6 * nt)];
+ ihevc_memset_16bit_a9q((UWORD16 *)&pu1_dst[(6 * nt) + 2], pu2_dst[0], nt);
+
+
+ }
+ }
+ else if(nt == 8)
+ {
+ WORD32 nbr_flags_temp = 0;
+ nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+ + ((nbr_flags & 0x300) >> 4)
+ + ((nbr_flags & 0x3000) >> 6)
+ + ((nbr_flags & 0x10000) >> 8);
+
+            /* compute trailing zeros based on nbr_flags for the substitution process of the below-left samples */
+            /* each bit in the remapped nbr flags corresponds to 4 pels for bot_left, left, top and top-right, but 1 pel for top-left */
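+            /* look_up_trailing_zeros() is assumed here to return the number
+               of consecutive zero bits counted from the LSB (8 for a zero
+               argument): e.g. flags = 0xF4 (bits 0 and 1 clear) yields 2,
+               i.e. the two lowest availability chunks are missing and
+               nbr_id_from_bl lands on the first available sample above them. */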
+ {
+ nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 4; /* for bottom left and left */
+ if(nbr_id_from_bl == 32)
+ nbr_id_from_bl = 16;
+ if(nbr_id_from_bl == 16)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags_temp >> 8) & 0x1))
+ {
+ nbr_id_from_bl++;
+                    nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4; /* top and top right; 4 pels per nbr bit */
+
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+ pu1_ref_v = pu1_dst[(2 * nbr_id_from_bl) + 1];
+ for(i = 2 * (nbr_id_from_bl - 1); i >= 0; i -= 2)
+ {
+ pu1_dst[i] = pu1_ref_u;
+ pu1_dst[i + 1] = pu1_ref_v;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T8C_4NT)+1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 4 to obtain the original flag index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T8C_4NT / 2))
+ {
+ get_bits = GET_BIT(nbr_flags_temp, 8);
+
+ /* only pel substitution for TL */
+ if(!get_bits)
+ {
+ pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+ pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+ }
+ }
+ else
+ {
+ get_bits = GET_BIT(nbr_flags_temp, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ UWORD16 *pu2_dst;
+                        /* 4 pel substitution (other than TL) */
+ pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+ ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT / 2)) ? 1 : 4;
+ }
+
+ }
+ else if(nt == 16)
+ {
+            /* compute trailing zeros based on nbr_flags for the substitution process of the below-left samples */
+            /* as each bit in nbr flags corresponds to 4 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4; /* for bottom left and left */
+
+ if(nbr_id_from_bl == 32)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags >> 16) & 0x1))
+ {
+ /* top left not available */
+ nbr_id_from_bl++;
+ /* top and top right; 4 pels per nbr bit */
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+ pu1_ref_v = pu1_dst[2 * nbr_id_from_bl + 1];
+ for(i = (2 * (nbr_id_from_bl - 1)); i >= 0; i -= 2)
+ {
+ pu1_dst[i] = pu1_ref_u;
+ pu1_dst[i + 1] = pu1_ref_v;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T16C_4NT)+1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 4 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T16C_4NT / 2))
+ {
+ get_bits = GET_BIT(nbr_flags, 16);
+ /* only pel substitution for TL */
+ if(!get_bits)
+ {
+ pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+ pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+ }
+ }
+ else
+ {
+ get_bits = GET_BIT(nbr_flags, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ UWORD16 *pu2_dst;
+ /* 4 pel substitution (other than TL) */
+ pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+ ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT / 2)) ? 1 : 4;
+ }
+ }
+ }
+}
+
+
+void ihevc_intra_pred_luma_ref_substitution_a9q(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+ UWORD8 pu1_ref;
+ WORD32 dc_val, i;
+ WORD32 total_samples = (4 * nt) + 1;
+ WORD32 two_nt = 2 * nt;
+
+ WORD32 three_nt = 3 * nt;
+ WORD32 get_bits;
+ WORD32 next;
+ WORD32 bot_left, left, top, tp_right, tp_left;
+
+ WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+ UNUSED(dst_strd);
+ /*dc_val = 1 << (BIT_DEPTH - 1);*/
+ dc_val = 1 << (8 - 1);
+
+
+ /* Neighbor Flag Structure*/
+ /* MSB ---> LSB */
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ 1 4 4 4 4
+ */
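+    /* e.g. nbr_flags == 0x1FFFF means all neighbours are available
+       (nothing to substitute), while nbr_flags == 0x00008 means only
+       the bottom-left group is. */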
+ /* If no neighbor flags are present, fill the neighbor samples with DC value */
+ if(nbr_flags == 0)
+ {
+ for(i = 0; i < total_samples; i++)
+ {
+ pu1_dst[i] = dc_val;
+ }
+ }
+ else
+ {
+ if(nt <= 8)
+ {
+ /* 1 bit extraction for all the neighboring blocks */
+ tp_left = (nbr_flags & 0x10000) >> 16;
+ bot_left = (nbr_flags & 0x8) >> 3;
+ left = (nbr_flags & 0x80) >> 7;
+ top = (nbr_flags & 0x100) >> 8;
+ tp_right = (nbr_flags & 0x1000) >> 12;
+
+ /* Else fill the corresponding samples */
+ if(tp_left)
+ pu1_dst[two_nt] = *pu1_top_left;
+ else
+ pu1_dst[two_nt] = 0;
+
+
+ if(left)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_a9q(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+ }
+
+
+ if(bot_left)
+ {
+ for(i = nt; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_a9q(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+ }
+
+
+ if(top)
+ {
+ ihevc_memcpy_a9q(&pu1_dst[two_nt + 1], pu1_top, nt);
+ }
+ else
+ {
+ ihevc_memset_a9q(&pu1_dst[two_nt + 1], 0, nt);
+ }
+
+ if(tp_right)
+ {
+ ihevc_memcpy_a9q(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+ }
+ else
+ {
+ ihevc_memset_a9q(&pu1_dst[two_nt + 1 + nt], 0, nt);
+ }
+ next = 1;
+
+ /* If bottom -left is not available, reverse substitution process*/
+ if(bot_left == 0)
+ {
+ WORD32 a_nbr_flag[5];
+ a_nbr_flag[0] = bot_left;
+ a_nbr_flag[1] = left;
+ a_nbr_flag[2] = tp_left;
+ a_nbr_flag[3] = top;
+ a_nbr_flag[4] = tp_right;
+
+ /* Check for the 1st available sample from bottom-left*/
+ while(!a_nbr_flag[next])
+ next++;
+
+ /* If Left, top-left are available*/
+ if(next <= 2)
+ {
+ idx = nt * next;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ else /* If top, top-right are available */
+ {
+ /* Idx is changed to copy 1 pixel value for top-left ,if top-left is not available*/
+ idx = (nt * (next - 1)) + 1;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* Forward Substitution Process */
+ /* If left is Unavailable, copy the last bottom-left value */
+ if(left == 0)
+ {
+ ihevc_memset_a9q(&pu1_dst[nt], pu1_dst[nt - 1], nt);
+
+ }
+ /* If top-left is Unavailable, copy the last left value */
+ if(tp_left == 0)
+ pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+ /* If top is Unavailable, copy the last top-left value */
+ if(top == 0)
+ {
+ ihevc_memset_a9q(&pu1_dst[two_nt + 1], pu1_dst[two_nt], nt);
+ }
+            /* If top-right is unavailable, copy the last top value */
+ if(tp_right == 0)
+ {
+ ihevc_memset_a9q(&pu1_dst[three_nt + 1], pu1_dst[three_nt], nt);
+
+ }
+ }
+
+ if(nt == 16)
+ {
+ WORD32 nbr_flags_temp = 0;
+ nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+ + ((nbr_flags & 0x300) >> 4)
+ + ((nbr_flags & 0x3000) >> 6)
+ + ((nbr_flags & 0x10000) >> 8);
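+            /* The remap packs the 17 input flag bits down to 9 so that, for
+               nt = 16, one bit covers one 8-pel group: bits 0-1 bottom-left,
+               bits 2-3 left, bits 4-5 top, bits 6-7 top-right, bit 8 top-left.
+               Worked example: nbr_flags = 0x1FFFF remaps to 0x1FF. */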
+
+ /* Else fill the corresponding samples */
+ if(nbr_flags & 0x10000)
+ pu1_dst[two_nt] = *pu1_top_left;
+ else
+ pu1_dst[two_nt] = 0;
+
+ if(nbr_flags & 0xC0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+ }
+
+ if(nbr_flags & 0xC)
+ {
+ for(i = nt; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+ }
+
+
+ if(nbr_flags & 0x300)
+ {
+ ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1], pu1_top, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1], 0, nt);
+ }
+
+ if(nbr_flags & 0x3000)
+ {
+ ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], 0, nt);
+ }
+            /* compute trailing zeros based on nbr_flags for the substitution process of the below-left samples */
+            /* each bit in the remapped nbr flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ nbr_id_from_bl = 32;
+
+ if(nbr_id_from_bl == 32)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags_temp >> 8) & 0x1))
+ {
+ nbr_id_from_bl++;
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
+ //nbr_id_from_bl += idx * 8;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ {
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T16_4NT) + 1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 8 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T16_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags_temp, 8);
+
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ ihevc_memset_mul_8_a9q(pu1_dst + nbr_id_from_bl, pu1_ref, 8);
+
+
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+ }
+
+
+ }
+
+ if(nt == 32)
+ {
+ /* Else fill the corresponding samples */
+ if(nbr_flags & 0x10000)
+ pu1_dst[two_nt] = *pu1_top_left;
+ else
+ pu1_dst[two_nt] = 0;
+
+ if(nbr_flags & 0xF0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+ }
+
+ if(nbr_flags & 0xF)
+ {
+ for(i = nt; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+ }
+
+
+ if(nbr_flags & 0xF00)
+ {
+ ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1], pu1_top, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1], 0, nt);
+ }
+
+ if(nbr_flags & 0xF000)
+ {
+ ihevc_memcpy_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8_a9q(&pu1_dst[two_nt + 1 + nt], 0, nt);
+ }
+            /* compute trailing zeros based on nbr_flags for the substitution process of the below-left samples */
+            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags >> 16) & 0x1))
+ {
+ /* top left not available */
+ nbr_id_from_bl++;
+ /* top and top right; 8 pels per nbr bit */
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T32_4NT) + 1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 8 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T32_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags, 16);
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ ihevc_memset_mul_8_a9q(&pu1_dst[nbr_id_from_bl], pu1_ref, 8);
+
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+ }
+ }
+
+ }
+}
diff --git a/common/arm/ihevc_itrans_recon_16x16.s b/common/arm/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..82055ad
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1141 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ihevc_itrans_recon_16x16.s
+@ *
+@ * @brief
+@ * contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * anand s
+@ *
+@ * @par list of functions:
+@ * - ihevc_itrans_recon_16x16()
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * this function performs inverse transform and reconstruction for a 16x16
+@ * input block
+@ *
+@ * @par description:
+@ * performs inverse transform and adds the prediction data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ * input 16x16 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ * temporary 16x16 buffer for storing inverse transform
+@ * 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 16x16 block
+@ *
+@ * @param[out] pu1_dst
+@ * output 16x16 block
+@ *
+@ * @param[in] src_strd
+@ * input stride
+@ *
+@ * @param[in] pred_strd
+@ * prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * output stride
+@ *
+@ * @param[in] r12
+@ * zero columns in pi2_src
+@ *
+@ * @param[in] r11
+@ * zero rows in pi2_src
+@ *
+@ * @returns void
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@ */
+
+@void ihevc_itrans_recon_16x16(word16 *pi2_src,
+@ word16 *pi2_tmp,
+@ uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 r12,
+@ word32 r11 )
+
+@**************variables vs registers*************************
+@ r0 => *pi2_src
+@ r1 => *pi2_tmp
+@ r2 => *pu1_pred
+@ r3 => *pu1_dst
+@ src_strd
+@ pred_strd
+@ dst_strd
+@ r12
+@ r11
+
+.text
+.align 4
+
+
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+@#define zero_cols r12
+@#define zero_rows r11
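+@ zero_cols / zero_rows (r12 / r11) are 16-bit masks in which a set bit marks
+@ a column / row of pi2_src that is entirely zero, so its loads and
+@ multiply-accumulates can be skipped. The compares below test whole ranges
+@ at once; a C sketch of the intent (illustrative only):
+@
+@     if ((zero_cols & 0xffff) >= 0xfff0)      /* cols 4-15 zero: do 4  */
+@     else if ((zero_cols & 0xffff) >= 0xff00) /* cols 8-15 zero: do 8  */
+@     else                                     /* process all 16 cols   */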
+.globl ihevc_itrans_recon_16x16_a9q
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+g_ai2_ihevc_trans_16_transpose_addr:
+.long g_ai2_ihevc_trans_16_transpose - ulbl1 - 8
+
+.type ihevc_itrans_recon_16x16_a9q, %function
+
+ihevc_itrans_recon_16x16_a9q:
+
+ stmfd sp!,{r4-r12,lr}
+@ add sp,sp,#40
+
+
+
+@ ldr r8,[sp,#4] @ prediction stride
+@ ldr r7,[sp,#8] @ destination stride
+ ldr r6,[sp,#40] @ src stride
+ ldr r12,[sp,#52]
+ ldr r11,[sp,#56]
+
+
+
+ ldr r14,g_ai2_ihevc_trans_16_transpose_addr
+ulbl1:
+ add r14,r14,pc
+ vld1.16 {d0,d1,d2,d3},[r14] @//d0,d1 are used for storing the constant data
+ movw r7,#0xffff
+ and r12,r12,r7
+ and r11,r11,r7
+ mov r6,r6,lsl #1 @ x sizeof(word16)
+ add r9,r0,r6, lsl #1 @ 2 rows
+
+ add r10,r6,r6, lsl #1 @ 3 rows
+ add r5,r6,r6,lsl #2
+ movw r7,#0xfff0
+
+ cmp r12,r7
+ bge zero_12cols_decision
+
+ cmp r12,#0xff00
+ bge zero_8cols_decision
+
+
+
+
+ mov r14,#4
+ cmp r11,r7
+ rsbge r10,r6,#0
+
+ cmp r11,#0xff00
+ movge r8,r5
+ rsbge r8,r8,#0
+ movlt r8,r10
+ add r5,r5,r6,lsl #3
+ rsb r5,r5,#0
+
+ b first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+ mov r14,#1
+ cmp r11,#0xff00
+ movge r8,r5
+ movlt r8,r10
+ add r5,r5,r6,lsl #3
+ rsb r5,r5,#0
+
+ b first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+ mov r14,#2
+ mov r8,r5
+ rsb r8,r8,#0
+ cmp r11,#0xff00
+ movlt r8,r10
+ add r5,r5,r6,lsl #3
+ rsb r5,r5,#0
+ cmp r11,r7
+ rsbge r10,r6,#0
+
+
+ b first_stage_top_four_bottom_four
+
+
+@d0[0]= 64 d2[0]=64
+@d0[1]= 90 d2[1]=57
+@d0[2]= 89 d2[2]=50
+@d0[3]= 87 d2[3]=43
+@d1[0]= 83 d3[0]=36
+@d1[1]= 80 d3[1]=25
+@d1[2]= 75 d3[2]=18
+@d1[3]= 70 d3[3]=9
+
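+@ Stage 1 below is the even/odd (partial butterfly) decomposition of the
+@ 16-point inverse DCT: q6-q9 accumulate the even half (a0..a3) from the
+@ even input rows, q12-q15 the odd half (b0..b3) from the odd rows, and the
+@ results are combined as (a +/- b + rnd) >> shift_stage1_idct by the
+@ vadd/vsub/vqrshrn group at skip_last12rows_kernel1.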
+
+
+first_stage:
+ add r0,r0,#8
+ add r9,r9,#8
+
+first_stage_top_four_bottom_four:
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d11,[r9],r6
+ vld1.16 d6,[r0],r10
+ vld1.16 d7,[r9],r10
+ cmp r11,r7
+ bge skip_load4rows
+
+ vld1.16 d4,[r0],r6
+ vld1.16 d5,[r9],r6
+ vld1.16 d8,[r0],r8
+ vld1.16 d9,[r9],r8
+
+@ registers used: q0,q1,q3,q5,q2,q4
+
+@ d10 =r0
+@d6= r1
+@d11=r2
+@d7=r3
+
+skip_load4rows:
+ vmull.s16 q12,d6,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d7,d2[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d7,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d2[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ vmull.s16 q6,d10,d0[0]
+ vmlal.s16 q6,d11,d0[2]
+ vmull.s16 q7,d10,d0[0]
+ vmlal.s16 q7,d11,d1[2]
+ vmull.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d2[2]
+ vmull.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d3[2]
+
+ bge skip_last12rows_kernel1
+
+
+ vmlal.s16 q12,d8,d1[1]
+ vmlal.s16 q13,d8,d3[3]
+ vmlsl.s16 q14,d8,d1[3]
+ vmlsl.s16 q15,d8,d0[3]
+
+
+ vmlal.s16 q12,d9,d1[3]
+ vmlsl.s16 q13,d9,d2[3]
+ vmlsl.s16 q14,d9,d0[3]
+ vmlal.s16 q15,d9,d3[3]
+
+
+
+
+
+ vmlal.s16 q6,d4,d1[0]
+ vmlal.s16 q6,d5,d1[2]
+ vmlal.s16 q7,d4,d3[0]
+ vmlsl.s16 q7,d5,d3[2]
+ vmlsl.s16 q8,d4,d3[0]
+ vmlsl.s16 q8,d5,d0[2]
+ vmlsl.s16 q9,d4,d1[0]
+ vmlsl.s16 q9,d5,d2[2]
+
+@d0[0]= 64 d2[0]=64
+@d0[1]= 90 d2[1]=57
+@d0[2]= 89 d2[2]=50
+@d0[3]= 87 d2[3]=43
+@d1[0]= 83 d3[0]=36
+@d1[1]= 80 d3[1]=25
+@d1[2]= 75 d3[2]=18
+@d1[3]= 70 d3[3]=9
+ cmp r11,#0xff00
+ bge skip_last12rows_kernel1
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d11,[r9],r6
+ vld1.16 d6,[r0],r10
+ vld1.16 d7,[r9],r10
+ vld1.16 d4,[r0],r6
+ vld1.16 d5,[r9],r6
+ vld1.16 d8,[r0],r5
+ vld1.16 d9,[r9],r5
+
+
+
+
+ vmlal.s16 q12,d6,d2[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d6,d1[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d6,d3[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d6,d0[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d0[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d7,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d7,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ vmlal.s16 q12,d8,d3[1]
+ vmlsl.s16 q13,d8,d1[3]
+ vmlal.s16 q14,d8,d0[1]
+ vmlsl.s16 q15,d8,d1[1]
+
+
+ vmlal.s16 q12,d9,d3[3]
+ vmlsl.s16 q13,d9,d3[1]
+ vmlal.s16 q14,d9,d2[3]
+ vmlsl.s16 q15,d9,d2[1]
+
+
+
+
+
+ vmlal.s16 q6,d10,d0[0]
+ vmlal.s16 q6,d11,d2[2]
+ vmlal.s16 q6,d4,d3[0]
+ vmlal.s16 q6,d5,d3[2]
+
+
+
+
+ vmlsl.s16 q7,d10,d0[0]
+ vmlsl.s16 q7,d11,d0[2]
+ vmlsl.s16 q7,d4,d1[0]
+ vmlsl.s16 q7,d5,d2[2]
+
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d3[2]
+ vmlal.s16 q8,d4,d1[0]
+ vmlal.s16 q8,d5,d1[2]
+
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d1[2]
+ vmlsl.s16 q9,d4,d3[0]
+ vmlsl.s16 q9,d5,d0[2]
+
+skip_last12rows_kernel1:
+ vadd.s32 q10,q6,q12
+ vsub.s32 q11,q6,q12
+
+ vadd.s32 q6,q7,q13
+ vsub.s32 q12,q7,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+
+
+
+
+
+ vqrshrn.s32 d30,q10,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q11,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q7,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d18,q13,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q6,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q12,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q8,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q14,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ vst1.16 {d30,d31},[r1]!
+ vst1.16 {d18,d19},[r1]!
+ sub r1,r1,#32
+
+ bge skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d11,[r9],r6
+ vld1.16 d6,[r0],r10
+ vld1.16 d7,[r9],r10
+ vld1.16 d4,[r0],r6
+ vld1.16 d5,[r9],r6
+ vld1.16 d8,[r0],r8
+ vld1.16 d9,[r9],r8
+
+
+skip_stage1_kernel_load:
+ vmull.s16 q12,d6,d2[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d2[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d3[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d3[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d7,d1[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d0[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d7,d1[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d3[2]
+ vmull.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d2[2]
+ vmull.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d1[2]
+ vmull.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d0[2]
+
+
+ cmp r11,r7
+ bge skip_last12rows_kernel2
+
+ vmlsl.s16 q12,d8,d3[1]
+ vmlal.s16 q13,d8,d2[1]
+ vmlal.s16 q14,d8,d0[1]
+ vmlal.s16 q15,d8,d2[3]
+
+
+ vmlal.s16 q12,d9,d0[1]
+ vmlal.s16 q13,d9,d3[1]
+ vmlsl.s16 q14,d9,d1[1]
+ vmlsl.s16 q15,d9,d2[1]
+
+
+
+ vmlsl.s16 q11,d4,d1[0]
+ vmlal.s16 q11,d5,d2[2]
+ vmlsl.s16 q10,d4,d3[0]
+ vmlal.s16 q10,d5,d0[2]
+ vmlal.s16 q8,d4,d3[0]
+ vmlal.s16 q8,d5,d3[2]
+ vmlal.s16 q9,d4,d1[0]
+ vmlsl.s16 q9,d5,d1[2]
+
+@d0[0]= 64 d2[0]=64
+@d0[1]= 90 d2[1]=57
+@d0[2]= 89 d2[2]=50
+@d0[3]= 87 d2[3]=43
+@d1[0]= 83 d3[0]=36
+@d1[1]= 80 d3[1]=25
+@d1[2]= 75 d3[2]=18
+@d1[3]= 70 d3[3]=9
+ cmp r11,#0xff00
+ bge skip_last12rows_kernel2
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d11,[r9],r6
+ vld1.16 d6,[r0],r10
+ vld1.16 d7,[r9],r10
+ vld1.16 d4,[r0],r6
+ vld1.16 d5,[r9],r6
+ vld1.16 d8,[r0],r5
+ vld1.16 d9,[r9],r5
+
+
+ vmlsl.s16 q12,d6,d3[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d6,d2[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d7,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d7,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ vmlal.s16 q12,d8,d2[3]
+ vmlal.s16 q13,d8,d3[3]
+ vmlsl.s16 q14,d8,d2[1]
+ vmlal.s16 q15,d8,d0[3]
+
+
+ vmlal.s16 q12,d9,d1[3]
+ vmlsl.s16 q13,d9,d1[1]
+ vmlal.s16 q14,d9,d0[3]
+ vmlsl.s16 q15,d9,d0[1]
+
+
+
+
+ vmlal.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d1[2]
+ vmlsl.s16 q11,d4,d3[0]
+ vmlal.s16 q11,d5,d0[2]
+
+
+
+ vmlsl.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d3[2]
+ vmlal.s16 q10,d4,d1[0]
+ vmlsl.s16 q10,d5,d1[2]
+
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d0[2]
+ vmlsl.s16 q8,d4,d1[0]
+ vmlal.s16 q8,d5,d2[2]
+
+
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d2[2]
+ vmlal.s16 q9,d4,d3[0]
+ vmlsl.s16 q9,d5,d3[2]
+
+skip_last12rows_kernel2:
+
+ vadd.s32 q2,q11,q12
+ vsub.s32 q11,q11,q12
+
+ vadd.s32 q3,q10,q13
+ vsub.s32 q12,q10,q13
+
+ vadd.s32 q5,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d18,q2,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q11,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q5,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d30,q13,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d20,q3,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d23,q12,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d21,q8,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d22,q14,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ @ registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+
+
+
+
+
+ vld1.16 {d4,d5},[r1]!
+ vld1.16 {d8,d9},[r1]!
+ sub r1,r1,#32
+
+@d4=r0
+@d12=r1
+@d5=r2
+@d13=r3
+
+@d18=r4
+@d20=r5
+@d19=r6
+@d21=r7
+
+@d22=r8
+@d30=r9
+@d23=r10
+@d31=r11
+
+@d14=r12
+@d8=r13
+@d15=r14
+@d9=r15
+
+
+ vtrn.16 q2,q6
+ vtrn.16 q9,q10
+ vtrn.16 q11,q15
+ vtrn.16 q7,q4
+
+
+
+ vtrn.32 d4,d5
+ vtrn.32 d12,d13
+
+ vtrn.32 d18,d19
+ vtrn.32 d20,d21
+
+ vtrn.32 d22,d23
+ vtrn.32 d30,d31
+
+ vtrn.32 d14,d15
+ vtrn.32 d8,d9
+
+
+@ d4 =r0 1- 4 values
+@ d5 =r2 1- 4 values
+@ d12=r1 1- 4 values
+@ d13=r3 1- 4 values
+
+@ d18 =r0 5- 8 values
+@ d19 =r2 5- 8 values
+@ d20=r1 5- 8 values
+@ d21=r3 5- 8 values
+
+@ d22 =r0 9- 12 values
+@ d23 =r2 9- 12 values
+@ d30=r1 9- 12 values
+@ d31=r3 9- 12 values
+
+@ d14 =r0 13-16 values
+@ d15 =r2 13- 16 values
+@ d8=r1 13- 16 values
+@ d9=r3 13- 16 values
+
+
+ vst1.16 {q2},[r1]!
+ vst1.16 {q6},[r1]!
+
+ vst1.16 {q9},[r1]!
+ vst1.16 {q10},[r1]!
+ vst1.16 {q11},[r1]!
+ vst1.16 {q15},[r1]!
+ vst1.16 {q7},[r1]!
+ vst1.16 {q4},[r1]!
+
+
+ subs r14,r14,#1
+ bne first_stage
+
+
+
+
+
+
+
+
+
+
+ mov r6,r7
+
+ ldr r8,[sp,#44] @ prediction stride
+ ldr r7,[sp,#48] @ destination stride
+
+ mov r10,#16
+
+ cmp r12,r6
+ subge r1,r1,#128
+ bge label1
+
+ cmp r12,#0xff00
+ subge r1,r1,#256
+ bge label_2
+
+ sub r1,r1,#512
+ rsb r10,r10,#0
+
+label_2:
+ add r9,r1,#128
+ add r11,r9,#128
+ add r0,r11,#128
+
+
+
+label1:
+@ mov r6,r1
+
+
+ mov r14,#4
+ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
+ add r5,r8,r8, lsl #1 @
+@ add r0,r3,r7, lsl #1 @ r0 points to 3rd row of dest data
+@ add r10,r7,r7, lsl #1 @
+
+
+
+
+second_stage:
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d6,d7},[r1],r10
+ cmp r12,r6
+ bge second_stage_process
+ vld1.16 {d4,d5},[r9]!
+ vld1.16 {d8,d9},[r9],r10
+
+second_stage_process:
+
+
+ vmull.s16 q12,d6,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d7,d2[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d7,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d2[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ vmull.s16 q6,d10,d0[0]
+ vmlal.s16 q6,d11,d0[2]
+ vmull.s16 q7,d10,d0[0]
+ vmlal.s16 q7,d11,d1[2]
+ vmull.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d2[2]
+ vmull.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d3[2]
+
+ bge skip_last8rows_stage2_kernel1
+
+ vmlal.s16 q12,d8,d1[1]
+ vmlal.s16 q13,d8,d3[3]
+ vmlsl.s16 q14,d8,d1[3]
+ vmlsl.s16 q15,d8,d0[3]
+
+
+ vmlal.s16 q12,d9,d1[3]
+ vmlsl.s16 q13,d9,d2[3]
+ vmlsl.s16 q14,d9,d0[3]
+ vmlal.s16 q15,d9,d3[3]
+
+
+ vmlal.s16 q6,d4,d1[0]
+ vmlal.s16 q6,d5,d1[2]
+ vmlal.s16 q7,d4,d3[0]
+ vmlsl.s16 q7,d5,d3[2]
+ vmlsl.s16 q8,d4,d3[0]
+ vmlsl.s16 q8,d5,d0[2]
+ vmlsl.s16 q9,d4,d1[0]
+ vmlsl.s16 q9,d5,d2[2]
+
+ cmp r12,#0xff00
+ bge skip_last8rows_stage2_kernel1
+
+
+ vld1.16 {d10,d11},[r11]!
+ vld1.16 {d6,d7},[r11],r10
+ vld1.16 {d4,d5},[r0]!
+ vld1.16 {d8,d9},[r0],r10
+
+
+
+
+
+ vmlal.s16 q12,d6,d2[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d6,d1[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d6,d3[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d6,d0[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d0[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d7,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d7,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ vmlal.s16 q12,d8,d3[1]
+ vmlsl.s16 q13,d8,d1[3]
+ vmlal.s16 q14,d8,d0[1]
+ vmlsl.s16 q15,d8,d1[1]
+
+
+ vmlal.s16 q12,d9,d3[3]
+ vmlsl.s16 q13,d9,d3[1]
+ vmlal.s16 q14,d9,d2[3]
+ vmlsl.s16 q15,d9,d2[1]
+
+
+
+
+
+ vmlal.s16 q6,d10,d0[0]
+ vmlal.s16 q6,d11,d2[2]
+ vmlal.s16 q6,d4,d3[0]
+ vmlal.s16 q6,d5,d3[2]
+
+
+
+
+ vmlsl.s16 q7,d10,d0[0]
+ vmlsl.s16 q7,d11,d0[2]
+ vmlsl.s16 q7,d4,d1[0]
+ vmlsl.s16 q7,d5,d2[2]
+
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d3[2]
+ vmlal.s16 q8,d4,d1[0]
+ vmlal.s16 q8,d5,d1[2]
+
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d1[2]
+ vmlsl.s16 q9,d4,d3[0]
+ vmlsl.s16 q9,d5,d0[2]
+
+
+
+
+
+
+skip_last8rows_stage2_kernel1:
+
+
+
+ vadd.s32 q10,q6,q12
+ vsub.s32 q11,q6,q12
+
+ vadd.s32 q6,q7,q13
+ vsub.s32 q12,q7,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+
+
+
+
+
+ vqrshrn.s32 d30,q10,#shift_stage2_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q11,#shift_stage2_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q7,#shift_stage2_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d18,q13,#shift_stage2_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q6,#shift_stage2_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q12,#shift_stage2_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q8,#shift_stage2_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q14,#shift_stage2_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ bge skip_stage2_kernel_load
+
+ @q2,q4,q6,q7 is used
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d6,d7},[r1]!
+ vld1.16 {d4,d5},[r9]!
+ vld1.16 {d8,d9},[r9]!
+skip_stage2_kernel_load:
+ sub r1,r1,#32
+ vst1.16 {d30,d31},[r1]!
+ vst1.16 {d18,d19},[r1]!
+ sub r1,r1,#32
+
+ vmull.s16 q12,d6,d2[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d2[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d3[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d3[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d7,d1[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d0[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d7,d1[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d3[2]
+ vmull.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d2[2]
+ vmull.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d1[2]
+ vmull.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d0[2]
+
+
+
+ cmp r12,r6
+ bge skip_last8rows_stage2_kernel2
+
+
+ vmlsl.s16 q12,d8,d3[1]
+ vmlal.s16 q13,d8,d2[1]
+ vmlal.s16 q14,d8,d0[1]
+ vmlal.s16 q15,d8,d2[3]
+
+
+ vmlal.s16 q12,d9,d0[1]
+ vmlal.s16 q13,d9,d3[1]
+ vmlsl.s16 q14,d9,d1[1]
+ vmlsl.s16 q15,d9,d2[1]
+
+
+
+ vmlsl.s16 q11,d4,d1[0]
+ vmlal.s16 q11,d5,d2[2]
+ vmlsl.s16 q10,d4,d3[0]
+ vmlal.s16 q10,d5,d0[2]
+ vmlal.s16 q8,d4,d3[0]
+ vmlal.s16 q8,d5,d3[2]
+ vmlal.s16 q9,d4,d1[0]
+ vmlsl.s16 q9,d5,d1[2]
+ cmp r12,#0xff00
+ bge skip_last8rows_stage2_kernel2
+
+ vld1.16 {d10,d11},[r11]!
+ vld1.16 {d6,d7},[r11]!
+ vld1.16 {d4,d5},[r0]!
+ vld1.16 {d8,d9},[r0]!
+
+ vmlsl.s16 q12,d6,d3[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d6,d2[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d7,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d7,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ vmlal.s16 q12,d8,d2[3]
+ vmlal.s16 q13,d8,d3[3]
+ vmlsl.s16 q14,d8,d2[1]
+ vmlal.s16 q15,d8,d0[3]
+
+
+ vmlal.s16 q12,d9,d1[3]
+ vmlsl.s16 q13,d9,d1[1]
+ vmlal.s16 q14,d9,d0[3]
+ vmlsl.s16 q15,d9,d0[1]
+
+
+
+
+ vmlal.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d1[2]
+ vmlsl.s16 q11,d4,d3[0]
+ vmlal.s16 q11,d5,d0[2]
+
+
+
+ vmlsl.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d3[2]
+ vmlal.s16 q10,d4,d1[0]
+ vmlsl.s16 q10,d5,d1[2]
+
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d0[2]
+ vmlsl.s16 q8,d4,d1[0]
+ vmlal.s16 q8,d5,d2[2]
+
+
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d2[2]
+ vmlal.s16 q9,d4,d3[0]
+ vmlsl.s16 q9,d5,d3[2]
+
+
+skip_last8rows_stage2_kernel2:
+
+
+
+ vadd.s32 q2,q11,q12
+ vsub.s32 q11,q11,q12
+
+ vadd.s32 q3,q10,q13
+ vsub.s32 q12,q10,q13
+
+ vadd.s32 q5,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d18,q2,#shift_stage2_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q11,#shift_stage2_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q5,#shift_stage2_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d30,q13,#shift_stage2_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d20,q3,#shift_stage2_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d23,q12,#shift_stage2_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d21,q8,#shift_stage2_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d22,q14,#shift_stage2_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ vld1.16 {d4,d5},[r1]!
+ vld1.16 {d8,d9},[r1]!
+
+
+
+ @ registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+@d4=r0
+@d12=r1
+@d5=r2
+@d13=r3
+
+@d18=r4
+@d20=r5
+@d19=r6
+@d21=r7
+
+@d22=r8
+@d30=r9
+@d23=r10
+@d31=r11
+
+@d14=r12
+@d8=r13
+@d15=r14
+@d9=r15
+
+
+ vtrn.16 q2,q6
+ vtrn.16 q9,q10
+ vtrn.16 q11,q15
+ vtrn.16 q7,q4
+
+
+
+ vtrn.32 d4,d5
+ vtrn.32 d12,d13
+
+ vtrn.32 d18,d19
+ vtrn.32 d20,d21
+
+ vtrn.32 d22,d23
+ vtrn.32 d30,d31
+
+ vtrn.32 d14,d15
+ vtrn.32 d8,d9
+
+@ d4 =r0 1- 4 values
+@ d5 =r2 1- 4 values
+@ d12=r1 1- 4 values
+@ d13=r3 1- 4 values
+
+@ d18 =r0 5- 8 values
+@ d19 =r2 5- 8 values
+@ d20=r1 5- 8 values
+@ d21=r3 5- 8 values
+
+@ d22 =r0 9- 12 values
+@ d23 =r2 9- 12 values
+@ d30=r1 9- 12 values
+@ d31=r3 9- 12 values
+
+@ d14 =r0 13-16 values
+@ d15 =r2 13- 16 values
+@ d8=r1 13- 16 values
+@ d9=r3 13- 16 values
+
+
+ vswp d5,d18
+ vswp d23,d14
+ vswp d13,d20
+ vswp d31,d8
+
+@ q2: r0 1-8 values
+@ q11: r0 9-16 values
+@ q9 : r2 1-8 values
+@ q7 : r2 9-16 values
+@ q6 : r1 1- 8 values
+@ q10: r3 1-8 values
+@ q15: r1 9-16 values
+@ q4: r3 9-16 values
+
+
+@ registers free: q8,q14,q12,q13
+
+
+ vld1.8 {d16,d17},[r2],r8
+ vld1.8 {d28,d29},[r2],r5
+ vld1.8 {d24,d25},[r4],r8
+ vld1.8 {d26,d27},[r4],r5
+
+
+
+
+ vaddw.u8 q2,q2,d16
+ vaddw.u8 q11,q11,d17
+ vaddw.u8 q6,q6,d28
+ vaddw.u8 q15,q15,d29
+ vaddw.u8 q9,q9,d24
+ vaddw.u8 q7,q7,d25
+ vaddw.u8 q10,q10,d26
+ vaddw.u8 q4,q4,d27
+
+
+ vqmovun.s16 d16,q2
+ vqmovun.s16 d17,q11
+ vqmovun.s16 d28,q6
+ vqmovun.s16 d29,q15
+ vqmovun.s16 d24,q9
+ vqmovun.s16 d25,q7
+ vqmovun.s16 d26,q10
+ vqmovun.s16 d27,q4
+
+
+
+ vst1.8 {d16,d17},[r3],r7
+ vst1.8 {d28,d29},[r3],r7
+ vst1.8 {d24,d25},[r3],r7
+ vst1.8 {d26,d27},[r3],r7
+
+ subs r14,r14,#1
+
+
+
+ bne second_stage
+
+
+@ sub sp,sp,#40
+ ldmfd sp!,{r4-r12,pc}
+
+
+
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_32x32.s b/common/arm/ihevc_itrans_recon_32x32.s
new file mode 100644
index 0000000..eeb1d66
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_32x32.s
@@ -0,0 +1,2863 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ihevc_itrans_recon_32x32.s
+@ *
+@ * @brief
+@ * contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * anand s
+@ *
+@ * @par list of functions:
+@ * - ihevc_itrans_recon_32x32()
+@ *
+@ * @remarks
+@ * the input buffer is being corrupted
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * this function performs inverse transform and reconstruction for a 32x32
+@ * input block
+@ *
+@ * @par description:
+@ * performs inverse transform and adds the prediction data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ * input 32x32 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ * temporary 32x32 buffer for storing inverse transform
+@ * 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 32x32 block
+@ *
+@ * @param[out] pu1_dst
+@ * output 32x32 block
+@ *
+@ * @param[in] src_strd
+@ * input stride
+@ *
+@ * @param[in] pred_strd
+@ * prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * output stride
+@ *
+@ * @param[in] shift
+@ * output shift
+@ *
+@ * @param[in] zero_cols (r12)
+@ * zero columns in pi2_src
+@ *
+@ * @param[in] zero_rows (r11)
+@ * zero rows in pi2_src
+@ *
+@ * @returns void
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@ */
+
+@void ihevc_itrans_recon_32x32(word16 *pi2_src,
+@ word16 *pi2_tmp,
+@ uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 zero_cols,
+@ word32 zero_rows)
+
+@**************variables vs registers*************************
+@ r0 => *pi2_src
+@ r1 => *pi2_tmp
+@ r2 => *pu1_pred
+@ r3 => *pu1_dst
+@ src_strd
+@ pred_strd
+@ dst_strd
+@ zero_cols (r12)
+@ zero_rows (r11)
+
+
+@d0[0]= 64 d2[0]=83
+@d0[1]= 90 d2[1]=82
+@d0[2]= 90 d2[2]=80
+@d0[3]= 90 d2[3]=78
+@d1[0]= 89 d3[0]=75
+@d1[1]= 88 d3[1]=73
+@d1[2]= 87 d3[2]=70
+@d1[3]= 85 d3[3]=67
+
+@d4[0]= 64 d6[0]=36
+@d4[1]= 61 d6[1]=31
+@d4[2]= 57 d6[2]=25
+@d4[3]= 54 d6[3]=22
+@d5[0]= 50 d7[0]=18
+@d5[1]= 46 d7[1]=13
+@d5[2]= 43 d7[2]=9
+@d5[3]= 38 d7[3]=4
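+
+@ A hedged reading of the table above: the two vld1 loads in the
+@ function body cache the first 32 entries of
+@ g_ai2_ihevc_trans_32_transpose in d0-d7, where they are addressed as
+@ scalar lanes by the vmull/vmlal/vmlsl instructions below. As a C
+@ sketch (the array name is illustrative):
+@
+@ /*
+@ static const int16_t coeffs[32] = {
+@     64, 90, 90, 90, 89, 88, 87, 85,   // d0, d1
+@     83, 82, 80, 78, 75, 73, 70, 67,   // d2, d3
+@     64, 61, 57, 54, 50, 46, 43, 38,   // d4, d5
+@     36, 31, 25, 22, 18, 13,  9,  4 }; // d6, d7
+@ */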
+
+.text
+.align 4
+
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
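+@ 7 and 12 are the usual HEVC inverse-transform shifts for 8-bit output
+@ (the second-stage shift is 20 - bit_depth). vqrshrn folds the rounding
+@ offset into the shift, so each narrowing below amounts to, in C terms:
+@
+@ /* out = clip16((acc + (1 << (shift - 1))) >> shift); */
+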
+@#define zero_cols r12
+@#define zero_rows r11
+
+.globl ihevc_itrans_recon_32x32_a9q
+
+.extern g_ai2_ihevc_trans_32_transpose
+
+g_ai2_ihevc_trans_32_transpose_addr:
+.long g_ai2_ihevc_trans_32_transpose - ulbl1 - 8
+
+r5_addr: .word 0xfffff000
+r9_addr: .word 0xffff0000
+
+.type ihevc_itrans_recon_32x32_a9q, %function
+
+ihevc_itrans_recon_32x32_a9q:
+
+ stmfd sp!,{r0-r12,lr}
+
+
+@ldr r8,[sp,#56] @ prediction stride
+@ldr r7,[sp,#64] @ destination stride
+ ldr r6,[sp,#56] @ src stride
+ ldr r12,[sp,#68] @ zero_cols
+ ldr r11,[sp,#72] @ zero_rows
+ mov r6,r6,lsl #1 @ x sizeof(word16)
+ add r10,r6,r6, lsl #1 @ 3 rows
+
+
+ mov r8,r0
+
+ ldr r14,g_ai2_ihevc_trans_32_transpose_addr
+ulbl1:
+ add r14,r14,pc
+ vld1.16 {d0,d1,d2,d3},[r14]!
+ vld1.16 {d4,d5,d6,d7},[r14]!
+
+@ registers free for scratch here: r5, r7, r9, r10
+ mov r9,#0xffffff00
+ mov r10,#0xfffffff0
+ ldr r5,r5_addr
+ ldr r7,r9_addr
+ cmp r12,r10
+ movhs r14,#1
+ bhs stage1
+
+
+ cmp r12,r9
+ movhs r14,#2
+ bhs stage1
+
+ cmp r12,r5
+ movhs r14,#3
+ bhs stage1
+
+ cmp r12,r7
+ movhs r14,#4
+ bhs stage1
+
+ mov r14,#8
+ b stage1
+@.ltorg
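+
+@ A hedged C sketch of the dispatch above: zero_cols (r12) appears to
+@ carry one bit per all-zero column of pi2_src, so the number of
+@ 4-column stage-1 passes (r14) shrinks when only the leading columns
+@ carry coefficients:
+@
+@ /*
+@ unsigned loops;                                 // stage-1 passes
+@ if      (zero_cols >= 0xfffffff0u) loops = 1;   // only cols 0-3
+@ else if (zero_cols >= 0xffffff00u) loops = 2;   // only cols 0-7
+@ else if (zero_cols >= 0xfffff000u) loops = 3;   // only cols 0-11
+@ else if (zero_cols >= 0xffff0000u) loops = 4;   // only cols 0-15
+@ else                               loops = 8;   // all 32 columns
+@ */
+@
+@ Inside each pass, zero_rows (r11) is compared against the same masks
+@ (the bhs shift1 branches) so accumulation stops early once the
+@ remaining rows are known to be zero.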
+
+
+dct_stage1:
+ add r8,r8,#8
+ mov r0,r8
+
+stage1:
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+ vmull.s16 q12,d8,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d2[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d5[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlal.s16 q10,d11,d0[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlal.s16 q11,d11,d1[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d2[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d3[2]
+ cmp r11,r10
+ bhs shift1
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d1[1]
+ vmlal.s16 q13,d14,d3[3]
+ vmlal.s16 q14,d14,d6[1]
+ vmlsl.s16 q15,d14,d7[1]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlal.s16 q13,d15,d5[1]
+ vmlsl.s16 q14,d15,d7[1]
+ vmlsl.s16 q15,d15,d3[3]
+
+
+ vmlal.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d1[2]
+ vmlal.s16 q11,d12,d3[0]
+ vmlal.s16 q11,d13,d4[2]
+ vmlal.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d7[2]
+ vmlal.s16 q9,d12,d7[0]
+ vmlsl.s16 q9,d13,d5[2]
+
+ cmp r11,r9
+ bhs shift1
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+ vmlal.s16 q12,d8,d2[1] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d6[3] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d4[3] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d0[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d7[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d2[0]
+ vmlal.s16 q10,d11,d2[2]
+
+
+ vmlal.s16 q11,d10,d6[0]
+ vmlal.s16 q11,d11,d7[2]
+
+ vmlsl.s16 q8,d10,d6[0]
+ vmlsl.s16 q8,d11,d3[2]
+
+ vmlsl.s16 q9,d10,d2[0]
+ vmlsl.s16 q9,d11,d1[2]
+
+ cmp r11,r5
+ bhs shift1
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d3[1]
+ vmlsl.s16 q13,d14,d6[1]
+ vmlsl.s16 q14,d14,d0[1]
+ vmlsl.s16 q15,d14,d6[3]
+
+
+ vmlal.s16 q12,d15,d3[3]
+ vmlsl.s16 q13,d15,d4[3]
+ vmlsl.s16 q14,d15,d2[3]
+ vmlal.s16 q15,d15,d5[3]
+
+
+ vmlal.s16 q10,d12,d3[0]
+ vmlal.s16 q10,d13,d3[2]
+ vmlsl.s16 q11,d12,d7[0]
+ vmlsl.s16 q11,d13,d5[2]
+ vmlsl.s16 q8,d12,d1[0]
+ vmlsl.s16 q8,d13,d1[2]
+ vmlsl.s16 q9,d12,d5[0]
+ vmlal.s16 q9,d13,d7[2]
+
+ cmp r11,r7
+ bhs shift1
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+ vmlal.s16 q12,d8,d4[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d3[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d5[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d2[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d4[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d7[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlal.s16 q10,d11,d4[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d2[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d6[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d0[2]
+
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+ vmlal.s16 q12,d14,d5[1]
+ vmlsl.s16 q13,d14,d0[2]
+ vmlal.s16 q14,d14,d5[3]
+ vmlal.s16 q15,d14,d4[3]
+
+
+ vmlal.s16 q12,d15,d5[3]
+ vmlsl.s16 q13,d15,d1[1]
+ vmlal.s16 q14,d15,d3[1]
+ vmlsl.s16 q15,d15,d7[3]
+
+
+ vmlal.s16 q10,d12,d5[0]
+ vmlal.s16 q10,d13,d5[2]
+ vmlsl.s16 q11,d12,d1[0]
+ vmlsl.s16 q11,d13,d0[2]
+ vmlal.s16 q8,d12,d7[0]
+ vmlal.s16 q8,d13,d4[2]
+ vmlal.s16 q9,d12,d3[0]
+ vmlal.s16 q9,d13,d6[2]
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+
+
+
+ vmlal.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d2[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d0[1] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d4[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d6[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d1[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d0[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d6[0]
+ vmlal.s16 q10,d11,d6[2]
+
+
+ vmlsl.s16 q11,d10,d2[0]
+ vmlsl.s16 q11,d11,d3[2]
+
+ vmlal.s16 q8,d10,d2[0]
+ vmlal.s16 q8,d11,d0[2]
+
+ vmlsl.s16 q9,d10,d6[0]
+ vmlsl.s16 q9,d11,d2[2]
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+ vmlal.s16 q12,d14,d7[1]
+ vmlsl.s16 q13,d14,d5[3]
+ vmlal.s16 q14,d14,d4[1]
+ vmlsl.s16 q15,d14,d2[3]
+
+
+ vmlal.s16 q12,d15,d7[3]
+ vmlsl.s16 q13,d15,d7[1]
+ vmlal.s16 q14,d15,d6[3]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlal.s16 q10,d12,d7[0]
+ vmlal.s16 q10,d13,d7[2]
+ vmlsl.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d6[2]
+ vmlal.s16 q8,d12,d3[0]
+ vmlal.s16 q8,d13,d5[2]
+ vmlsl.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d4[2]
+
+
+
+shift1:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q5,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q7,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d18,q13,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q6,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q12,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q8,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q14,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ @ registers used: q15, q6, q7, q9
+
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+@ d30 = r0 1-4 values
+@ d31 = r2 1-4 values
+@ d12 = r1 1-4 values
+@ d13 = r3 1-4 values
+@ d14 = r0 28-31 values
+@ d15 = r2 28-31 values
+@ d18 = r1 28-31 values
+@ d19 = r3 28-31 values
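+
+@ The mirrored grouping above falls out of the even/odd decomposition
+@ of the 32-point IDCT: each pass yields output k together with output
+@ 31-k. A hedged C sketch, with e[]/o[] standing for the even/odd
+@ accumulators built up before shift1:
+@
+@ /*
+@ for (k = 0; k < 4; k++) {
+@     tmp_row[k]      = clip16((e[k] + o[k] + 64) >> 7);   // 64 = 1 << 6
+@     tmp_row[31 - k] = clip16((e[k] - o[k] + 64) >> 7);
+@ }
+@ */
+@
+@ The add/sub of 192 and 224 bytes in the stores below step between the
+@ leading group and its mirror inside the 64-byte rows of pi2_tmp.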
+
+
+
+ vst1.16 {q15},[r1]!
+ vst1.16 {q6},[r1]!
+ add r1,r1,#192
+ vst1.16 {q7},[r1]!
+ vst1.16 {q9},[r1]!
+ sub r1,r1,#224
+
+ mov r0,r8
+
+
+
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+ vmull.s16 q12,d8,d2[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d2[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d3[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d3[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d6[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d7[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d6[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d4[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlal.s16 q10,d11,d4[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlal.s16 q11,d11,d5[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d6[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d7[2]
+ cmp r11,r10
+ bhs shift2
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+ vmlsl.s16 q12,d14,d4[3]
+ vmlsl.s16 q13,d14,d2[1]
+ vmlsl.s16 q14,d14,d0[1]
+ vmlsl.s16 q15,d14,d2[3]
+
+
+ vmlsl.s16 q12,d15,d0[3]
+ vmlsl.s16 q13,d15,d3[1]
+ vmlsl.s16 q14,d15,d6[3]
+ vmlal.s16 q15,d15,d5[3]
+
+
+ vmlsl.s16 q10,d12,d7[0]
+ vmlsl.s16 q10,d13,d2[2]
+ vmlsl.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d0[2]
+ vmlsl.s16 q8,d12,d3[0]
+ vmlsl.s16 q8,d13,d3[2]
+ vmlsl.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d6[2]
+
+ cmp r11,r9
+ bhs shift2
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+
+
+
+ vmlsl.s16 q12,d8,d4[1] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d7[1] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d2[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d7[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d6[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d2[0]
+ vmlsl.s16 q10,d11,d6[2]
+
+
+ vmlsl.s16 q11,d10,d6[0]
+ vmlal.s16 q11,d11,d4[2]
+
+ vmlal.s16 q8,d10,d6[0]
+ vmlal.s16 q8,d11,d0[2]
+
+ vmlal.s16 q9,d10,d2[0]
+ vmlal.s16 q9,d11,d5[2]
+
+ cmp r11,r5
+ bhs shift2
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+ vmlal.s16 q12,d14,d2[3]
+ vmlal.s16 q13,d14,d3[3]
+ vmlsl.s16 q14,d14,d5[3]
+ vmlsl.s16 q15,d14,d0[3]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlsl.s16 q13,d15,d6[3]
+ vmlsl.s16 q14,d15,d0[3]
+ vmlal.s16 q15,d15,d7[3]
+
+
+ vmlal.s16 q10,d12,d5[0]
+ vmlal.s16 q10,d13,d0[2]
+ vmlal.s16 q11,d12,d1[0]
+ vmlal.s16 q11,d13,d6[2]
+ vmlal.s16 q8,d12,d7[0]
+ vmlsl.s16 q8,d13,d2[2]
+ vmlsl.s16 q9,d12,d3[0]
+ vmlsl.s16 q9,d13,d4[2]
+
+
+ cmp r11,r7
+ bhs shift2
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+
+
+
+ vmlal.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d1[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d7[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d0[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d5[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d7[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d7[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d1[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d5[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d3[2]
+
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+ vmlsl.s16 q12,d14,d0[1]
+ vmlal.s16 q13,d14,d6[1]
+ vmlal.s16 q14,d14,d4[1]
+ vmlsl.s16 q15,d14,d1[1]
+
+
+ vmlsl.s16 q12,d15,d3[3]
+ vmlal.s16 q13,d15,d0[1]
+ vmlsl.s16 q14,d15,d5[1]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlsl.s16 q10,d12,d3[0]
+ vmlsl.s16 q10,d13,d1[2]
+ vmlsl.s16 q11,d12,d7[0]
+ vmlal.s16 q11,d13,d3[2]
+ vmlal.s16 q8,d12,d1[0]
+ vmlal.s16 q8,d13,d7[2]
+ vmlsl.s16 q9,d12,d5[0]
+ vmlsl.s16 q9,d13,d2[2]
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+ vmlal.s16 q12,d8,d7[3] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d4[3] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d2[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d3[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d5[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d7[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d5[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d6[0]
+ vmlal.s16 q10,d11,d5[2]
+
+
+ vmlal.s16 q11,d10,d2[0]
+ vmlal.s16 q11,d11,d7[2]
+
+ vmlsl.s16 q8,d10,d2[0]
+ vmlsl.s16 q8,d11,d4[2]
+
+ vmlal.s16 q9,d10,d6[0]
+ vmlal.s16 q9,d11,d1[2]
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+ vmlal.s16 q12,d14,d1[1]
+ vmlsl.s16 q13,d14,d0[3]
+ vmlal.s16 q14,d14,d1[3]
+ vmlsl.s16 q15,d14,d3[1]
+
+
+ vmlal.s16 q12,d15,d5[3]
+ vmlsl.s16 q13,d15,d5[1]
+ vmlal.s16 q14,d15,d4[3]
+ vmlsl.s16 q15,d15,d4[1]
+
+
+ vmlal.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d3[2]
+ vmlsl.s16 q11,d12,d3[0]
+ vmlsl.s16 q11,d13,d2[2]
+ vmlal.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d1[2]
+ vmlsl.s16 q9,d12,d7[0]
+ vmlsl.s16 q9,d13,d0[2]
+
+shift2:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q5,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q7,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d18,q13,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q6,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q12,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q8,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q14,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+ vst1.16 {q15},[r1]!
+ vst1.16 {q6},[r1]!
+ add r1,r1,#128
+ vst1.16 {q7},[r1]!
+ vst1.16 {q9},[r1]!
+ sub r1,r1,#160
+ mov r0,r8
+
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+ vmull.s16 q12,d8,d4[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d4[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d5[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d5[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d3[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d0[2] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d7[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d6[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d5[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d4[2]
+
+ cmp r11,r10
+ bhs shift3
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+ vmlsl.s16 q12,d14,d5[1]
+ vmlsl.s16 q13,d14,d7[3]
+ vmlal.s16 q14,d14,d5[3]
+ vmlal.s16 q15,d14,d3[1]
+
+
+ vmlal.s16 q12,d15,d2[1]
+ vmlal.s16 q13,d15,d1[1]
+ vmlal.s16 q14,d15,d4[3]
+ vmlsl.s16 q15,d15,d7[3]
+
+
+ vmlsl.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d6[2]
+ vmlsl.s16 q11,d12,d3[0]
+ vmlal.s16 q11,d13,d3[2]
+ vmlsl.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d0[2]
+ vmlsl.s16 q9,d12,d7[0]
+ vmlal.s16 q9,d13,d2[2]
+
+ cmp r11,r9
+ bhs shift3
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+ vmlal.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d5[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d0[3] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d3[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d1[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d6[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d0[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d2[0]
+ vmlsl.s16 q10,d11,d5[2]
+
+
+ vmlal.s16 q11,d10,d6[0]
+ vmlsl.s16 q11,d11,d0[2]
+
+ vmlsl.s16 q8,d10,d6[0]
+ vmlsl.s16 q8,d11,d4[2]
+
+ vmlsl.s16 q9,d10,d2[0]
+ vmlal.s16 q9,d11,d6[2]
+
+ cmp r11,r5
+ bhs shift3
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+
+ vmlsl.s16 q12,d14,d7[1]
+ vmlal.s16 q13,d14,d2[1]
+ vmlal.s16 q14,d14,d4[1]
+ vmlsl.s16 q15,d14,d5[1]
+
+
+ vmlal.s16 q12,d15,d0[3]
+ vmlal.s16 q13,d15,d7[1]
+ vmlsl.s16 q14,d15,d1[1]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlsl.s16 q10,d12,d3[0]
+ vmlal.s16 q10,d13,d4[2]
+ vmlal.s16 q11,d12,d7[0]
+ vmlal.s16 q11,d13,d2[2]
+ vmlal.s16 q8,d12,d1[0]
+ vmlsl.s16 q8,d13,d6[2]
+ vmlal.s16 q9,d12,d5[0]
+ vmlsl.s16 q9,d13,d0[2]
+
+
+ cmp r11,r7
+ bhs shift3
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+ vmlsl.s16 q12,d8,d7[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d0[1] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d6[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d0[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d5[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d2[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d3[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d5[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d1[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d7[2]
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+ vmlal.s16 q12,d14,d6[3]
+ vmlal.s16 q13,d14,d3[3]
+ vmlsl.s16 q14,d14,d1[3]
+ vmlal.s16 q15,d14,d7[1]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlsl.s16 q13,d15,d2[3]
+ vmlal.s16 q14,d15,d7[1]
+ vmlal.s16 q15,d15,d4[1]
+
+
+ vmlsl.s16 q10,d12,d5[0]
+ vmlal.s16 q10,d13,d2[2]
+ vmlal.s16 q11,d12,d1[0]
+ vmlsl.s16 q11,d13,d7[2]
+ vmlsl.s16 q8,d12,d7[0]
+ vmlsl.s16 q8,d13,d3[2]
+ vmlsl.s16 q9,d12,d3[0]
+ vmlal.s16 q9,d13,d1[2]
+
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+ vmlsl.s16 q12,d8,d5[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d6[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d3[1] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d0[1] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d0[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d4[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d6[0]
+ vmlsl.s16 q10,d11,d1[2]
+
+
+ vmlsl.s16 q11,d10,d2[0]
+ vmlal.s16 q11,d11,d4[2]
+
+ vmlal.s16 q8,d10,d2[0]
+ vmlsl.s16 q8,d11,d7[2]
+
+ vmlsl.s16 q9,d10,d6[0]
+ vmlsl.s16 q9,d11,d5[2]
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+ vmlal.s16 q12,d14,d4[3]
+ vmlsl.s16 q13,d14,d6[1]
+ vmlal.s16 q14,d14,d7[3]
+ vmlal.s16 q15,d14,d6[3]
+
+
+ vmlal.s16 q12,d15,d3[3]
+ vmlsl.s16 q13,d15,d3[1]
+ vmlal.s16 q14,d15,d2[3]
+ vmlsl.s16 q15,d15,d2[1]
+
+
+ vmlsl.s16 q10,d12,d7[0]
+ vmlal.s16 q10,d13,d0[2]
+ vmlal.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d1[2]
+ vmlsl.s16 q8,d12,d3[0]
+ vmlal.s16 q8,d13,d2[2]
+ vmlal.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d3[2]
+
+shift3:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q5,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q7,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d18,q13,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q6,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q12,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q8,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q14,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+ vst1.16 {q15},[r1]!
+ vst1.16 {q6},[r1]!
+ add r1,r1,#64
+ vst1.16 {q7},[r1]!
+ vst1.16 {q9},[r1]!
+ sub r1,r1,#96
+
+ mov r0,r8
+
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+ vmull.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d6[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d7[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d7[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d5[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d7[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d3[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d2[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d1[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d0[2]
+
+ cmp r11,r10
+ bhs shift4
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d0[1]
+ vmlal.s16 q13,d14,d1[3]
+ vmlal.s16 q14,d14,d4[1]
+ vmlal.s16 q15,d14,d6[3]
+
+
+ vmlsl.s16 q12,d15,d4[1]
+ vmlsl.s16 q13,d15,d0[3]
+ vmlsl.s16 q14,d15,d2[3]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlal.s16 q10,d12,d7[0]
+ vmlal.s16 q10,d13,d5[2]
+ vmlal.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d7[2]
+ vmlal.s16 q8,d12,d3[0]
+ vmlsl.s16 q8,d13,d4[2]
+ vmlal.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d1[2]
+
+ cmp r11,r9
+ bhs shift4
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+ vmlal.s16 q12,d8,d7[3] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d3[1] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d5[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d4[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d5[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d5[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d2[0]
+ vmlal.s16 q10,d11,d1[2]
+
+
+ vmlsl.s16 q11,d10,d6[0]
+ vmlal.s16 q11,d11,d3[2]
+
+ vmlal.s16 q8,d10,d6[0]
+ vmlsl.s16 q8,d11,d7[2]
+
+ vmlal.s16 q9,d10,d2[0]
+ vmlsl.s16 q9,d11,d2[2]
+
+ cmp r11,r5
+ bhs shift4
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+
+ vmlsl.s16 q12,d14,d1[1]
+ vmlsl.s16 q13,d14,d7[3]
+ vmlal.s16 q14,d14,d1[3]
+ vmlal.s16 q15,d14,d4[3]
+
+
+ vmlal.s16 q12,d15,d2[1]
+ vmlal.s16 q13,d15,d5[1]
+ vmlsl.s16 q14,d15,d3[1]
+ vmlsl.s16 q15,d15,d4[1]
+
+
+ vmlsl.s16 q10,d12,d5[0]
+ vmlsl.s16 q10,d13,d7[2]
+ vmlsl.s16 q11,d12,d1[0]
+ vmlal.s16 q11,d13,d1[2]
+ vmlsl.s16 q8,d12,d7[0]
+ vmlal.s16 q8,d13,d5[2]
+ vmlal.s16 q9,d12,d3[0]
+ vmlsl.s16 q9,d13,d3[2]
+
+ cmp r11,r7
+ bhs shift4
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+ vmlsl.s16 q12,d8,d5[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d2[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d4[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d3[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d6[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d0[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d6[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d0[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlal.s16 q11,d11,d6[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d2[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d4[2]
+
+
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d3[1]
+ vmlsl.s16 q13,d14,d2[1]
+ vmlal.s16 q14,d14,d7[3]
+ vmlal.s16 q15,d14,d2[3]
+
+
+ vmlsl.s16 q12,d15,d0[3]
+ vmlal.s16 q13,d15,d4[3]
+ vmlal.s16 q14,d15,d6[3]
+ vmlsl.s16 q15,d15,d2[1]
+
+
+ vmlal.s16 q10,d12,d3[0]
+ vmlsl.s16 q10,d13,d6[2]
+ vmlal.s16 q11,d12,d7[0]
+ vmlsl.s16 q11,d13,d4[2]
+ vmlsl.s16 q8,d12,d1[0]
+ vmlal.s16 q8,d13,d0[2]
+ vmlal.s16 q9,d12,d5[0]
+ vmlsl.s16 q9,d13,d5[2]
+
+
+ vld1.16 d10,[r0],r6
+ vld1.16 d8,[r0],r6
+ vld1.16 d11,[r0],r6
+ vld1.16 d9,[r0],r6
+
+
+
+
+
+ vmlal.s16 q12,d8,d3[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d7[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d5[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d7[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d6[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d6[0]
+ vmlal.s16 q10,d11,d2[2]
+
+
+ vmlal.s16 q11,d10,d2[0]
+ vmlsl.s16 q11,d11,d0[2]
+
+ vmlsl.s16 q8,d10,d2[0]
+ vmlal.s16 q8,d11,d3[2]
+
+ vmlal.s16 q9,d10,d6[0]
+ vmlsl.s16 q9,d11,d6[2]
+
+
+ vld1.16 d12,[r0],r6
+ vld1.16 d14,[r0],r6
+ vld1.16 d13,[r0],r6
+ vld1.16 d15,[r0],r6
+
+
+
+
+ vmlsl.s16 q12,d14,d5[1]
+ vmlal.s16 q13,d14,d3[3]
+ vmlsl.s16 q14,d14,d2[1]
+ vmlal.s16 q15,d14,d0[3]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlsl.s16 q13,d15,d1[1]
+ vmlal.s16 q14,d15,d0[3]
+ vmlsl.s16 q15,d15,d0[1]
+
+
+ vmlsl.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d4[2]
+ vmlal.s16 q11,d12,d3[0]
+ vmlsl.s16 q11,d13,d5[2]
+ vmlsl.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d6[2]
+ vmlal.s16 q9,d12,d7[0]
+ vmlsl.s16 q9,d13,d7[2]
+
+shift4:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d19,q5,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d31,q7,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d18,q13,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q6,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q12,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q8,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q14,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+ vst1.16 {q15},[r1]!
+ vst1.16 {q6},[r1]!
+ vst1.16 {q7},[r1]!
+ vst1.16 {q9},[r1]!
+
+ add r1,r1,#96
+
+ subs r14,r14,#1
+ bne dct_stage1
+second_stage_dct:
+@ mov r0,r1
+ ldr r0,[sp] @ pi2_src, reused as the stage-2 scratch buffer
+ ldr r1,[sp,#4] @ pi2_tmp
+ ldr r8,[sp,#60] @ prediction stride
+ ldr r7,[sp,#64] @ destination stride
+
+@ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
+@ add r5,r8,r8, lsl #1 @
+@ sub r0,r0,#512
+ mov r11,#0xfffffff0
+ mov r5, #0xffffff00
+ ldr r6,r5_addr
+ ldr r9,r9_addr
+@ sub r1,r1,#2048
+ mov r4,r1
+ mov r10,#240
+ mov r14,#8
+ b stage2
+
+@ registers free :
+
+@ arm registers used
+@ r8  : prediction stride
+@ r7  : destination stride
+@ r1  : temp buffer
+@ r2  : pred buffer
+@ r3  : destination buffer
+@ r14 : loop counter
+@ r0  : scratch buffer
+@ r10 : used as stride
+@ r4  : used to store the initial address
+@ r12 : zero cols
+@ r11 : 0xfffffff0
+@ r5  : 0xffffff00
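+
+@ A hedged outline of one dct_stage2 iteration, in C-like pseudocode:
+@
+@ /*
+@ for (each group of 4 output rows) {
+@     run the 32-point IDCT over the transposed stage-1 output
+@     (stage2_shift1..4), rounding with >> shift_stage2_idct (12);
+@     re-transpose the 4x4 sub-blocks (vtrn) into raster order;
+@     add the prediction and saturate to 8 bit (prediction_buffer);
+@ }
+@ */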
+dct_stage2:
+ add r4,r4,#32
+ mov r1,r4
+stage2:
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+ vmull.s16 q12,d8,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d2[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d5[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlal.s16 q10,d11,d0[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlal.s16 q11,d11,d1[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d2[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d3[2]
+ cmp r12,r11
+ bhs stage2_shift1
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d1[1]
+ vmlal.s16 q13,d14,d3[3]
+ vmlal.s16 q14,d14,d6[1]
+ vmlsl.s16 q15,d14,d7[1]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlal.s16 q13,d15,d5[1]
+ vmlsl.s16 q14,d15,d7[1]
+ vmlsl.s16 q15,d15,d3[3]
+
+
+ vmlal.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d1[2]
+ vmlal.s16 q11,d12,d3[0]
+ vmlal.s16 q11,d13,d4[2]
+ vmlal.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d7[2]
+ vmlal.s16 q9,d12,d7[0]
+ vmlsl.s16 q9,d13,d5[2]
+ cmp r12,r5
+ bhs stage2_shift1
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+ vmlal.s16 q12,d8,d2[1] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d6[3] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d4[3] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d0[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d7[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d2[0]
+ vmlal.s16 q10,d11,d2[2]
+
+
+ vmlal.s16 q11,d10,d6[0]
+ vmlal.s16 q11,d11,d7[2]
+
+ vmlsl.s16 q8,d10,d6[0]
+ vmlsl.s16 q8,d11,d3[2]
+
+ vmlsl.s16 q9,d10,d2[0]
+ vmlsl.s16 q9,d11,d1[2]
+
+ cmp r12,r6
+ bhs stage2_shift1
+
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+ vmlal.s16 q12,d14,d3[1]
+ vmlsl.s16 q13,d14,d6[1]
+ vmlsl.s16 q14,d14,d0[1]
+ vmlsl.s16 q15,d14,d6[3]
+
+
+ vmlal.s16 q12,d15,d3[3]
+ vmlsl.s16 q13,d15,d4[3]
+ vmlsl.s16 q14,d15,d2[3]
+ vmlal.s16 q15,d15,d5[3]
+
+
+ vmlal.s16 q10,d12,d3[0]
+ vmlal.s16 q10,d13,d3[2]
+ vmlsl.s16 q11,d12,d7[0]
+ vmlsl.s16 q11,d13,d5[2]
+ vmlsl.s16 q8,d12,d1[0]
+ vmlsl.s16 q8,d13,d1[2]
+ vmlsl.s16 q9,d12,d5[0]
+ vmlal.s16 q9,d13,d7[2]
+
+ cmp r12,r9
+ bhs stage2_shift1
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmlal.s16 q12,d8,d4[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d3[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d5[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d2[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d4[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d7[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlal.s16 q10,d11,d4[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d2[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d6[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d0[2]
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+ vmlal.s16 q12,d14,d5[1]
+ vmlsl.s16 q13,d14,d0[2]
+ vmlal.s16 q14,d14,d5[3]
+ vmlal.s16 q15,d14,d4[3]
+
+
+ vmlal.s16 q12,d15,d5[3]
+ vmlsl.s16 q13,d15,d1[1]
+ vmlal.s16 q14,d15,d3[1]
+ vmlsl.s16 q15,d15,d7[3]
+
+
+ vmlal.s16 q10,d12,d5[0]
+ vmlal.s16 q10,d13,d5[2]
+ vmlsl.s16 q11,d12,d1[0]
+ vmlsl.s16 q11,d13,d0[2]
+ vmlal.s16 q8,d12,d7[0]
+ vmlal.s16 q8,d13,d4[2]
+ vmlal.s16 q9,d12,d3[0]
+ vmlal.s16 q9,d13,d6[2]
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+
+
+ vmlal.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d2[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d0[1] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d4[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d6[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d1[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d0[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d6[0]
+ vmlal.s16 q10,d11,d6[2]
+
+
+ vmlsl.s16 q11,d10,d2[0]
+ vmlsl.s16 q11,d11,d3[2]
+
+ vmlal.s16 q8,d10,d2[0]
+ vmlal.s16 q8,d11,d0[2]
+
+ vmlsl.s16 q9,d10,d6[0]
+ vmlsl.s16 q9,d11,d2[2]
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+ vmlal.s16 q12,d14,d7[1]
+ vmlsl.s16 q13,d14,d5[3]
+ vmlal.s16 q14,d14,d4[1]
+ vmlsl.s16 q15,d14,d2[3]
+
+
+ vmlal.s16 q12,d15,d7[3]
+ vmlsl.s16 q13,d15,d7[1]
+ vmlal.s16 q14,d15,d6[3]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlal.s16 q10,d12,d7[0]
+ vmlal.s16 q10,d13,d7[2]
+ vmlsl.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d6[2]
+ vmlal.s16 q8,d12,d3[0]
+ vmlal.s16 q8,d13,d5[2]
+ vmlsl.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d4[2]
+
+stage2_shift1:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage2_idct @// r0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d19,q5,#shift_stage2_idct @// r7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d31,q7,#shift_stage2_idct @// r2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d18,q13,#shift_stage2_idct @// r5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d12,q6,#shift_stage2_idct @// r1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d15,q12,#shift_stage2_idct @// r6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d13,q8,#shift_stage2_idct @// r3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d14,q14,#shift_stage2_idct @// r4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+ vst1.16 {q15},[r0]!
+ vst1.16 {q6},[r0]!
+ vst1.16 {q7},[r0]!
+ vst1.16 {q9},[r0]!
+
+
+ mov r1,r4
+
+
+
+
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmull.s16 q12,d8,d2[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d2[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d3[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d3[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d6[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d7[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d6[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d4[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlal.s16 q10,d11,d4[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlal.s16 q11,d11,d5[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d6[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d7[2]
+
+ cmp r12,r11
+ bhs stage2_shift2
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+ vmlsl.s16 q12,d14,d4[3]
+ vmlsl.s16 q13,d14,d2[1]
+ vmlsl.s16 q14,d14,d0[1]
+ vmlsl.s16 q15,d14,d2[3]
+
+
+ vmlsl.s16 q12,d15,d0[3]
+ vmlsl.s16 q13,d15,d3[1]
+ vmlsl.s16 q14,d15,d6[3]
+ vmlal.s16 q15,d15,d5[3]
+
+
+ vmlsl.s16 q10,d12,d7[0]
+ vmlsl.s16 q10,d13,d2[2]
+ vmlsl.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d0[2]
+ vmlsl.s16 q8,d12,d3[0]
+ vmlsl.s16 q8,d13,d3[2]
+ vmlsl.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d6[2]
+
+ cmp r12,r5
+ bhs stage2_shift2
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+
+
+
+ vmlsl.s16 q12,d8,d4[1] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d7[1] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d2[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d7[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d6[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d2[0]
+ vmlsl.s16 q10,d11,d6[2]
+
+
+ vmlsl.s16 q11,d10,d6[0]
+ vmlal.s16 q11,d11,d4[2]
+
+ vmlal.s16 q8,d10,d6[0]
+ vmlal.s16 q8,d11,d0[2]
+
+ vmlal.s16 q9,d10,d2[0]
+ vmlal.s16 q9,d11,d5[2]
+
+ cmp r12,r6
+ bhs stage2_shift2
+
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d2[3]
+ vmlal.s16 q13,d14,d3[3]
+ vmlsl.s16 q14,d14,d5[3]
+ vmlsl.s16 q15,d14,d0[3]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlsl.s16 q13,d15,d6[3]
+ vmlsl.s16 q14,d15,d0[3]
+ vmlal.s16 q15,d15,d7[3]
+
+
+ vmlal.s16 q10,d12,d5[0]
+ vmlal.s16 q10,d13,d0[2]
+ vmlal.s16 q11,d12,d1[0]
+ vmlal.s16 q11,d13,d6[2]
+ vmlal.s16 q8,d12,d7[0]
+ vmlsl.s16 q8,d13,d2[2]
+ vmlsl.s16 q9,d12,d3[0]
+ vmlsl.s16 q9,d13,d4[2]
+
+ cmp r12,r9
+ bhs stage2_shift2
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+
+ vmlal.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d1[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d7[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d0[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d5[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d7[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d7[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d1[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d5[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d3[2]
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+ vmlsl.s16 q12,d14,d0[1]
+ vmlal.s16 q13,d14,d6[1]
+ vmlal.s16 q14,d14,d4[1]
+ vmlsl.s16 q15,d14,d1[1]
+
+
+ vmlsl.s16 q12,d15,d3[3]
+ vmlal.s16 q13,d15,d0[1]
+ vmlsl.s16 q14,d15,d5[1]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlsl.s16 q10,d12,d3[0]
+ vmlsl.s16 q10,d13,d1[2]
+ vmlsl.s16 q11,d12,d7[0]
+ vmlal.s16 q11,d13,d3[2]
+ vmlal.s16 q8,d12,d1[0]
+ vmlal.s16 q8,d13,d7[2]
+ vmlsl.s16 q9,d12,d5[0]
+ vmlsl.s16 q9,d13,d2[2]
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmlal.s16 q12,d8,d7[3] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d4[3] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d2[1] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d3[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d5[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d7[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d5[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d6[0]
+ vmlal.s16 q10,d11,d5[2]
+
+
+ vmlal.s16 q11,d10,d2[0]
+ vmlal.s16 q11,d11,d7[2]
+
+ vmlsl.s16 q8,d10,d2[0]
+ vmlsl.s16 q8,d11,d4[2]
+
+ vmlal.s16 q9,d10,d6[0]
+ vmlal.s16 q9,d11,d1[2]
+
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+ vmlal.s16 q12,d14,d1[1]
+ vmlsl.s16 q13,d14,d0[3]
+ vmlal.s16 q14,d14,d1[3]
+ vmlsl.s16 q15,d14,d3[1]
+
+
+ vmlal.s16 q12,d15,d5[3]
+ vmlsl.s16 q13,d15,d5[1]
+ vmlal.s16 q14,d15,d4[3]
+ vmlsl.s16 q15,d15,d4[1]
+
+
+ vmlal.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d3[2]
+ vmlsl.s16 q11,d12,d3[0]
+ vmlsl.s16 q11,d13,d2[2]
+ vmlal.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d1[2]
+ vmlsl.s16 q9,d12,d7[0]
+ vmlsl.s16 q9,d13,d0[2]
+
+stage2_shift2:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage2_idct @// r0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d19,q5,#shift_stage2_idct @// r7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d31,q7,#shift_stage2_idct @// r2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d18,q13,#shift_stage2_idct @// r5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d12,q6,#shift_stage2_idct @// r1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d15,q12,#shift_stage2_idct @// r6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d13,q8,#shift_stage2_idct @// r3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d14,q14,#shift_stage2_idct @// r4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+ vst1.16 {q15},[r0]!
+ vst1.16 {q6},[r0]!
+ vst1.16 {q7},[r0]!
+ vst1.16 {q9},[r0]!
+
+
+
+ mov r1,r4
+
+
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+ vmull.s16 q12,d8,d4[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d4[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d5[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d5[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d3[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d0[2] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d7[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d6[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d5[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d4[2]
+
+ cmp r12,r11
+ bhs stage2_shift3
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+ vmlsl.s16 q12,d14,d5[1]
+ vmlsl.s16 q13,d14,d7[3]
+ vmlal.s16 q14,d14,d5[3]
+ vmlal.s16 q15,d14,d3[1]
+
+
+ vmlal.s16 q12,d15,d2[1]
+ vmlal.s16 q13,d15,d1[1]
+ vmlal.s16 q14,d15,d4[3]
+ vmlsl.s16 q15,d15,d7[3]
+
+
+ vmlsl.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d6[2]
+ vmlsl.s16 q11,d12,d3[0]
+ vmlal.s16 q11,d13,d3[2]
+ vmlsl.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d0[2]
+ vmlsl.s16 q9,d12,d7[0]
+ vmlal.s16 q9,d13,d2[2]
+
+ cmp r12,r5
+ bhs stage2_shift3
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+
+ vmlal.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d5[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d0[3] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d3[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d1[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d6[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d0[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d2[0]
+ vmlsl.s16 q10,d11,d5[2]
+
+
+ vmlal.s16 q11,d10,d6[0]
+ vmlsl.s16 q11,d11,d0[2]
+
+ vmlsl.s16 q8,d10,d6[0]
+ vmlsl.s16 q8,d11,d4[2]
+
+ vmlsl.s16 q9,d10,d2[0]
+ vmlal.s16 q9,d11,d6[2]
+
+ cmp r12,r6
+ bhs stage2_shift3
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+ vmlsl.s16 q12,d14,d7[1]
+ vmlal.s16 q13,d14,d2[1]
+ vmlal.s16 q14,d14,d4[1]
+ vmlsl.s16 q15,d14,d5[1]
+
+
+ vmlal.s16 q12,d15,d0[3]
+ vmlal.s16 q13,d15,d7[1]
+ vmlsl.s16 q14,d15,d1[1]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlsl.s16 q10,d12,d3[0]
+ vmlal.s16 q10,d13,d4[2]
+ vmlal.s16 q11,d12,d7[0]
+ vmlal.s16 q11,d13,d2[2]
+ vmlal.s16 q8,d12,d1[0]
+ vmlsl.s16 q8,d13,d6[2]
+ vmlal.s16 q9,d12,d5[0]
+ vmlsl.s16 q9,d13,d0[2]
+
+ cmp r12,r9
+ bhs stage2_shift3
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmlsl.s16 q12,d8,d7[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d0[1] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d6[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d0[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d5[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d2[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d3[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d5[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d1[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlal.s16 q9,d11,d7[2]
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+ vmlal.s16 q12,d14,d6[3]
+ vmlal.s16 q13,d14,d3[3]
+ vmlsl.s16 q14,d14,d1[3]
+ vmlal.s16 q15,d14,d7[1]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlsl.s16 q13,d15,d2[3]
+ vmlal.s16 q14,d15,d7[1]
+ vmlal.s16 q15,d15,d4[1]
+
+
+ vmlsl.s16 q10,d12,d5[0]
+ vmlal.s16 q10,d13,d2[2]
+ vmlal.s16 q11,d12,d1[0]
+ vmlsl.s16 q11,d13,d7[2]
+ vmlsl.s16 q8,d12,d7[0]
+ vmlsl.s16 q8,d13,d3[2]
+ vmlsl.s16 q9,d12,d3[0]
+ vmlal.s16 q9,d13,d1[2]
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmlsl.s16 q12,d8,d5[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d6[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d3[1] @// y1 * sin3(part of b2)
+ vmlsl.s16 q15,d8,d0[1] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d0[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d2[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlal.s16 q15,d9,d4[3] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d6[0]
+ vmlsl.s16 q10,d11,d1[2]
+
+
+ vmlsl.s16 q11,d10,d2[0]
+ vmlal.s16 q11,d11,d4[2]
+
+ vmlal.s16 q8,d10,d2[0]
+ vmlsl.s16 q8,d11,d7[2]
+
+ vmlsl.s16 q9,d10,d6[0]
+ vmlsl.s16 q9,d11,d5[2]
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+ vmlal.s16 q12,d14,d4[3]
+ vmlsl.s16 q13,d14,d6[1]
+ vmlal.s16 q14,d14,d7[3]
+ vmlal.s16 q15,d14,d6[3]
+
+
+ vmlal.s16 q12,d15,d3[3]
+ vmlsl.s16 q13,d15,d3[1]
+ vmlal.s16 q14,d15,d2[3]
+ vmlsl.s16 q15,d15,d2[1]
+
+
+ vmlsl.s16 q10,d12,d7[0]
+ vmlal.s16 q10,d13,d0[2]
+ vmlal.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d1[2]
+ vmlsl.s16 q8,d12,d3[0]
+ vmlal.s16 q8,d13,d2[2]
+ vmlal.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d3[2]
+
+stage2_shift3:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage2_idct @// r0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d19,q5,#shift_stage2_idct @// r7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d31,q7,#shift_stage2_idct @// r2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d18,q13,#shift_stage2_idct @// r5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d12,q6,#shift_stage2_idct @// r1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d15,q12,#shift_stage2_idct @// r6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d13,q8,#shift_stage2_idct @// r3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d14,q14,#shift_stage2_idct @// r4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+ vst1.16 {q15},[r0]!
+ vst1.16 {q6},[r0]!
+ vst1.16 {q7},[r0]!
+ vst1.16 {q9},[r0]!
+
+
+
+
+ mov r1,r4
+
+
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmull.s16 q12,d8,d6[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d6[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d7[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d7[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d2[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d4[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d5[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d7[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmull.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d3[2]
+
+
+ vmull.s16 q11,d10,d0[0]
+ vmlsl.s16 q11,d11,d2[2]
+
+ vmull.s16 q8,d10,d0[0]
+ vmlsl.s16 q8,d11,d1[2]
+
+ vmull.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d0[2]
+
+ cmp r12,r11
+ bhs stage2_shift4
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+
+ vmlal.s16 q12,d14,d0[1]
+ vmlal.s16 q13,d14,d1[3]
+ vmlal.s16 q14,d14,d4[1]
+ vmlal.s16 q15,d14,d6[3]
+
+
+ vmlsl.s16 q12,d15,d4[1]
+ vmlsl.s16 q13,d15,d0[3]
+ vmlsl.s16 q14,d15,d2[3]
+ vmlsl.s16 q15,d15,d6[1]
+
+
+ vmlal.s16 q10,d12,d7[0]
+ vmlal.s16 q10,d13,d5[2]
+ vmlal.s16 q11,d12,d5[0]
+ vmlsl.s16 q11,d13,d7[2]
+ vmlal.s16 q8,d12,d3[0]
+ vmlsl.s16 q8,d13,d4[2]
+ vmlal.s16 q9,d12,d1[0]
+ vmlsl.s16 q9,d13,d1[2]
+
+ cmp r12,r5
+ bhs stage2_shift4
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+
+ vmlal.s16 q12,d8,d7[3] @// y1 * cos1(part of b0)
+ vmlal.s16 q13,d8,d3[1] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d5[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d4[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d5[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d5[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d2[0]
+ vmlal.s16 q10,d11,d1[2]
+
+
+ vmlsl.s16 q11,d10,d6[0]
+ vmlal.s16 q11,d11,d3[2]
+
+ vmlal.s16 q8,d10,d6[0]
+ vmlsl.s16 q8,d11,d7[2]
+
+ vmlal.s16 q9,d10,d2[0]
+ vmlsl.s16 q9,d11,d2[2]
+
+ cmp r12,r6
+ bhs stage2_shift4
+
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+
+
+ vmlsl.s16 q12,d14,d1[1]
+ vmlsl.s16 q13,d14,d7[3]
+ vmlal.s16 q14,d14,d1[3]
+ vmlal.s16 q15,d14,d4[3]
+
+
+ vmlal.s16 q12,d15,d2[1]
+ vmlal.s16 q13,d15,d5[1]
+ vmlsl.s16 q14,d15,d3[1]
+ vmlsl.s16 q15,d15,d4[1]
+
+
+ vmlsl.s16 q10,d12,d5[0]
+ vmlsl.s16 q10,d13,d7[2]
+ vmlsl.s16 q11,d12,d1[0]
+ vmlal.s16 q11,d13,d1[2]
+ vmlsl.s16 q8,d12,d7[0]
+ vmlal.s16 q8,d13,d5[2]
+ vmlal.s16 q9,d12,d3[0]
+ vmlsl.s16 q9,d13,d3[2]
+
+ cmp r12,r9
+ bhs stage2_shift4
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+ vmlsl.s16 q12,d8,d5[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d2[3] @// y1 * cos3(part of b1)
+ vmlal.s16 q14,d8,d4[3] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d3[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d6[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlal.s16 q13,d9,d0[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d6[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d3[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlal.s16 q10,d10,d0[0]
+ vmlsl.s16 q10,d11,d0[2]
+
+
+ vmlsl.s16 q11,d10,d0[0]
+ vmlal.s16 q11,d11,d6[2]
+
+ vmlsl.s16 q8,d10,d0[0]
+ vmlal.s16 q8,d11,d2[2]
+
+ vmlal.s16 q9,d10,d0[0]
+ vmlsl.s16 q9,d11,d4[2]
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+
+ vmlal.s16 q12,d14,d3[1]
+ vmlsl.s16 q13,d14,d2[1]
+ vmlal.s16 q14,d14,d7[3]
+ vmlal.s16 q15,d14,d2[3]
+
+
+ vmlsl.s16 q12,d15,d0[3]
+ vmlal.s16 q13,d15,d4[3]
+ vmlal.s16 q14,d15,d6[3]
+ vmlsl.s16 q15,d15,d2[1]
+
+
+ vmlal.s16 q10,d12,d3[0]
+ vmlsl.s16 q10,d13,d6[2]
+ vmlal.s16 q11,d12,d7[0]
+ vmlsl.s16 q11,d13,d4[2]
+ vmlsl.s16 q8,d12,d1[0]
+ vmlal.s16 q8,d13,d0[2]
+ vmlal.s16 q9,d12,d5[0]
+ vmlsl.s16 q9,d13,d5[2]
+
+
+ vld1.16 {d10,d11},[r1]!
+ vld1.16 {d8,d9},[r1],r10
+
+
+
+
+ vmlal.s16 q12,d8,d3[3] @// y1 * cos1(part of b0)
+ vmlsl.s16 q13,d8,d7[1] @// y1 * cos3(part of b1)
+ vmlsl.s16 q14,d8,d5[1] @// y1 * sin3(part of b2)
+ vmlal.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlsl.s16 q12,d9,d7[1] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d6[1] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlal.s16 q14,d9,d3[3] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ vmlsl.s16 q10,d10,d6[0]
+ vmlal.s16 q10,d11,d2[2]
+
+
+ vmlal.s16 q11,d10,d2[0]
+ vmlsl.s16 q11,d11,d0[2]
+
+ vmlsl.s16 q8,d10,d2[0]
+ vmlal.s16 q8,d11,d3[2]
+
+ vmlal.s16 q9,d10,d6[0]
+ vmlsl.s16 q9,d11,d6[2]
+
+
+ vld1.16 {d12,d13},[r1]!
+ vld1.16 {d14,d15},[r1],r10
+
+
+
+ vmlsl.s16 q12,d14,d5[1]
+ vmlal.s16 q13,d14,d3[3]
+ vmlsl.s16 q14,d14,d2[1]
+ vmlal.s16 q15,d14,d0[3]
+
+
+ vmlal.s16 q12,d15,d1[3]
+ vmlsl.s16 q13,d15,d1[1]
+ vmlal.s16 q14,d15,d0[3]
+ vmlsl.s16 q15,d15,d0[1]
+
+
+ vmlsl.s16 q10,d12,d1[0]
+ vmlal.s16 q10,d13,d4[2]
+ vmlal.s16 q11,d12,d3[0]
+ vmlsl.s16 q11,d13,d5[2]
+ vmlsl.s16 q8,d12,d5[0]
+ vmlal.s16 q8,d13,d6[2]
+ vmlal.s16 q9,d12,d7[0]
+ vmlsl.s16 q9,d13,d7[2]
+
+stage2_shift4:
+ vadd.s32 q4,q10,q12
+ vsub.s32 q5,q10,q12
+
+ vadd.s32 q6,q11,q13
+ vsub.s32 q12,q11,q13
+
+ vadd.s32 q7,q8,q14
+ vsub.s32 q13,q8,q14
+
+
+ vadd.s32 q8,q9,q15
+ vsub.s32 q14,q9,q15
+
+
+ vqrshrn.s32 d30,q4,#shift_stage2_idct @// r0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d19,q5,#shift_stage2_idct @// r7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d31,q7,#shift_stage2_idct @// r2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d18,q13,#shift_stage2_idct @// r5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d12,q6,#shift_stage2_idct @// r1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d15,q12,#shift_stage2_idct @// r6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d13,q8,#shift_stage2_idct @// r3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+ vqrshrn.s32 d14,q14,#shift_stage2_idct @// r4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+
+
+ vtrn.16 q15,q6
+ vtrn.16 q7,q9
+
+ vtrn.32 d30,d31
+ vtrn.32 d12,d13
+ vtrn.32 d14,d15
+ vtrn.32 d18,d19
+
+
+
+ vst1.16 {q15},[r0]!
+ vst1.16 {q6},[r0]!
+ vst1.16 {q7},[r0]!
+ vst1.16 {q9},[r0]!
+
+
+
+
+ sub r0,r0,#256
+prediction_buffer:
+
+
+ vld1.16 {d12,d13},[r0]!
+ vld1.16 {d14,d15},[r0]!
+
+ add r0,r0,#32
+
+ vld1.16 {d16,d17},[r0]!
+ vld1.16 {d18,d19},[r0]!
+ add r0,r0,#32
+
+ vld1.16 {d20,d21},[r0]!
+ vld1.16 {d22,d23},[r0]!
+
+
+ add r0,r0,#32
+
+ vld1.16 {d24,d25},[r0]!
+ vld1.16 {d26,d27},[r0]!
+
+
+
+
+
+@ d12 = r0 1-4 values
+@ d13 = r2 1-4 values
+@ d14 = r1 1-4 values
+@ d15 = r3 1-4 values
+
+@ d16 = r0 5-8 values
+@ d17 = r2 5-8 values
+@ d18 = r1 5-8 values
+@ d19 = r3 5-8 values
+
+@ d20 = r0 9-12 values
+@ d21 = r2 9-12 values
+@ d22 = r1 9-12 values
+@ d23 = r3 9-12 values
+
+@ d24 = r0 13-16 values
+@ d25 = r2 13-16 values
+@ d26 = r1 13-16 values
+@ d27 = r3 13-16 values
+
+ vswp d13,d16
+ vswp d21,d24
+ vswp d15,d18
+ vswp d23,d26
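+
+@ A hedged note on the swaps above: they rejoin the 4-value d-register
+@ halves into whole 8-value rows, e.g. before the swap d12 = r0[1-4]
+@ and d16 = r0[5-8] live in different q registers; after vswp d13,d16,
+@ q6 holds r0[1-8], so each vaddw.u8 below adds one contiguous run of
+@ prediction pixels.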
+
+
+ vld1.8 {d8,d9},[r2],r8
+ vld1.8 {d10,d11},[r2],r8
+ vld1.8 {d28,d29},[r2],r8
+ vld1.8 {d30,d31},[r2],r8
+
+
+ vaddw.u8 q6,q6,d8
+ vaddw.u8 q10,q10,d9
+ vaddw.u8 q7,q7,d10
+ vaddw.u8 q11,q11,d11
+ vaddw.u8 q8,q8,d28
+ vaddw.u8 q12,q12,d29
+ vaddw.u8 q9,q9,d30
+ vaddw.u8 q13,q13,d31
+ sub r2,r2,r8,lsl #2
+ add r2,r2,#16
+ vqmovun.s16 d12,q6
+ vqmovun.s16 d13,q10
+ vqmovun.s16 d20,q7
+ vqmovun.s16 d21,q11
+ vqmovun.s16 d14,q8
+ vqmovun.s16 d15,q12
+ vqmovun.s16 d22,q9
+ vqmovun.s16 d23,q13
+
+
+ vst1.8 {d12,d13},[r3],r7
+ vst1.8 {d20,d21},[r3],r7
+ vst1.8 {d14,d15},[r3],r7
+ vst1.8 {d22,d23},[r3],r7
+
+
+ sub r3,r3,r7,lsl #2
+ add r3,r3,#16
+
+ vld1.16 {d12,d13},[r0]!
+ vld1.16 {d14,d15},[r0]!
+
+ sub r0,r0,#96
+
+ vld1.16 {d16,d17},[r0]!
+ vld1.16 {d18,d19},[r0]!
+ sub r0,r0,#96
+
+ vld1.16 {d20,d21},[r0]!
+ vld1.16 {d22,d23},[r0]!
+
+
+ sub r0,r0,#96
+
+ vld1.16 {d24,d25},[r0]!
+ vld1.16 {d26,d27},[r0]!
+
+
+ sub r0,r0,#64
+
+
+
+
+ vswp d13,d16
+ vswp d21,d24
+ vswp d15,d18
+ vswp d23,d26
+
+
+ vld1.8 {d8,d9},[r2],r8
+ vld1.8 {d10,d11},[r2],r8
+ vld1.8 {d28,d29},[r2],r8
+ vld1.8 {d30,d31},[r2],r8
+
+
+ vaddw.u8 q6,q6,d8
+ vaddw.u8 q10,q10,d9
+ vaddw.u8 q7,q7,d10
+ vaddw.u8 q11,q11,d11
+ vaddw.u8 q8,q8,d28
+ vaddw.u8 q12,q12,d29
+ vaddw.u8 q9,q9,d30
+ vaddw.u8 q13,q13,d31
+ sub r2,r2,#16
+
+ vqmovun.s16 d12,q6
+ vqmovun.s16 d13,q10
+ vqmovun.s16 d20,q7
+ vqmovun.s16 d21,q11
+ vqmovun.s16 d14,q8
+ vqmovun.s16 d15,q12
+ vqmovun.s16 d22,q9
+ vqmovun.s16 d23,q13
+
+
+ vst1.8 {d12,d13},[r3],r7
+ vst1.8 {d20,d21},[r3],r7
+ vst1.8 {d14,d15},[r3],r7
+ vst1.8 {d22,d23},[r3],r7
+
+ sub r3,r3,#16
+
+ subs r14,r14,#1
+ bne dct_stage2
+ ldmfd sp!,{r0-r12,pc}
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_4x4.s b/common/arm/ihevc_itrans_recon_4x4.s
new file mode 100644
index 0000000..c955502
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_4x4.s
@@ -0,0 +1,232 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ihevc_itrans_recon_4x4.s
+@ *
+@ * @brief
+@ * contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * naveen sr
+@ *
+@ * @par list of functions:
+@ * - ihevc_itrans_recon_4x4()
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@*/
+@ /**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * this function performs inverse transform and reconstruction for 4x4
+@ * input block
+@ *
+@ * @par description:
+@ * performs inverse transform and adds the prediction data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ * input 4x4 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 4x4 buffer for storing inverse transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 4x4 block
+@ *
+@ * @param[out] pu1_dst
+@ * output 4x4 block
+@ *
+@ * @param[in] src_strd
+@ * input stride
+@ *
+@ * @param[in] pred_strd
+@ * prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * output stride
+@ *
+@ * @param[in] zero_cols
+@ * zero columns in pi2_src
+@ *
+@ * @returns void
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@ */
+@void ihevc_itrans_recon_4x4(word16 *pi2_src,
+@ word16 *pi2_tmp,
+@ uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 zero_cols)
+@**************variables vs registers*************************
+@ r0 => *pi2_src
+@ r1 => *pi2_tmp
+@ r2 => *pu1_pred
+@ r3 => *pu1_dst
+@ r4 => src_strd
+@ r5 => pred_strd
+@ r6 => dst_strd
+@ r7 => zero_cols
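+
+@ a hedged C sketch of one 4-point idct stage, reconstructed from the
+@ instruction comments below; names and the rnd/CLIP_S16 helpers are
+@ illustrative, not the reference code:
+@
+@     e[0] = 64 * (src[0] + src[2]);
+@     e[1] = 64 * (src[0] - src[2]);
+@     o[0] = 83 * src[1] + 36 * src[3];
+@     o[1] = 36 * src[1] - 83 * src[3];
+@     /* rnd = 1 << (shift - 1); shift = 7 in stage 1, 12 in stage 2 */
+@     out[0] = CLIP_S16((e[0] + o[0] + rnd) >> shift);
+@     out[1] = CLIP_S16((e[1] + o[1] + rnd) >> shift);
+@     out[2] = CLIP_S16((e[1] - o[1] + rnd) >> shift);
+@     out[3] = CLIP_S16((e[0] - o[0] + rnd) >> shift);
+@
+@ the stage is applied once per direction with a transpose in between;
+@ after stage 2 the prediction is added and the result clipped to 8 bit.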
+
+
+.text
+.align 4
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+
+
+.globl ihevc_itrans_recon_4x4_a9q
+
+.extern g_ai2_ihevc_trans_4_transpose
+
+g_ai2_ihevc_trans_4_transpose_addr:
+.long g_ai2_ihevc_trans_4_transpose - ulbl1 - 8
+
+.type ihevc_itrans_recon_4x4_a9q, %function
+
+ihevc_itrans_recon_4x4_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r8,g_ai2_ihevc_trans_4_transpose_addr
+ulbl1:
+ add r8,r8,pc
+
+ ldr r4,[sp,#40] @loading src_strd
+ ldr r5,[sp,#44] @loading pred_strd
+ add r4,r4,r4 @ src_strd in terms of word16
+
+ ldr r6,[sp,#48] @loading dst_strd
+ ldr r7,[sp,#52] @loading zero_cols
+ add r9,r0,r4 @ pi2_src[0] + src_strd
+
+
+
+ vld1.16 d4,[r8] @loading first row of g_ai2_ihevc_trans_4_transpose
+ @ d4 = {36,64,83,64}
+ @index = 3 2 1 0
+ add r10,r9,r4, lsl #1 @ 3*src_strd
+    add r4,r4,r4 @ 2 * src_strd : r0 steps from the 1st to the 3rd row
+ vld1.16 d1,[r9] @loading pi2_src 2nd row
+ vld1.16 d3,[r10] @loading pi2_src 4th row
+ vld1.16 d0,[r0],r4 @loading pi2_src 1st row
+ vld1.16 d2,[r0],r4 @loading pi2_src 3rd row
+
+
+ @ first stage computation starts
+ vmull.s16 q3,d1,d4[1] @83 * pi2_src[1]
+ vmlal.s16 q3,d3,d4[3] @o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+ vmull.s16 q4,d1,d4[3] @36 * pi2_src[1]
+ vld1.32 d22[0], [r2],r5
+ vmlsl.s16 q4,d3,d4[1] @o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+
+ vaddl.s16 q5,d0,d2 @pi2_src[0] + pi2_src[2]
+ vsubl.s16 q6,d0,d2 @pi2_src[0] - pi2_src[2]
+ vshl.s32 q5,q5,#6 @e[0] = 64*(pi2_src[0] + pi2_src[2])
+ vshl.s32 q6,q6,#6 @e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+ vadd.s32 q7,q5,q3 @((e[0] + o[0] )
+ vadd.s32 q8,q6,q4 @((e[1] + o[1])
+ vsub.s32 q9,q6,q4 @((e[1] - o[1])
+ vsub.s32 q10,q5,q3 @((e[0] - o[0])
+
+    vqrshrn.s32 d0,q7,#shift_stage1_idct @pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    vqrshrn.s32 d1,q8,#shift_stage1_idct @pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    vqrshrn.s32 d2,q9,#shift_stage1_idct @pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    vqrshrn.s32 d3,q10,#shift_stage1_idct @pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+
+ vtrn.16 d0,d1
+ vtrn.16 d2,d3
+ vtrn.32 d0,d2
+ vtrn.32 d1,d3
+
+ @ first stage ends
+ @ output in d0,d1,d2,d3
+ @ second stage starts
+ vmull.s16 q3,d1,d4[1] @83 * pi2_src[1]
+ vld1.32 d22[1], [r2],r5
+ vmlal.s16 q3,d3,d4[3] @o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+ vmull.s16 q4,d1,d4[3] @36 * pi2_src[1]
+ vmlsl.s16 q4,d3,d4[1] @o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+ vld1.32 d23[0], [r2],r5
+
+ vaddl.s16 q5,d0,d2 @pi2_src[0] + pi2_src[2]
+ vsubl.s16 q6,d0,d2 @pi2_src[0] - pi2_src[2]
+ vshl.s32 q5,q5,#6 @e[0] = 64*(pi2_src[0] + pi2_src[2])
+ vshl.s32 q6,q6,#6 @e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+
+ vadd.s32 q7,q5,q3 @((e[0] + o[0] )
+ vadd.s32 q8,q6,q4 @((e[1] + o[1])
+ vsub.s32 q9,q6,q4 @((e[1] - o[1])
+ vsub.s32 q10,q5,q3 @((e[0] - o[0])
+
+    vqrshrn.s32 d0,q7,#shift_stage2_idct @pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    vqrshrn.s32 d1,q8,#shift_stage2_idct @pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    vqrshrn.s32 d2,q9,#shift_stage2_idct @pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    vqrshrn.s32 d3,q10,#shift_stage2_idct @pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+ vld1.32 d23[1], [r2],r5
+
+ vtrn.16 d0,d1
+ vtrn.16 d2,d3
+ vtrn.32 d0,d2
+ vtrn.32 d1,d3
+    @ second stage computation ends
+    @ output in d0,d1,d2,d3
+
+    @ add the prediction (loaded above) and clip to 8 bit
+
+ vaddw.u8 q0,q0,d22 @ pi2_out(16bit) + pu1_pred(8bit)
+ vaddw.u8 q1,q1,d23 @ pi2_out(16bit) + pu1_pred(8bit)
+ vqmovun.s16 d0,q0 @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+ vqmovun.s16 d1,q1 @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+ @ storing destination
+ vst1.32 {d0[0]},[r3],r6
+ vst1.32 {d0[1]},[r3],r6
+ vst1.32 {d1[0]},[r3],r6
+ vst1.32 {d1[1]},[r3],r6
+
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_4x4_ttype1.s b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
new file mode 100644
index 0000000..ab65dae
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_4x4_ttype1.s
@@ -0,0 +1,236 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ihevc_itrans_recon_4x4_ttype1.s
+@ *
+@ * @brief
+@ * contains function definitions for inverse transform and reconstruction
+@ *
+@ *
+@ * @author
+@ * naveen sr
+@ *
+@ * @par list of functions:
+@ * - ihevc_itrans_recon_4x4_ttype1()
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@ */
+
+@/* all the functions here are replicated from ihevc_itrans.c and modified to */
+@/* include reconstruction */
+@
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * this function performs inverse transform type 1 (dst) and reconstruction
+@ * for 4x4 input block
+@ *
+@ * @par description:
+@ * performs inverse transform and adds the prediction data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ * input 4x4 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 4x4 buffer for storing inverse transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 4x4 block
+@ *
+@ * @param[out] pu1_dst
+@ * output 4x4 block
+@ *
+@ * @param[in] src_strd
+@ * input stride
+@ *
+@ * @param[in] pred_strd
+@ * prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * output stride
+@ *
+@ * @param[in] zero_cols
+@ * zero columns in pi2_src
+@ *
+@ * @returns void
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@ */
+@void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
+@ word16 *pi2_tmp,
+@ uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 zero_cols)
+
+@**************variables vs registers*************************
+@ r0 => *pi2_src
+@ r1 => *pi2_tmp
+@ r2 => *pu1_pred
+@ r3 => *pu1_dst
+@ r4 => src_strd
+@ r5 => pred_strd
+@ r6 => dst_strd
+@ r7 => zero_cols
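+
+@ a hedged C sketch of one 4-point inverse dst stage, reconstructed from the
+@ instruction comments below (names are illustrative, not the reference code):
+@
+@     /* rnd = 1 << (shift - 1); shift = 7 in stage 1, 12 in stage 2 */
+@     out[0] = CLIP_S16((29*src[0] + 74*src[1] + 84*src[2] + 55*src[3] + rnd) >> shift);
+@     out[1] = CLIP_S16((55*src[0] + 74*src[1] - 29*src[2] - 84*src[3] + rnd) >> shift);
+@     out[2] = CLIP_S16((74*src[0]             - 74*src[2] + 74*src[3] + rnd) >> shift);
+@     out[3] = CLIP_S16((84*src[0] - 74*src[1] + 55*src[2] - 29*src[3] + rnd) >> shift);
+@
+@ each output is a dot product against the dst basis (29, 55, 74, 84), which
+@ is why every accumulator below is built from a vmull plus vmlal/vmlsl steps.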
+
+.text
+.align 4
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+.globl ihevc_itrans_recon_4x4_ttype1_a9q
+
+.type ihevc_itrans_recon_4x4_ttype1_a9q, %function
+
+ihevc_itrans_recon_4x4_ttype1_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r4,[sp,#40] @loading src_strd
+ ldr r5,[sp,#44] @loading pred_strd
+ ldr r6,[sp,#48] @loading dst_strd
+ ldr r7,[sp,#52] @loading zero_cols
+
+ add r4,r4,r4 @ src_strd in terms of word16
+
+ mov r8,#29
+ mov r9,#55
+ mov r10,#74
+ mov r11,#84
+ vmov.i16 d4[0],r8
+ vld1.16 d0,[r0],r4 @loading pi2_src 1st row
+ vmov.i16 d4[1],r9
+ vld1.16 d1,[r0],r4 @loading pi2_src 2nd row
+ vmov.i16 d4[2],r10
+ vld1.16 d2,[r0],r4 @loading pi2_src 3rd row
+ vmov.i16 d4[3],r11
+ vld1.16 d3,[r0],r4 @loading pi2_src 4th row
+
+ @ first stage computation starts
+ vmull.s16 q3,d1,d4[2] @74 * pi2_src[1]
+ vmlal.s16 q3,d0,d4[0] @74 * pi2_src[1] + 29 * pi2_src[0]
+ vmlal.s16 q3,d3,d4[1] @74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+ vmlal.s16 q3,d2,d4[3] @pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+ vmull.s16 q4,d1,d4[2] @74 * pi2_src[1]
+ vmlal.s16 q4,d0,d4[1] @74 * pi2_src[1] + 55 * pi2_src[0]
+ vmlsl.s16 q4,d2,d4[0] @74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+    vmlsl.s16 q4,d3,d4[3] @pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]
+
+ vmull.s16 q5,d0,d4[2] @ 74 * pi2_src[0]
+ vmlsl.s16 q5,d2,d4[2] @ 74 * pi2_src[0] - 74 * pi2_src[2]
+ vmlal.s16 q5,d3,d4[2] @pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+
+ vmull.s16 q6,d2,d4[1] @ 55 * pi2_src[2]
+ vmlsl.s16 q6,d1,d4[2] @ 55 * pi2_src[2] - 74 * pi2_src[1]
+ vmlsl.s16 q6,d3,d4[0] @ - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ vmlal.s16 q6,d0,d4[3] @pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+ vqrshrn.s32 d14,q3,#shift_stage1_idct @ (pi2_out[0] + rounding ) >> shift_stage1_idct
+ vqrshrn.s32 d15,q4,#shift_stage1_idct @ (pi2_out[1] + rounding ) >> shift_stage1_idct
+ vqrshrn.s32 d16,q5,#shift_stage1_idct @ (pi2_out[2] + rounding ) >> shift_stage1_idct
+ vqrshrn.s32 d17,q6,#shift_stage1_idct @ (pi2_out[3] + rounding ) >> shift_stage1_idct
+ vld1.32 d18[0], [r2],r5
+
+ vtrn.16 d14,d15
+ vtrn.16 d16,d17
+ vtrn.32 d14,d16
+ vtrn.32 d15,d17
+ @ output in d14,d15,d16,d17
+ @ first stage computation ends
+
+    @ second stage computation starts : same sequence as the 1st stage,
+    @ applied to the stage-1 output with this register mapping
+ @ d14 - d0
+ @ d15 - d1
+ @ d16 - d2
+ @ d17 - d3
+ vld1.32 d18[1], [r2],r5
+ vmull.s16 q3,d15,d4[2] @74 * pi2_src[1]
+ vmlal.s16 q3,d14,d4[0] @74 * pi2_src[1] + 29 * pi2_src[0]
+ vmlal.s16 q3,d17,d4[1] @74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+ vmlal.s16 q3,d16,d4[3] @pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+ vmull.s16 q4,d15,d4[2] @74 * pi2_src[1]
+ vmlal.s16 q4,d14,d4[1] @74 * pi2_src[1] + 55 * pi2_src[0]
+ vmlsl.s16 q4,d16,d4[0] @74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+    vmlsl.s16 q4,d17,d4[3] @pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]
+
+ vmull.s16 q5,d14,d4[2] @ 74 * pi2_src[0]
+ vmlsl.s16 q5,d16,d4[2] @ 74 * pi2_src[0] - 74 * pi2_src[2]
+ vmlal.s16 q5,d17,d4[2] @pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+ vld1.32 d19[0], [r2],r5
+
+ vmull.s16 q6,d16,d4[1] @ 55 * pi2_src[2]
+ vmlsl.s16 q6,d15,d4[2] @ - 74 * pi2_src[1] + 55 * pi2_src[2]
+ vmlsl.s16 q6,d17,d4[0] @ - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ vmlal.s16 q6,d14,d4[3] @pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+    vqrshrn.s32 d0,q3,#shift_stage2_idct @ (pi2_out[0] + rounding ) >> shift_stage2_idct
+    vqrshrn.s32 d1,q4,#shift_stage2_idct @ (pi2_out[1] + rounding ) >> shift_stage2_idct
+    vqrshrn.s32 d2,q5,#shift_stage2_idct @ (pi2_out[2] + rounding ) >> shift_stage2_idct
+    vqrshrn.s32 d3,q6,#shift_stage2_idct @ (pi2_out[3] + rounding ) >> shift_stage2_idct
+ vld1.32 d19[1], [r2],r5
+ vtrn.16 d0,d1
+ vtrn.16 d2,d3
+ vtrn.32 d0,d2
+ vtrn.32 d1,d3
+ @ output in d0,d1,d2,d3
+ @ second stage computation ends
+
+    @ add the prediction (loaded above) and clip to 8 bit
+
+ vaddw.u8 q0,q0,d18 @ pi2_out(16bit) + pu1_pred(8bit)
+ vqmovun.s16 d0,q0 @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+ vaddw.u8 q1,q1,d19 @ pi2_out(16bit) + pu1_pred(8bit)
+ vqmovun.s16 d1,q1 @ clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+ @ storing destination
+ vst1.32 {d0[0]},[r3],r6
+ vst1.32 {d0[1]},[r3],r6
+ vst1.32 {d1[0]},[r3],r6
+ vst1.32 {d1[1]},[r3],r6
+
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_itrans_recon_8x8.s b/common/arm/ihevc_itrans_recon_8x8.s
new file mode 100644
index 0000000..440512a
--- /dev/null
+++ b/common/arm/ihevc_itrans_recon_8x8.s
@@ -0,0 +1,934 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ihevc_itrans_recon_8x8.s
+@ *
+@ * @brief
+@ * contains function definitions for single stage inverse transform
+@ *
+@ * @author
+@ * anand s
+@ *
+@ * @par list of functions:
+@ * - ihevc_itrans_recon_8x8()
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@ *******************************************************************************
+@ *
+@ * @brief
+@ * this function performs inverse transform and reconstruction for 8x8
+@ * input block
+@ *
+@ * @par description:
+@ * performs inverse transform and adds the prediction data and clips output
+@ * to 8 bit
+@ *
+@ * @param[in] pi2_src
+@ * input 8x8 coefficients
+@ *
+@ * @param[in] pi2_tmp
+@ *  temporary 8x8 buffer for storing inverse transform 1st stage output
+@ *
+@ * @param[in] pu1_pred
+@ * prediction 8x8 block
+@ *
+@ * @param[out] pu1_dst
+@ * output 8x8 block
+@ *
+@ * @param[in] src_strd
+@ * input stride
+@ *
+@ * @param[in] pred_strd
+@ * prediction stride
+@ *
+@ * @param[in] dst_strd
+@ * output stride
+@ *
+@ * @param[in] zero_cols
+@ *  zero columns in pi2_src
+@ *
+@ * @param[in] zero_rows
+@ *  zero rows in pi2_src
+@ *
+@ * @returns void
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@ */
+
+@void ihevc_itrans_recon_8x8(word16 *pi2_src,
+@ word16 *pi2_tmp,
+@ uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 zero_cols,
+@ word32 zero_rows )
+
+@**************variables vs registers*************************
+@ r0 => *pi2_src
+@ r1 => *pi2_tmp
+@ r2 => *pu1_pred
+@ r3 => *pu1_dst
+@ stack => src_strd, pred_strd, dst_strd, zero_cols, zero_rows
+@ (read from [sp]..[sp,#16] after the sp adjustment at entry)
+
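+
+@ a hedged C sketch of one 8-point idct stage, reconstructed from the
+@ instruction comments below; cos1..cos4/sin1..sin3 stand for the constants
+@ loaded from g_ai2_ihevc_trans_8_transpose into d0/d1 (names illustrative,
+@ not the reference code):
+@
+@     /* even half */
+@     c0 = cos4 * (y[0] + y[4]);       c1 = cos4 * (y[0] - y[4]);
+@     d0 = cos2 * y[2] + sin2 * y[6];  d1 = sin2 * y[2] - cos2 * y[6];
+@     a0 = c0 + d0;  a1 = c1 + d1;  a2 = c1 - d1;  a3 = c0 - d0;
+@     /* odd half */
+@     b0 = cos1*y[1] + cos3*y[3] + sin3*y[5] + sin1*y[7];
+@     b1 = cos3*y[1] - sin1*y[3] - cos1*y[5] - sin3*y[7];
+@     b2 = sin3*y[1] - cos1*y[3] + sin1*y[5] + cos3*y[7];
+@     b3 = sin1*y[1] - sin3*y[3] + cos3*y[5] - cos1*y[7];
+@     /* butterfly; rnd = 1 << (shift - 1), shift = 7 then 12 */
+@     r[0] = (a0 + b0 + rnd) >> shift;  r[7] = (a0 - b0 + rnd) >> shift;
+@     r[1] = (a1 + b1 + rnd) >> shift;  r[6] = (a1 - b1 + rnd) >> shift;
+@     r[2] = (a2 + b2 + rnd) >> shift;  r[5] = (a2 - b2 + rnd) >> shift;
+@     r[3] = (a3 + b3 + rnd) >> shift;  r[4] = (a3 - b3 + rnd) >> shift;
+@
+@ zero_cols/zero_rows flag groups of all-zero columns/rows; when the last
+@ four are flagged (the 0xf0 compares below), their loads and multiplies
+@ are skipped.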
+
+
+.text
+.align 4
+
+
+
+
+.set width_x_size_x5 , 40
+.set width_x_size_x2 , 32
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+.globl ihevc_itrans_recon_8x8_a9q
+
+.extern g_ai2_ihevc_trans_8_transpose
+
+g_ai2_ihevc_trans_8_transpose_addr:
+.long g_ai2_ihevc_trans_8_transpose - ulbl1 - 8
+
+.type ihevc_itrans_recon_8x8_a9q, %function
+
+ihevc_itrans_recon_8x8_a9q:
+@//register usage.extern - loading and until idct of columns
+@// cosine constants - d0
+@// sine constants - d1
+@// row 0 first half - d2 - y0
+@// row 1 first half - d6 - y1
+@// row 2 first half - d3 - y2
+@// row 3 first half - d7 - y3
+@// row 4 first half - d10 - y4
+@// row 5 first half - d14 - y5
+@// row 6 first half - d11 - y6
+@// row 7 first half - d15 - y7
+
+@// row 0 second half - d4 - y0
+@// row 1 second half - d8 - y1
+@// row 2 second half - d5 - y2
+@// row 3 second half - d9 - y3
+@// row 4 second half - d12 - y4
+@// row 5 second half - d16 - y5
+@// row 6 second half - d13 - y6
+@// row 7 second half - d17 - y7
+
+ @// copy the input pointer to another register
+ @// step 1 : load all constants
+ stmfd sp!,{r4-r12,lr}
+    add sp,sp,#40 @ point sp at the stacked arguments
+ ldr r8,[sp,#4] @ prediction stride
+ ldr r7,[sp,#8] @ destination stride
+ ldr r6,[sp] @ src stride
+    ldr r12,[sp,#12] @ zero_cols
+    ldr r11,[sp,#16] @ zero_rows
+ mov r6,r6,lsl #1 @ x sizeof(word16)
+ add r9,r0,r6, lsl #1 @ 2 rows
+
+ add r10,r6,r6, lsl #1 @ 3 rows
+
+ sub r10,r10, #8 @ - 4 cols * sizeof(word16)
+ sub r5,r6, #8 @ src_strd - 4 cols * sizeof(word16)
+
+ ldr r14,g_ai2_ihevc_trans_8_transpose_addr
+ulbl1:
+ add r14,r14,pc
+ vld1.16 {d0,d1},[r14] @//d0,d1 are used for storing the constant data
+
+ @//step 2 load all the input data
+    @//step 3 operate first 4 columns at a time
+
+ and r11,r11,#0xff
+ and r12,r12,#0xff
+
+ cmp r11,#0xf0
+ bge skip_last4_rows
+
+
+ vld1.16 d2,[r0]!
+ vld1.16 d3,[r9]!
+ vld1.16 d4,[r0],r5
+ vmull.s16 q10,d2,d0[0] @// y0 * cos4(part of c0 and c1)
+ vld1.16 d5,[r9],r5
+ vmull.s16 q9,d3,d1[2] @// y2 * sin2 (q3 is freed by this time)(part of d1)
+ vld1.16 d6,[r0]!
+ vld1.16 d7,[r9]!
+ vmull.s16 q12,d6,d0[1] @// y1 * cos1(part of b0)
+ vld1.16 d8,[r0],r10
+ vmull.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vld1.16 d9,[r9],r10
+ vmull.s16 q14,d6,d1[1] @// y1 * sin3(part of b2)
+ vld1.16 d10,[r0]!
+ vmull.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+ vld1.16 d11,[r9]!
+ vmlal.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vld1.16 d12,[r0],r5
+ vmlsl.s16 q13,d7,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vld1.16 d13,[r9],r5
+ vmlsl.s16 q14,d7,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vld1.16 d14,[r0]!
+ vmlsl.s16 q15,d7,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+ vld1.16 d15,[r9]!
+ vmull.s16 q11,d10,d0[0] @// y4 * cos4(part of c0 and c1)
+ vld1.16 d16,[r0],r10
+ vmull.s16 q3,d3,d0[2] @// y2 * cos2(part of d0)
+ vld1.16 d17,[r9],r10
+
+    @/* the following loads are used when the input is not aligned */
+@// vld1.16 d2,[r0]!
+@// vld1.16 d3,[r2]!
+@// vld1.16 d4,[r0]!
+@// vld1.16 d5,[r2]!
+@// vld1.16 d6,[r0]!
+@// vld1.16 d7,[r2]!
+@// vld1.16 d8,[r0],r3
+@// vld1.16 d9,[r2],r3
+@// vld1.16 d10,[r0]!
+@// vld1.16 d11,[r2]!
+@// vld1.16 d12,[r0]!
+@// vld1.16 d13,[r2]!
+@// vld1.16 d14,[r0]!
+@// vld1.16 d15,[r2]!
+@// vld1.16 d16,[r0],r3
+@// vld1.16 d17,[r2],r3
+
+
+
+
+ vmlal.s16 q12,d14,d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ vmlsl.s16 q13,d14,d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ vmlal.s16 q14,d14,d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ vmlal.s16 q15,d14,d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ vmlsl.s16 q9,d11,d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ vmlal.s16 q3,d11,d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q5,q10,q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10,q10,q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12,d15,d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+ vmlsl.s16 q13,d15,d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+ vmlal.s16 q14,d15,d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+ vmlsl.s16 q15,d15,d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+ vadd.s32 q7,q5,q3 @// a0 = c0 + d0(part of r0,r7)
+ vsub.s32 q5,q5,q3 @// a3 = c0 - d0(part of r3,r4)
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of r1,r6)
+
+ vadd.s32 q10,q7,q12 @// a0 + b0(part of r0)
+ vsub.s32 q3,q7,q12 @// a0 - b0(part of r7)
+
+ vadd.s32 q12,q11,q14 @// a2 + b2(part of r2)
+ vsub.s32 q11,q11,q14 @// a2 - b2(part of r5)
+
+ vadd.s32 q14,q9,q13 @// a1 + b1(part of r1)
+ vsub.s32 q9,q9,q13 @// a1 - b1(part of r6)
+
+ vadd.s32 q13,q5,q15 @// a3 + b3(part of r3)
+ vsub.s32 q15,q5,q15 @// a3 - b3(part of r4)
+
+ vqrshrn.s32 d2,q10,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q3,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d3,q12,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q11,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d6,q14,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d11,q9,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d7,q13,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d10,q15,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ b last4_cols
+
+
+
+skip_last4_rows:
+
+
+
+ vld1.16 d2,[r0]!
+ vld1.16 d3,[r9]!
+ vld1.16 d4,[r0],r5
+ vld1.16 d5,[r9],r5
+ vld1.16 d6,[r0]!
+ vld1.16 d7,[r9]!
+ vld1.16 d8,[r0],r10
+ vld1.16 d9,[r9],r10
+
+
+
+ vmov.s16 q6,#0
+ vmov.s16 q8,#0
+
+
+
+
+ vmull.s16 q12,d6,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d7,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q9,d3,d1[2] @// y2 * sin2 (q3 is freed by this time)(part of d1)
+ vmull.s16 q3,d3,d0[2] @// y2 * cos2(part of d0)
+
+ vmull.s16 q10,d2,d0[0] @// y0 * cos4(part of c0 and c1)
+
+
+ vadd.s32 q7,q10,q3 @// a0 = c0 + d0(part of r0,r7)
+ vsub.s32 q5,q10,q3 @// a3 = c0 - d0(part of r3,r4)
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of r1,r6)
+
+ vadd.s32 q10,q7,q12 @// a0 + b0(part of r0)
+ vsub.s32 q3,q7,q12 @// a0 - b0(part of r7)
+
+ vadd.s32 q12,q11,q14 @// a2 + b2(part of r2)
+ vsub.s32 q11,q11,q14 @// a2 - b2(part of r5)
+
+ vadd.s32 q14,q9,q13 @// a1 + b1(part of r1)
+ vsub.s32 q9,q9,q13 @// a1 - b1(part of r6)
+
+ vadd.s32 q13,q5,q15 @// a3 + b3(part of r3)
+ vsub.s32 q15,q5,q15 @// a3 - b3(part of r4)
+
+ vqrshrn.s32 d2,q10,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d15,q3,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d3,q12,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d14,q11,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d6,q14,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d11,q9,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d7,q13,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d10,q15,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+last4_cols:
+
+
+ cmp r12,#0xf0
+ bge skip_last4cols
+
+ vmull.s16 q12,d8,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d8,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d8,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d8,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d9,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d9,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d9,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d9,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q9,d5,d1[2] @// y2 * sin2 (q4 is freed by this time)(part of d1)
+ vmull.s16 q4,d5,d0[2] @// y2 * cos2(part of d0)
+
+ vmull.s16 q10,d4,d0[0] @// y0 * cos4(part of c0 and c1)
+ vmull.s16 q11,d12,d0[0] @// y4 * cos4(part of c0 and c1)
+
+ vmlal.s16 q12,d16,d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ vmlsl.s16 q13,d16,d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ vmlal.s16 q14,d16,d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ vmlal.s16 q15,d16,d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ vmlsl.s16 q9,d13,d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ vmlal.s16 q4,d13,d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q6,q10,q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10,q10,q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12,d17,d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ vmlsl.s16 q13,d17,d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ vmlal.s16 q14,d17,d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ vmlsl.s16 q15,d17,d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+
+ vadd.s32 q8,q6,q4 @// a0 = c0 + d0(part of e0,e7)
+ vsub.s32 q6,q6,q4 @// a3 = c0 - d0(part of e3,e4)
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of e2,e5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of e1,e6)
+
+ vadd.s32 q10,q8,q12 @// a0 + b0(part of e0)
+ vsub.s32 q4,q8,q12 @// a0 - b0(part of e7)
+
+ vadd.s32 q12,q11,q14 @// a2 + b2(part of e2)
+ vsub.s32 q11,q11,q14 @// a2 - b2(part of e5)
+
+ vadd.s32 q14,q9,q13 @// a1 + b1(part of e1)
+ vsub.s32 q9,q9,q13 @// a1 - b1(part of e6)
+
+ vadd.s32 q13,q6,q15 @// a3 + b3(part of e3)
+    vsub.s32 q15,q6,q15 @// a3 - b3(part of e4)
+
+ vqrshrn.s32 d4,q10,#shift_stage1_idct @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d17,q4,#shift_stage1_idct @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d5,q12,#shift_stage1_idct @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d16,q11,#shift_stage1_idct @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d8,q14,#shift_stage1_idct @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d13,q9,#shift_stage1_idct @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d9,q13,#shift_stage1_idct @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ vqrshrn.s32 d12,q15,#shift_stage1_idct @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+ b end_skip_last4cols
+
+
+
+skip_last4cols:
+
+
+
+
+
+
+    vtrn.16 q1,q3 @//[r3,r1],[r2,r0] first quadrant transposing
+
+    vtrn.16 q5,q7 @//[r7,r5],[r6,r4] third quadrant transposing
+
+
+    vtrn.32 d6,d7 @//r0,r1,r2,r3 first quadrant transposing continued
+    vtrn.32 d2,d3 @//r0,r1,r2,r3 first quadrant transposing continued
+
+    vtrn.32 d10,d11 @//r4,r5,r6,r7 third quadrant transposing continued
+    vtrn.32 d14,d15 @//r4,r5,r6,r7 third quadrant transposing continued
+
+
+ vmull.s16 q12,d6,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d7,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q10,d2,d0[0] @// y0 * cos4(part of c0 and c1)
+@ vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
+
+ vmull.s16 q9,d3,d1[2] @// y2 * sin2 (q3 is freed by this time)(part of d1)
+ vmull.s16 q3,d3,d0[2] @// y2 * cos2(part of d0)
+
+
+
+
+ vsub.s32 q11,q10,q3 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q2,q10,q3 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q1,q2,q12
+
+ vsub.s32 q3,q2,q12
+
+ vadd.s32 q4,q11,q15
+
+ vsub.s32 q12,q11,q15
+
+ vqrshrn.s32 d5,q4,#shift_stage2_idct
+ vqrshrn.s32 d2,q1,#shift_stage2_idct
+ vqrshrn.s32 d9,q3,#shift_stage2_idct
+ vqrshrn.s32 d6,q12,#shift_stage2_idct
+
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q15,q11,q14
+
+ vsub.s32 q12,q11,q14
+
+ vadd.s32 q14,q9,q13
+
+ vsub.s32 q11,q9,q13
+ vqrshrn.s32 d4,q15,#shift_stage2_idct
+ vqrshrn.s32 d7,q12,#shift_stage2_idct
+ vqrshrn.s32 d3,q14,#shift_stage2_idct
+ vqrshrn.s32 d8,q11,#shift_stage2_idct
+
+
+
+
+
+
+
+
+
+
+ vmull.s16 q12,d14,d0[1] @// y1 * cos1(part of b0)
+
+ vmull.s16 q13,d14,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d14,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d14,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d15,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vtrn.16 d2,d3
+ vmlsl.s16 q13,d15,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vtrn.16 d4,d5
+ vmlsl.s16 q14,d15,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vtrn.16 d6,d7
+ vmlsl.s16 q15,d15,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+ vtrn.16 d8,d9
+ vmull.s16 q10,d10,d0[0] @// y0 * cos4(part of c0 and c1)
+ vtrn.32 d2,d4
+
+ vtrn.32 d3,d5
+ vmull.s16 q9,d11,d1[2] @// y2 * sin2 (q7 is freed by this time)(part of d1)
+ vtrn.32 d6,d8
+ vmull.s16 q7,d11,d0[2] @// y2 * cos2(part of d0)
+ vtrn.32 d7,d9
+
+
+ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
+
+
+ add r5,r8,r8, lsl #1 @
+
+
+ add r0,r3,r7, lsl #1 @ r0 points to 3rd row of dest data
+
+
+ add r10,r7,r7, lsl #1 @
+
+
+ vswp d3,d6
+
+
+ vswp d5,d8
+
+
+ vsub.s32 q11,q10,q7 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q6,q10,q7 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q0,q6,q12
+
+
+ vsub.s32 q12,q6,q12
+
+
+ vadd.s32 q6,q11,q15
+
+
+ vsub.s32 q7,q11,q15
+
+ vqrshrn.s32 d10,q0,#shift_stage2_idct
+ vqrshrn.s32 d17,q12,#shift_stage2_idct
+ vqrshrn.s32 d13,q6,#shift_stage2_idct
+ vqrshrn.s32 d14,q7,#shift_stage2_idct
+
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q0,q11,q14
+
+
+ vsub.s32 q12,q11,q14
+
+
+ vadd.s32 q14,q9,q13
+
+
+ vsub.s32 q13,q9,q13
+ vld1.8 d18,[r2],r8
+
+ vqrshrn.s32 d12,q0,#shift_stage2_idct
+ vld1.8 d20,[r2],r5
+
+
+ vqrshrn.s32 d15,q12,#shift_stage2_idct
+ vld1.8 d19,[r2],r8
+
+
+
+
+ vqrshrn.s32 d11,q14,#shift_stage2_idct
+ vld1.8 d22,[r4],r8
+
+
+
+
+ vqrshrn.s32 d16,q13,#shift_stage2_idct
+ vld1.8 d21,[r2],r5
+
+
+ b pred_buff_addition
+end_skip_last4cols:
+
+
+
+@/* now the idct of columns is done, transpose so that the row idct is done efficiently (step 5) */
+    vtrn.16 q1,q3 @//[r3,r1],[r2,r0] first quadrant transposing
+    vtrn.16 q2,q4 @//[r3,r1],[r2,r0] second quadrant transposing
+    vtrn.16 q5,q7 @//[r7,r5],[r6,r4] third quadrant transposing
+    vtrn.16 q6,q8 @//[r7,r5],[r6,r4] fourth quadrant transposing
+
+    vtrn.32 d6,d7 @//r0,r1,r2,r3 first quadrant transposing continued
+    vtrn.32 d2,d3 @//r0,r1,r2,r3 first quadrant transposing continued
+    vtrn.32 d4,d5 @//r0,r1,r2,r3 second quadrant transposing continued
+    vtrn.32 d8,d9 @//r0,r1,r2,r3 second quadrant transposing continued
+    vtrn.32 d10,d11 @//r4,r5,r6,r7 third quadrant transposing continued
+    vtrn.32 d14,d15 @//r4,r5,r6,r7 third quadrant transposing continued
+    vtrn.32 d12,d13 @//r4,r5,r6,r7 fourth quadrant transposing continued
+    vtrn.32 d16,d17 @//r4,r5,r6,r7 fourth quadrant transposing continued
+
+ @//step6 operate on first four rows and find their idct
+ @//register usage.extern - storing and idct of rows
+@// cosine constants - d0
+@// sine constants - d1
+@// element 0 first four - d2 - y0
+@// element 1 first four - d6 - y1
+@// element 2 first four - d3 - y2
+@// element 3 first four - d7 - y3
+@// element 4 first four - d4 - y4
+@// element 5 first four - d8 - y5
+@// element 6 first four - d5 - y6
+@// element 7 first four - d9 - y7
+@// element 0 second four - d10 - y0
+@// element 1 second four - d14 - y1
+@// element 2 second four - d11 - y2
+@// element 3 second four - d15 - y3
+@// element 4 second four - d12 - y4
+@// element 5 second four - d16 - y5
+@// element 6 second four - d13 - y6
+@// element 7 second four - d17 - y7
+
+ @// map between first kernel code seq and current
+@// d2 -> d2
+@// d6 -> d6
+@// d3 -> d3
+@// d7 -> d7
+@// d10 -> d4
+@// d14 -> d8
+@// d11 -> d5
+@// d15 -> d9
+@// q3 -> q3
+@// q5 -> q2
+@// q7 -> q4
+
+ vmull.s16 q12,d6,d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13,d6,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d6,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d6,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d7,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13,d7,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14,d7,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15,d7,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q10,d2,d0[0] @// y0 * cos4(part of c0 and c1)
+ vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
+
+ vmull.s16 q9,d3,d1[2] @// y2 * sin2 (q3 is freed by this time)(part of d1)
+ vmull.s16 q3,d3,d0[2] @// y2 * cos2(part of d0)
+
+
+ vmlal.s16 q12,d8,d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ vmlsl.s16 q13,d8,d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ vmlal.s16 q14,d8,d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ vmlal.s16 q15,d8,d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ vmlsl.s16 q9,d5,d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ vmlal.s16 q3,d5,d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q1,q10,q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10,q10,q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12,d9,d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+ vmlsl.s16 q13,d9,d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+ vmlal.s16 q14,d9,d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+ vmlsl.s16 q15,d9,d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+ vsub.s32 q11,q1,q3 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q2,q1,q3 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q1,q2,q12
+
+ vsub.s32 q3,q2,q12
+
+ vadd.s32 q4,q11,q15
+
+ vsub.s32 q12,q11,q15
+
+ vqrshrn.s32 d5,q4,#shift_stage2_idct
+ vqrshrn.s32 d2,q1,#shift_stage2_idct
+ vqrshrn.s32 d9,q3,#shift_stage2_idct
+ vqrshrn.s32 d6,q12,#shift_stage2_idct
+
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q15,q11,q14
+
+ vsub.s32 q12,q11,q14
+
+ vadd.s32 q14,q9,q13
+
+ vsub.s32 q11,q9,q13
+ vqrshrn.s32 d4,q15,#shift_stage2_idct
+ vqrshrn.s32 d7,q12,#shift_stage2_idct
+ vqrshrn.s32 d3,q14,#shift_stage2_idct
+ vqrshrn.s32 d8,q11,#shift_stage2_idct
+
+
+
+
+
+
+
+
+
+
+ vmull.s16 q12,d14,d0[1] @// y1 * cos1(part of b0)
+
+ vmull.s16 q13,d14,d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14,d14,d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15,d14,d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12,d15,d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vtrn.16 d2,d3
+ vmlsl.s16 q13,d15,d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vtrn.16 d4,d5
+ vmlsl.s16 q14,d15,d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vtrn.16 d6,d7
+ vmlsl.s16 q15,d15,d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+ vtrn.16 d8,d9
+ vmull.s16 q10,d10,d0[0] @// y0 * cos4(part of c0 and c1)
+ vtrn.32 d2,d4
+ vmull.s16 q11,d12,d0[0] @// y4 * cos4(part of c0 and c1)
+ vtrn.32 d3,d5
+ vmull.s16 q9,d11,d1[2] @// y2 * sin2 (q7 is freed by this time)(part of d1)
+ vtrn.32 d6,d8
+ vmull.s16 q7,d11,d0[2] @// y2 * cos2(part of d0)
+ vtrn.32 d7,d9
+ vmlal.s16 q12,d16,d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+
+ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
+ vmlsl.s16 q13,d16,d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+
+ add r5,r8,r8, lsl #1 @
+ vmlal.s16 q14,d16,d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+
+ add r0,r3,r7, lsl #1 @ r0 points to 3rd row of dest data
+ vmlal.s16 q15,d16,d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ add r10,r7,r7, lsl #1 @
+ vmlsl.s16 q9,d13,d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+
+
+ vmlal.s16 q7,d13,d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q6,q10,q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10,q10,q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12,d17,d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+ vswp d3,d6
+ vmlsl.s16 q13,d17,d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+
+ vswp d5,d8
+ vmlal.s16 q14,d17,d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+ vmlsl.s16 q15,d17,d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+ vsub.s32 q11,q6,q7 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q6,q6,q7 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q0,q6,q12
+
+
+ vsub.s32 q12,q6,q12
+
+
+ vadd.s32 q6,q11,q15
+
+
+ vsub.s32 q7,q11,q15
+
+ vqrshrn.s32 d10,q0,#shift_stage2_idct
+ vqrshrn.s32 d17,q12,#shift_stage2_idct
+ vqrshrn.s32 d13,q6,#shift_stage2_idct
+ vqrshrn.s32 d14,q7,#shift_stage2_idct
+
+ vsub.s32 q11,q10,q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9,q10,q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q0,q11,q14
+
+
+ vsub.s32 q12,q11,q14
+
+
+ vadd.s32 q14,q9,q13
+
+
+ vsub.s32 q13,q9,q13
+ vld1.8 d18,[r2],r8
+
+ vqrshrn.s32 d12,q0,#shift_stage2_idct
+ vld1.8 d20,[r2],r5
+
+
+ vqrshrn.s32 d15,q12,#shift_stage2_idct
+ vld1.8 d19,[r2],r8
+
+
+
+
+ vqrshrn.s32 d11,q14,#shift_stage2_idct
+ vld1.8 d22,[r4],r8
+
+
+
+
+ vqrshrn.s32 d16,q13,#shift_stage2_idct
+ vld1.8 d21,[r2],r5
+
+
+
+
+pred_buff_addition:
+
+
+ vtrn.16 d10,d11
+ vld1.8 d24,[r4],r5
+
+ vtrn.16 d12,d13
+ vld1.8 d23,[r4],r8
+
+ vaddw.u8 q1,q1,d18
+ vld1.8 d25,[r4],r5
+
+ vtrn.16 d14,d15
+ vaddw.u8 q2,q2,d22
+
+ vtrn.16 d16,d17
+ vaddw.u8 q3,q3,d20
+
+ vtrn.32 d10,d12
+ vaddw.u8 q4,q4,d24
+
+ vtrn.32 d11,d13
+ vtrn.32 d14,d16
+ vtrn.32 d15,d17
+
+ vswp d11,d14
+ vswp d13,d16
+
+@ row values stored in the q registers:
+
+@q1 :r0
+@q3: r1
+@q2: r2
+@q4: r3
+@q5: r4
+@q7: r5
+@q6: r6
+@q8: r7
+
+
+
+@/// add the prediction buffer (rows loaded above, interleaved with the
+@/// transform) to the idct output and clip the result to 8 bit
+
+
+ vaddw.u8 q5,q5,d19
+ vqmovun.s16 d2,q1
+ vaddw.u8 q7,q7,d21
+ vqmovun.s16 d4,q2
+ vaddw.u8 q6,q6,d23
+ vqmovun.s16 d6,q3
+ vaddw.u8 q8,q8,d25
+ vqmovun.s16 d8,q4
+
+
+
+
+
+
+
+ vst1.8 {d2},[r3],r7
+ vqmovun.s16 d10,q5
+ vst1.8 {d6},[r3],r10
+ vqmovun.s16 d14,q7
+ vst1.8 {d4},[r0],r7
+ vqmovun.s16 d12,q6
+ vst1.8 {d8},[r0],r10
+ vqmovun.s16 d16,q8
+
+
+
+
+
+
+
+ vst1.8 {d10},[r3],r7
+ vst1.8 {d14},[r3],r10
+ vst1.8 {d12},[r0],r7
+ vst1.8 {d16},[r0],r10
+
+
+
+
+    sub sp,sp,#40 @ undo the sp adjustment done at entry
+ ldmfd sp!,{r4-r12,pc}
+
+
+
+
+
diff --git a/common/arm/ihevc_mem_fns.s b/common/arm/ihevc_mem_fns.s
new file mode 100644
index 0000000..21b5570
--- /dev/null
+++ b/common/arm/ihevc_mem_fns.s
@@ -0,0 +1,279 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ *  ihevc_mem_fns.s
+@ *
+@ * @brief
+@ *  Contains function definitions for memory manipulation
+@ *
+@ * @author
+@ *  Naveen SR
+@ *
+@ * @par List of Functions:
+@ *  - ihevc_memcpy()
+@ *  - ihevc_memset_mul_8()
+@ *  - ihevc_memset_16bit_mul_8()
+@ *
+@ * @remarks
+@ *  None
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* memcpy of a 1d array
+@*
+@* @par Description:
+@* copies 8-bit data from source to destination; num_bytes is assumed to be
+@* a multiple of 8 (typically 8, 16 or 32)
+@*
+@* @param[in] pu1_dst
+@* UWORD8 pointer to the destination
+@*
+@* @param[in] pu1_src
+@* UWORD8 pointer to the source
+@*
+@* @param[in] num_bytes
+@* number of bytes to copy
+@* @returns
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_memcpy_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+.text
+.p2align 2
+
+
+
+
+ .global ihevc_memcpy_mul_8_a9q
+.type ihevc_memcpy_mul_8_a9q, %function
+
+ihevc_memcpy_mul_8_a9q:
+
+LOOP_NEON_MEMCPY_MUL_8:
+ @ Memcpy 8 bytes
+ VLD1.8 d0,[r1]!
+ VST1.8 d0,[r0]!
+
+ SUBS r2,r2,#8
+ BNE LOOP_NEON_MEMCPY_MUL_8
+ MOV PC,LR
+
+
+
+@*******************************************************************************
+@*/
+@void ihevc_memcpy(UWORD8 *pu1_dst,
+@ UWORD8 *pu1_src,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => *pu1_src
+@ r2 => num_bytes
+
+
+
+ .global ihevc_memcpy_a9q
+.type ihevc_memcpy_a9q, %function
+
+ihevc_memcpy_a9q:
+ SUBS r2,#8
+ BLT ARM_MEMCPY
+LOOP_NEON_MEMCPY:
+ @ Memcpy 8 bytes
+ VLD1.8 d0,[r1]!
+ VST1.8 d0,[r0]!
+
+ SUBS r2,#8
+ BGE LOOP_NEON_MEMCPY
+ CMP r2,#-8
+ BXEQ LR
+
+ARM_MEMCPY:
+ ADD r2,#8
+
+LOOP_ARM_MEMCPY:
+ LDRB r3,[r1],#1
+ STRB r3,[r0],#1
+ SUBS r2,#1
+ BNE LOOP_ARM_MEMCPY
+ BX LR
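+
+@ Equivalent C sketch of the hybrid loop above (names are illustrative and
+@ the library memcpy stands in for VLD1.8/VST1.8; the memset variants below
+@ follow the same NEON-chunk-plus-scalar-tail shape):
+@
+@ void memcpy_sketch(UWORD8 *pu1_dst, UWORD8 *pu1_src, WORD32 num_bytes)
+@ {
+@     while(num_bytes >= 8)          /* NEON path: 8 bytes per iteration */
+@     {
+@         memcpy(pu1_dst, pu1_src, 8);
+@         pu1_dst += 8; pu1_src += 8; num_bytes -= 8;
+@     }
+@     while(num_bytes-- > 0)         /* scalar tail: LDRB/STRB, < 8 bytes */
+@         *pu1_dst++ = *pu1_src++;
+@ }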
+
+
+
+
+@void ihevc_memset_mul_8(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+.text
+.p2align 2
+
+
+
+ .global ihevc_memset_mul_8_a9q
+.type ihevc_memset_mul_8_a9q, %function
+
+ihevc_memset_mul_8_a9q:
+
+@ Assumptions: num_bytes is either 8, 16 or 32
+ VDUP.8 d0,r1
+LOOP_MEMSET_MUL_8:
+ @ Memset 8 bytes
+ VST1.8 d0,[r0]!
+
+ SUBS r2,r2,#8
+ BNE LOOP_MEMSET_MUL_8
+
+ BX LR
+
+
+
+
+@void ihevc_memset(UWORD8 *pu1_dst,
+@ UWORD8 value,
+@ UWORD8 num_bytes)
+@**************Variables Vs Registers*************************
+@ r0 => *pu1_dst
+@ r1 => value
+@ r2 => num_bytes
+
+
+
+ .global ihevc_memset_a9q
+.type ihevc_memset_a9q, %function
+
+ihevc_memset_a9q:
+ SUBS r2,#8
+ BLT ARM_MEMSET
+ VDUP.8 d0,r1
+LOOP_NEON_MEMSET:
+    @ Memset 8 bytes
+ VST1.8 d0,[r0]!
+
+ SUBS r2,#8
+ BGE LOOP_NEON_MEMSET
+ CMP r2,#-8
+ BXEQ LR
+
+ARM_MEMSET:
+ ADD r2,#8
+
+LOOP_ARM_MEMSET:
+ STRB r1,[r0],#1
+ SUBS r2,#1
+ BNE LOOP_ARM_MEMSET
+ BX LR
+
+
+
+
+@void ihevc_memset_16bit_mul_8(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+.text
+.p2align 2
+
+
+
+ .global ihevc_memset_16bit_mul_8_a9q
+.type ihevc_memset_16bit_mul_8_a9q, %function
+
+ihevc_memset_16bit_mul_8_a9q:
+
+@ Assumptions: num_words is either 8, 16 or 32
+
+ @ Memset 8 words
+ VDUP.16 d0,r1
+LOOP_MEMSET_16BIT_MUL_8:
+ VST1.16 d0,[r0]!
+ VST1.16 d0,[r0]!
+
+ SUBS r2,r2,#8
+ BNE LOOP_MEMSET_16BIT_MUL_8
+
+ BX LR
+
+
+
+
+@void ihevc_memset_16bit(UWORD16 *pu2_dst,
+@ UWORD16 value,
+@ UWORD8 num_words)
+@**************Variables Vs Registers*************************
+@ r0 => *pu2_dst
+@ r1 => value
+@ r2 => num_words
+
+
+
+ .global ihevc_memset_16bit_a9q
+.type ihevc_memset_16bit_a9q, %function
+
+ihevc_memset_16bit_a9q:
+ SUBS r2,#8
+ BLT ARM_MEMSET_16BIT
+ VDUP.16 d0,r1
+LOOP_NEON_MEMSET_16BIT:
+ @ Memset 8 words
+ VST1.16 d0,[r0]!
+ VST1.16 d0,[r0]!
+
+ SUBS r2,#8
+ BGE LOOP_NEON_MEMSET_16BIT
+ CMP r2,#-8
+ BXEQ LR
+
+ARM_MEMSET_16BIT:
+ ADD r2,#8
+
+LOOP_ARM_MEMSET_16BIT:
+ STRH r1,[r0],#2
+ SUBS r2,#1
+ BNE LOOP_ARM_MEMSET_16BIT
+ BX LR
+
+
+
+
+ .section .note.GNU-stack,"",%progbits
+
diff --git a/common/arm/ihevc_padding.s b/common/arm/ihevc_padding.s
new file mode 100644
index 0000000..08d1f36
--- /dev/null
+++ b/common/arm/ihevc_padding.s
@@ -0,0 +1,531 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@ *******************************************************************************
+@ * @file
+@ * ihevc_padding.s
+@ *
+@ * @brief
+@ *  contains function definitions for padding
+@ *
+@ * @author
+@ * naveen sr
+@ *
+@ * @par list of functions:
+@ * - ihevc_pad_left_luma()
+@ * - ihevc_pad_left_chroma()
+@ *
+@ * @remarks
+@ * none
+@ *
+@ *******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* padding (luma block) at the left of a 2d array
+@*
+@* @par description:
+@* the left column of a 2d array is replicated pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@.if pad_left_luma == c
+@void ihevc_pad_left_luma(uword8 *pu1_src,
+@ word32 src_strd,
+@ word32 ht,
+@ word32 pad_size)
+@**************variables vs registers*************************
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
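+
+@ a C sketch of the intent (same spirit as the pseudocode embedded in the
+@ pad-right functions later in this file); assumes pad_size writable bytes
+@ exist to the left of each row:
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     memset(pu1_src - pad_size, pu1_src[0], pad_size);
+@     pu1_src += src_strd;
+@ }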
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_pad_left_luma_a9q
+
+.type ihevc_pad_left_luma_a9q, %function
+
+ihevc_pad_left_luma_a9q:
+
+ stmfd sp!, {r4-r11,lr} @stack stores the values of the arguments
+
+loop_start_luma_left:
+ @ pad size is assumed to be pad_left = 80
+ sub r4,r0,r3
+
+ ldrb r8,[r0]
+ add r0,r1
+ ldrb r9,[r0]
+ add r0,r1
+ ldrb r10,[r0]
+ add r0,r1
+ ldrb r11,[r0]
+ add r0,r1
+
+ vdup.u8 q0,r8
+ vdup.u8 q1,r9
+ vdup.u8 q2,r10
+ vdup.u8 q3,r11
+
+ add r5,r4,r1
+
+ vst1.8 {d0,d1},[r4]! @128/8 = 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4] @ 16 bytes store
+
+ add r6,r5,r1
+
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5] @128/8 = 16 bytes store
+
+ add r7,r6,r1
+
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6] @128/8 = 16 bytes store
+
+ subs r2,#4
+
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+
+ @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+ bne loop_start_luma_left
+
+ ldmfd sp!,{r4-r11,pc} @reload the registers from sp
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* padding (chroma block) at the left of a 2d array
+@*
+@* @par description:
+@* the left column of a 2d array is replicated pad_size times at the left
+@*
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@.if pad_left_chroma == c
+@void ihevc_pad_left_chroma(uword8 *pu1_src,
+@ word32 src_strd,
+@ word32 ht,
+@ word32 pad_size)
+@{
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
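+
+@ a C sketch of the intent: the first interleaved u/v pair of each row is
+@ replicated across the left pad; pad_size is taken in bytes here (an
+@ assumption consistent with the 80-byte stores below):
+@
+@ for(row = 0; row < ht; row++)
+@ {
+@     for(col = -pad_size; col < 0; col += 2)
+@     {
+@         pu1_src[col]     = pu1_src[0];  /* u */
+@         pu1_src[col + 1] = pu1_src[1];  /* v */
+@     }
+@     pu1_src += src_strd;
+@ }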
+
+
+
+.globl ihevc_pad_left_chroma_a9q
+
+.type ihevc_pad_left_chroma_a9q, %function
+
+ihevc_pad_left_chroma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+loop_start_chroma_left:
+ @ pad size is assumed to be pad_left = 80
+ sub r4,r0,r3
+
+ ldrh r8,[r0]
+ add r0,r1
+ ldrh r9,[r0]
+ add r0,r1
+ ldrh r10,[r0]
+ add r0,r1
+ ldrh r11,[r0]
+ add r0,r1
+
+ vdup.u16 q0,r8
+ vdup.u16 q1,r9
+ vdup.u16 q2,r10
+ vdup.u16 q3,r11
+
+ add r5,r4,r1
+
+ vst1.8 {d0,d1},[r4]! @128/8 = 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4] @ 16 bytes store
+
+ add r6,r5,r1
+
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5] @128/8 = 16 bytes store
+
+ add r7,r6,r1
+
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6] @128/8 = 16 bytes store
+
+ subs r2,#4
+
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+
+ @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+ bne loop_start_chroma_left
+
+ ldmfd sp!,{r4-r11,pc} @reload the registers from sp
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* padding (luma block) at the right of a 2d array
+@*
+@* @par description:
+@* the right column of a 2d array is replicated pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@.if pad_right_luma == c
+@void ihevc_pad_right_luma(uword8 *pu1_src,
+@ word32 src_strd,
+@ word32 ht,
+@ word32 pad_size)
+@{
+@    word32 row;
+@
+@    for(row = 0; row < ht; row++)
+@    {
+@        memset(pu1_src, *(pu1_src - 1), pad_size);
+@
+@        pu1_src += src_strd;
+@    }
+@}
+@
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+.globl ihevc_pad_right_luma_a9q
+
+.type ihevc_pad_right_luma_a9q, %function
+
+ihevc_pad_right_luma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+loop_start_luma_right:
+ @ pad size is assumed to be pad_left = 80
+ mov r4,r0
+
+ ldrb r8,[r0, #-1]
+ add r0,r1
+ ldrb r9,[r0, #-1]
+ add r0,r1
+ ldrb r10,[r0, #-1]
+ add r0,r1
+ ldrb r11,[r0, #-1]
+ add r0,r1
+
+ add r5,r4,r1
+ add r6,r5,r1
+ add r7,r6,r1
+
+ vdup.u8 q0,r8
+ vdup.u8 q1,r9
+ vdup.u8 q2,r10
+ vdup.u8 q3,r11
+
+ vst1.8 {d0,d1},[r4]! @128/8 = 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4] @ 16 bytes store
+
+
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5] @128/8 = 16 bytes store
+
+ subs r2,#4
+
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6] @128/8 = 16 bytes store
+
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7] @128/8 = 16 bytes store
+
+
+ @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
+
+
+ bne loop_start_luma_right
+
+ ldmfd sp!,{r4-r11,pc} @reload the registers from sp
+
+
+
+
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* padding (chroma block) at the right of a 2d array
+@*
+@* @par description:
+@* the right column of a 2d array is replicated pad_size times at the right
+@*
+@*
+@* @param[in] pu1_src
+@* uword8 pointer to the source
+@*
+@* @param[in] src_strd
+@* integer source stride
+@*
+@* @param[in] ht
+@* integer height of the array
+@*
+@* @param[in] pad_size
+@* integer padding size of the array
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@.if pad_right_chroma == c
+@void ihevc_pad_right_chroma(uword8 *pu1_src,
+@ word32 src_strd,
+@ word32 ht,
+@ word32 pad_size)
+@ r0 => *pu1_src
+@ r1 => src_strd
+@ r2 => ht
+@ r3 => pad_size
+
+
+
+.globl ihevc_pad_right_chroma_a9q
+
+.type ihevc_pad_right_chroma_a9q, %function
+
+ihevc_pad_right_chroma_a9q:
+
+ stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
+
+loop_start_chroma_right:
+ @ pad size is assumed to be pad_left = 80
+ mov r4,r0
+
+ ldrh r8,[r0, #-2]
+ add r0,r1
+ ldrh r9,[r0, #-2]
+ add r0,r1
+ ldrh r10,[r0, #-2]
+ add r0,r1
+ ldrh r11,[r0, #-2]
+ add r0,r1
+
+ vdup.u16 q0,r8
+ vdup.u16 q1,r9
+ vdup.u16 q2,r10
+ vdup.u16 q3,r11
+
+ add r5,r4,r1
+
+ vst1.8 {d0,d1},[r4]! @128/8 = 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4]! @ 16 bytes store
+ vst1.8 {d0,d1},[r4] @ 16 bytes store
+
+ add r6,r5,r1
+
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5]! @128/8 = 16 bytes store
+ vst1.8 {d2,d3},[r5] @128/8 = 16 bytes store
+
+ add r7,r6,r1
+
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6]! @128/8 = 16 bytes store
+ vst1.8 {d4,d5},[r6] @128/8 = 16 bytes store
+
+ subs r2,#4
+
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7]! @128/8 = 16 bytes store
+ vst1.8 {d6,d7},[r7] @128/8 = 16 bytes store
+
+    @ total of 4 rows * (16*5) = 4 * 80 = 4 * pad_right bytes stored
+
+ bne loop_start_chroma_right
+
+ ldmfd sp!,{r4-r11,pc} @reload the registers from sp
+
+
+
+
+
+
+
+
diff --git a/common/arm/ihevc_platform_macros.h b/common/arm/ihevc_platform_macros.h
new file mode 100644
index 0000000..72ef0c3
--- /dev/null
+++ b/common/arm/ihevc_platform_macros.h
@@ -0,0 +1,149 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+#ifndef ARMV8
+static __inline WORD32 CLIP_U8(WORD32 x)
+{
+ asm("usat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S8(WORD32 x)
+{
+ asm("ssat %0, #8, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U10(WORD32 x)
+{
+ asm("usat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S10(WORD32 x)
+{
+ asm("ssat %0, #10, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U12(WORD32 x)
+{
+ asm("usat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_S12(WORD32 x)
+{
+ asm("ssat %0, #12, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static __inline WORD32 CLIP_U16(WORD32 x)
+{
+ asm("usat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+static __inline WORD32 CLIP_S16(WORD32 x)
+{
+ asm("ssat %0, #16, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+
+static __inline UWORD32 ITT_BIG_ENDIAN(UWORD32 x)
+{
+ asm("rev %0, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+#else
+
+#define CLIP_U8(x) CLIP3((x), 0, 255)
+#define CLIP_S8(x) CLIP3((x), -128, 127)
+
+#define CLIP_U10(x) CLIP3((x), 0, 1023)
+#define CLIP_S10(x) CLIP3((x), -512, 511)
+
+#define CLIP_U12(x) CLIP3((x), 0, 4095)
+#define CLIP_S12(x) CLIP3((x), -2048, 2047)
+
+#define CLIP_U16(x) CLIP3((x), 0, 65535)
+#define CLIP_S16(x) CLIP3((x), -32768, 32767)
+
+/* Byte-swap fallback; the arguments are parenthesised and the last term is
+ * cast so the shift is performed on an unsigned value. */
+#define ITT_BIG_ENDIAN(x) ((((x) & 0x000000ff) << 24) | \
+                           (((x) & 0x0000ff00) << 8)  | \
+                           (((x) & 0x00ff0000) >> 8)  | \
+                           (((UWORD32)(x)) >> 24))
+#endif
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) (((shift) > 0) ? ((val) >> (shift)) : ((val) << (-(shift))))
+#define SHL_NEG(val,shift) (((shift) < 0) ? ((val) >> (-(shift))) : ((val) << (shift)))
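+
+/* Usage sketch (illustrative only): SHR_NEG/SHL_NEG shift in a direction that
+ * depends on the sign of the shift amount, so a caller can apply a transform
+ * scale factor without branching, e.g.
+ *
+ *     WORD32 y1 = SHR_NEG(x,  2);   // x >> 2
+ *     WORD32 y2 = SHR_NEG(x, -2);   // x << 2
+ */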
+
+#define INLINE inline
+
+/* Count leading zeros; returns 32 for a zero input, since __builtin_clz(0)
+ * is undefined. */
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+    if(u4_word)
+        return (__builtin_clz(u4_word));
+    else
+        return 32;
+}
+
+/* Count trailing zeros; returns 31 for a zero input, since __builtin_ctz(0)
+ * is undefined. */
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
+    if(0 == u4_word)
+        return 31;
+    else
+    {
+        unsigned int index;
+        index = __builtin_ctz(u4_word);
+        return (UWORD32)index;
+    }
+}
+
+
+
+
+/* Busy-wait delay; note that an optimising compiler is free to remove this
+ * empty loop. */
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < (nop_cnt); nop_i++);}
+
+
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/arm/ihevc_sao_band_offset_chroma.s b/common/arm/ihevc_sao_band_offset_chroma.s
new file mode 100644
index 0000000..32e149d
--- /dev/null
+++ b/common/arm/ihevc_sao_band_offset_chroma.s
@@ -0,0 +1,393 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_band_offset_chroma.s
+@*
+@* @brief
+@* Contains function definitions for SAO band offset for chroma.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM
+@* RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_band_offset_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ WORD32 sao_band_pos_u,
+@ WORD32 sao_band_pos_v,
+@ WORD8 *pi1_sao_offset_u,
+@ WORD8 *pi1_sao_offset_v,
+@ WORD32 wd,
+@ WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => sao_band_pos_u
+@r6 => sao_band_pos_v
+@r7 => *pi1_sao_offset_u
+@r8 => *pi1_sao_offset_v
+@r9 => wd
+@r10=> ht
+
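+@ Scalar reference for what the NEON code below computes (a sketch; only the
+@ asm below is normative here). Each 8-bit sample belongs to one of 32 bands
+@ of width 8; the four bands starting at sao_band_pos get an offset:
+@
+@ band = pu1_src[x] >> 3;
+@ if(((band - sao_band_pos_u) & 31) < 4)
+@     pu1_src[x] = CLIP3(pu1_src[x] + pi1_sao_offset_u[((band - sao_band_pos_u) & 31) + 1],
+@                        0, 255);
+@ /* likewise for V with sao_band_pos_v / pi1_sao_offset_v; U and V samples
+@    are interleaved, hence the VLD2/VST2 de-interleaving below */
+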
+.text
+.p2align 2
+
+.extern gu1_table_band_idx
+.globl ihevc_sao_band_offset_chroma_a9q
+
+gu1_table_band_idx_addr_1:
+.long gu1_table_band_idx - ulbl1 - 8
+
+gu1_table_band_idx_addr_2:
+.long gu1_table_band_idx - ulbl2 - 8
+
+ihevc_sao_band_offset_chroma_a9q:
+
+ STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r10,[sp,#64] @Loads ht
+
+ LDR r9,[sp,#60] @Loads wd
+    MOV         r11,r10                     @Move ht to r11 for loop counter
+
+ ADD r12,r0,r9 @pu1_src[row * src_strd + (wd)]
+ LDR r14, gu1_table_band_idx_addr_1
+ulbl1:
+ add r14,r14,pc
+ SUB r12,r12,#2 @wd-2
+
+SRC_LEFT_LOOP:
+ LDRH r5,[r12],r1 @Load the value
+ SUBS r11,r11,#1 @Decrement the loop counter
+ STRH r5,[r2],#2 @Store the value in pu1_src_left pointer
+ BNE SRC_LEFT_LOOP
+
+ LDR r5,[sp,#44] @Loads sao_band_pos_u
+ VLD1.8 D1,[r14]! @band_table_u.val[0]
+ ADD r12,r3,r9 @pu1_src_top[wd]
+
+ LDRH r11,[r12,#-2]
+ VLD1.8 D2,[r14]! @band_table_u.val[1]
+ LSL r6,r5,#3 @sao_band_pos_u
+
+ STRH r11,[r4] @store to pu1_src_top_left[0]
+ VLD1.8 D3,[r14]! @band_table_u.val[2]
+ LDR r7,[sp,#52] @Loads pi1_sao_offset_u
+
+ SUB r4,r10,#1 @ht-1
+ VDUP.8 D31,r6 @band_pos_u
+ MUL r4,r4,r1 @ht-1 * src_strd
+
+ ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd]
+ VLD1.8 D4,[r14]! @band_table_u.val[3]
+    MOV         r11,r9                      @Move wd to r11 for loop counter
+
+SRC_TOP_LOOP: @wd is always multiple of 8
+ VLD1.8 D0,[r4]! @Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS r11,r11,#8 @Decrement the loop counter by 8
+ VST1.8 D0,[r3]! @Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+
+ VLD1.8 D30,[r7] @pi1_sao_offset_u load
+ VADD.I8 D5,D1,D31 @band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
+
+ VDUP.8 D29,D30[1] @vdup_n_u8(pi1_sao_offset_u[1])
+ VADD.I8 D6,D2,D31 @band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
+
+ VDUP.8 D28,D30[2] @vdup_n_u8(pi1_sao_offset_u[2])
+ VADD.I8 D7,D3,D31 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
+
+ VDUP.8 D27,D30[3] @vdup_n_u8(pi1_sao_offset_u[3])
+ VADD.I8 D8,D4,D31 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
+
+ CMP r5,#28
+ VDUP.8 D26,D30[4] @vdup_n_u8(pi1_sao_offset_u[4])
+ LDR r14, gu1_table_band_idx_addr_2
+ulbl2:
+ add r14,r14,pc
+
+ VMOV.I8 D30,#16 @vdup_n_u8(16)
+ VADD.I8 D1,D5,D29 @band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
+
+ VLD1.8 D9,[r14]! @band_table_v.val[0]
+ VADD.I8 D2,D6,D28 @band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
+
+ VLD1.8 D10,[r14]! @band_table_v.val[1]
+ VADD.I8 D3,D7,D27 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
+
+ LDR r6,[sp,#48] @Loads sao_band_pos_v
+ VADD.I8 D4,D8,D26 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
+ LSL r11,r6,#3 @sao_band_pos_v
+
+ BLT SAO_BAND_POS_U_0
+
+SAO_BAND_POS_U_28: @case 28
+ VCLE.U8 D13,D4,D30 @vcle_u8(band_table.val[3], vdup_n_u8(16))
+ BNE SAO_BAND_POS_U_29
+
+ VORR.U8 D4,D4,D13 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_U
+
+SAO_BAND_POS_U_29: @case 29
+ CMP r5,#29
+
+ VCLE.U8 D14,D3,D30 @vcle_u8(band_table.val[2], vdup_n_u8(16))
+ BNE SAO_BAND_POS_U_30
+ VORR.U8 D3,D3,D14 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+ VAND.U8 D4,D4,D13 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_U
+
+SAO_BAND_POS_U_30: @case 30
+ CMP r5,#30
+
+ VCLE.U8 D15,D2,D30 @vcle_u8(band_table.val[1], vdup_n_u8(16))
+ BNE SAO_BAND_POS_U_31
+ VORR.U8 D2,D2,D15 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+ VAND.U8 D3,D3,D14 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+
+SAO_BAND_POS_U_31: @case 31
+ CMP r5,#31
+ BNE SWITCH_BREAK_U
+
+ VCLE.U8 D16,D1,D30 @vcle_u8(band_table.val[0], vdup_n_u8(16))
+ VORR.U8 D1,D1,D16 @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+ VAND.U8 D2,D2,D15 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+ B SWITCH_BREAK_U
+
+SAO_BAND_POS_U_0:
+ CMP r5,#0 @case 0
+ BNE SWITCH_BREAK_U
+
+ VCLE.U8 D16,D1,D30 @vcle_u8(band_table.val[0], vdup_n_u8(16))
+ VAND.U8 D1,D1,D16 @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_U:
+ VDUP.8 D30,r11 @band_pos_v
+ LDR r8,[sp,#56] @Loads pi1_sao_offset_v
+
+ VLD1.8 D11,[r14]! @band_table_v.val[2]
+ VADD.I8 D13,D9,D30 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
+
+ VLD1.8 D12,[r14]! @band_table_v.val[3]
+ VADD.I8 D14,D10,D30 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
+
+ VLD1.8 D25,[r8] @pi1_sao_offset_v load
+ VADD.I8 D15,D11,D30 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
+
+ VDUP.8 D29,D25[1] @vdup_n_u8(pi1_sao_offset_v[1])
+ VADD.I8 D16,D12,D30 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
+
+ VDUP.8 D28,D25[2] @vdup_n_u8(pi1_sao_offset_v[2])
+ VADD.I8 D9,D13,D29 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
+
+ VDUP.8 D27,D25[3] @vdup_n_u8(pi1_sao_offset_v[3])
+ VADD.I8 D10,D14,D28 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
+
+ VDUP.8 D26,D25[4] @vdup_n_u8(pi1_sao_offset_v[4])
+ VADD.I8 D11,D15,D27 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
+
+ VMOV.I8 D29,#16 @vdup_n_u8(16)
+ VADD.I8 D12,D16,D26 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
+ AND r12,r9,#0xf
+
+ CMP r6,#28
+ BLT SAO_BAND_POS_V_0
+
+SAO_BAND_POS_V_28: @case 28
+ VCLE.U8 D17,D12,D29 @vcle_u8(band_table.val[3], vdup_n_u8(16))
+ BNE SAO_BAND_POS_V_29
+ VORR.U8 D12,D12,D17 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_29: @case 29
+ CMP r6,#29
+
+ VCLE.U8 D18,D11,D29 @vcle_u8(band_table.val[2], vdup_n_u8(16))
+ BNE SAO_BAND_POS_V_30
+ VORR.U8 D11,D11,D18 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+ VAND.U8 D12,D12,D17 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_30: @case 30
+ CMP r6,#30
+
+ VCLE.U8 D19,D10,D29 @vcle_u8(band_table.val[1], vdup_n_u8(16))
+ BNE SAO_BAND_POS_V_31
+ VORR.U8 D10,D10,D19 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+ VAND.U8 D11,D11,D18 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_31: @case 31
+ CMP r6,#31
+ BNE SWITCH_BREAK_V
+
+ VCLE.U8 D20,D9,D29 @vcle_u8(band_table.val[0], vdup_n_u8(16))
+ VORR.U8 D9,D9,D20 @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+ VAND.U8 D10,D10,D19 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_0:
+ CMP r6,#0 @case 0
+ BNE SWITCH_BREAK_V
+
+ VCLE.U8 D20,D9,D29 @vcle_u8(band_table.val[0], vdup_n_u8(16))
+ VAND.U8 D9,D9,D20 @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_V:
+ CMP r9,#16
+ MOV r4,r0 @pu1_src_cpy
+ BLT WIDTH_RESIDUE
+
+WIDTH_LOOP:                                 @Width is assumed to be a multiple of 16
+ MOV r4,r0 @pu1_src_cpy
+ MOV r11,r10 @move ht
+ ADD r5,r4,r1
+
+HEIGHT_LOOP: @unrolled for 4 rows
+ ADD r6,r5,r1
+ VLD2.8 {D5,D6},[r4] @vld1q_u8(pu1_src_cpy)
+ ADD r7,r6,r1
+
+ VLD2.8 {D13,D14},[r5] @vld1q_u8(pu1_src_cpy)
+ VSUB.I8 D7,D5,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VLD2.8 {D17,D18},[r6] @vld1q_u8(pu1_src_cpy)
+ VSUB.I8 D8,D6,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VLD2.8 {D21,D22},[r7] @vld1q_u8(pu1_src_cpy)
+ VSUB.I8 D15,D13,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VTBX.8 D5,{D1-D4},D7 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VSUB.I8 D16,D14,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VTBX.8 D6,{D9-D12},D8 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VSUB.I8 D19,D17,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VTBX.8 D13,{D1-D4},D15 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VSUB.I8 D20,D18,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VTBX.8 D14,{D9-D12},D16 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VSUB.I8 D23,D21,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VST2.8 {D5,D6},[r4] @vst1q_u8(pu1_src_cpy, au1_cur_row)
+ VSUB.I8 D24,D22,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ SUBS r11,r11,#4 @Decrement the ht loop count by 4
+ VTBX.8 D17,{D1-D4},D19 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+
+ VST2.8 {D13,D14},[r5] @vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+ VTBX.8 D18,{D9-D12},D20 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VTBX.8 D21,{D1-D4},D23 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VTBX.8 D22,{D9-D12},D24 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+
+ VST2.8 {D17,D18},[r6],r1 @vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+ ADD r4,r6,r1
+ VST2.8 {D21,D22},[r7] @vst1q_u8(pu1_src_cpy, au1_cur_row)
+ ADD r5,r4,r1
+
+ BNE HEIGHT_LOOP
+
+ SUB r9,r9,#16 @Decrement the width loop by 16
+ ADD r0,r0,#16
+ CMP r9,#8
+ BGT WIDTH_LOOP
+ BLT END_LOOP
+ MOV r4,r0 @pu1_src_cpy
+
+WIDTH_RESIDUE:                              @Handles the residue when width is not a multiple of 16
+ ADD r5,r4,r1
+ VLD2.8 {D5,D6},[r4] @vld1q_u8(pu1_src_cpy)
+ ADD r6,r5,r1
+
+ ADD r7,r6,r1
+ VLD2.8 {D13,D14},[r5] @vld1q_u8(pu1_src_cpy)
+ VSUB.I8 D7,D5,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VLD2.8 {D17,D18},[r6] @vld1q_u8(pu1_src_cpy)
+ VSUB.I8 D8,D6,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VTBX.8 D5,{D1-D4},D7 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VSUB.I8 D15,D13,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VTBX.8 D6,{D9-D12},D8 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VSUB.I8 D16,D14,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VLD2.8 {D21,D22},[r7] @vld1q_u8(pu1_src_cpy)
+ VSUB.I8 D19,D17,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VTBX.8 D13,{D1-D4},D15 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VSUB.I8 D20,D18,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VTBX.8 D14,{D9-D12},D16 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VZIP.8 D5,D6
+
+ VTBX.8 D17,{D1-D4},D19 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VSUB.I8 D23,D21,D31 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ VST1.8 {D5},[r4] @vst1q_u8(pu1_src_cpy, au1_cur_row)
+ VZIP.8 D13,D14
+
+ VTBX.8 D18,{D9-D12},D20 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VSUB.I8 D24,D22,D30 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ VST1.8 {D13},[r5] @vst1q_u8(pu1_src_cpy, au1_cur_row)
+ SUBS r10,r10,#4 @Decrement the ht loop count by 4
+
+ VTBX.8 D21,{D1-D4},D23 @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ VZIP.8 D17,D18
+
+ VTBX.8 D22,{D9-D12},D24 @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ VST1.8 {D17},[r6],r1 @vst1q_u8(pu1_src_cpy, au1_cur_row)
+ VZIP.8 D21,D22
+
+ ADD r4,r6,r1
+ VST1.8 {D21},[r7] @vst1q_u8(pu1_src_cpy, au1_cur_row)
+ ADD r5,r4,r1
+
+ BNE WIDTH_RESIDUE
+
+END_LOOP:
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_band_offset_luma.s b/common/arm/ihevc_sao_band_offset_luma.s
new file mode 100644
index 0000000..3875377
--- /dev/null
+++ b/common/arm/ihevc_sao_band_offset_luma.s
@@ -0,0 +1,233 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_band_offset_luma.s
+@*
+@* @brief
+@* Contains function definitions for SAO band offset for luma.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM
+@* RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_band_offset_luma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ WORD32 sao_band_pos,
+@ WORD8 *pi1_sao_offset,
+@ WORD32 wd,
+@ WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => sao_band_pos
+@r6 => *pi1_sao_offset
+@r7 => wd
+@r8 => ht
+
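+@ A sketch of the table trick used below (illustrative, reconstructed from the
+@ intrinsic comments): the per-pixel band test is folded into one 32-entry
+@ table so a single VTBX handles 8 pixels at a time.
+@
+@ UWORD8 band_table[32];
+@ for(i = 0; i < 32; i++)
+@     band_table[i] = gu1_table_band_idx[i] + (sao_band_pos << 3);
+@ for(i = 0; i < 4; i++)                      /* the four signalled bands  */
+@     for(j = 0; j < 8; j++)
+@         band_table[8 * i + j] += pi1_sao_offset[i + 1];
+@ /* per pixel: VTBX with index pu1_src[x] - (sao_band_pos << 3); indices  */
+@ /* outside 0..31 leave the pixel unchanged, and the sao_band_pos 28..31  */
+@ /* cases below patch the entries that wrap past the last band            */
+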
+.text
+.p2align 2
+
+.extern gu1_table_band_idx
+.globl ihevc_sao_band_offset_luma_a9q
+
+gu1_table_band_idx_addr:
+.long gu1_table_band_idx - ulbl1 - 8
+
+ihevc_sao_band_offset_luma_a9q:
+
+ STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ LDR r8,[sp,#56] @Loads ht
+ LDR r7,[sp,#52] @Loads wd
+
+ MOV r9,r8 @Move the ht to r9 for loop counter
+ LDR r5,[sp,#44] @Loads sao_band_pos
+ ADD r10,r0,r7 @pu1_src[row * src_strd + (wd)]
+
+ LDR r4,[sp,#40] @Loads pu1_src_top_left
+ SUB r10,r10,#1 @wd-1
+ LDR r14, gu1_table_band_idx_addr
+ulbl1:
+ add r14,r14,pc
+
+SRC_LEFT_LOOP:
+ LDRB r11,[r10],r1 @Load the value
+ SUBS r9,r9,#1 @Decrement the loop counter
+ STRB r11,[r2],#1 @Store the value in pu1_src_left pointer
+ BNE SRC_LEFT_LOOP
+
+ ADD r9,r3,r7 @pu1_src_top[wd]
+ VLD1.8 D1,[r14]! @band_table.val[0]
+ LDR r6,[sp,#48] @Loads pi1_sao_offset
+
+ LSL r11,r5,#3
+ VLD1.8 D2,[r14]! @band_table.val[1]
+
+ LDRB r10,[r9,#-1]
+ VDUP.8 D31,r11 @band_pos
+ SUB r12,r8,#1 @ht-1
+
+ STRB r10,[r4] @store to pu1_src_top_left[0]
+ VLD1.8 D3,[r14]! @band_table.val[2]
+ MUL r12,r12,r1 @ht-1 * src_strd
+
+ ADD r4,r12,r0 @pu1_src[(ht - 1) * src_strd]
+ VLD1.8 D4,[r14]! @band_table.val[3]
+ MOV r9,r7 @Move the wd to r9 for loop counter
+
+SRC_TOP_LOOP: @wd is always multiple of 8
+ VLD1.8 D0,[r4]! @Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS r9,r9,#8 @Decrement the loop counter by 8
+ VST1.8 D0,[r3]! @Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+
+ VLD1.8 D30,[r6] @pi1_sao_offset load
+ VADD.I8 D5,D1,D31 @band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
+
+ VDUP.8 D29,D30[1] @vdup_n_u8(pi1_sao_offset[1])
+ VADD.I8 D6,D2,D31 @band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
+
+ VDUP.8 D28,D30[2] @vdup_n_u8(pi1_sao_offset[2])
+ VADD.I8 D7,D3,D31 @band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
+
+ VDUP.8 D27,D30[3] @vdup_n_u8(pi1_sao_offset[3])
+ VADD.I8 D8,D4,D31 @band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
+
+ VDUP.8 D26,D30[4] @vdup_n_u8(pi1_sao_offset[4])
+ VADD.I8 D1,D5,D29 @band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
+
+ VMOV.I8 D29,#16 @vdup_n_u8(16)
+ VADD.I8 D2,D6,D28 @band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))
+
+ CMP r5,#28
+ VADD.I8 D3,D7,D27 @band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
+
+ VADD.I8 D4,D8,D26 @band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
+ BLT SAO_BAND_POS_0
+
+SAO_BAND_POS_28: @case 28
+
+ VCLE.U8 D12,D4,D29 @vcle_u8(band_table.val[3], vdup_n_u8(16))
+
+ BNE SAO_BAND_POS_29
+ VORR.U8 D4,D4,D12 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK
+
+SAO_BAND_POS_29: @case 29
+ CMP r5,#29
+ VCLE.U8 D11,D3,D29 @vcle_u8(band_table.val[2], vdup_n_u8(16))
+
+ BNE SAO_BAND_POS_30
+ VORR.U8 D3,D3,D11 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+ VAND.U8 D4,D4,D12 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK
+
+SAO_BAND_POS_30: @case 30
+ CMP r5,#30
+ VCLE.U8 D10,D2,D29 @vcle_u8(band_table.val[1], vdup_n_u8(16))
+
+ BNE SAO_BAND_POS_31
+ VORR.U8 D2,D2,D10 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+ VAND.U8 D3,D3,D11 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+ B SWITCH_BREAK
+
+SAO_BAND_POS_31: @case 31
+ CMP r5,#31
+ BNE SWITCH_BREAK
+
+ VCLE.U8 D9,D1,D29 @vcle_u8(band_table.val[0], vdup_n_u8(16))
+ VORR.U8 D1,D1,D9 @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+ VAND.U8 D2,D2,D10 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+
+SAO_BAND_POS_0:
+ CMP r5,#0 @case 0
+ BNE SWITCH_BREAK
+
+ VCLE.U8 D9,D1,D29 @vcle_u8(band_table.val[0], vdup_n_u8(16))
+ VAND.U8 D1,D1,D9 @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK:
+ MOV r4,r0 @pu1_src_cpy
+ MOV r11,r8 @move ht
+ ADD r5,r4,r1
+
+HEIGHT_LOOP:
+ ADD r6,r5,r1
+ VLD1.8 D13,[r4] @au1_cur_row = vld1_u8(pu1_src_cpy)
+
+ ADD r10,r6,r1
+ VLD1.8 D15,[r5] @au1_cur_row = vld1_u8(pu1_src_cpy)
+
+ VLD1.8 D17,[r6] @au1_cur_row = vld1_u8(pu1_src_cpy)
+
+ VLD1.8 D19,[r10] @au1_cur_row = vld1_u8(pu1_src_cpy)
+ VSUB.I8 D14,D13,D31 @vsub_u8(au1_cur_row, band_pos)
+
+ VTBX.8 D13,{D1-D4},D14 @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ VSUB.I8 D16,D15,D31 @vsub_u8(au1_cur_row, band_pos)
+
+ VTBX.8 D15,{D1-D4},D16 @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ VSUB.I8 D18,D17,D31 @vsub_u8(au1_cur_row, band_pos)
+
+ VTBX.8 D17,{D1-D4},D18 @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ VSUB.I8 D20,D19,D31 @vsub_u8(au1_cur_row, band_pos)
+
+ VTBX.8 D19,{D1-D4},D20 @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ VST1.8 D13,[r4],r1 @vst1_u8(pu1_src_cpy, au1_cur_row)
+
+ VST1.8 D15,[r5] @vst1_u8(pu1_src_cpy, au1_cur_row)
+ SUBS r11,r11,#4 @Decrement the ht loop count by 4
+
+ VST1.8 D17,[r6],r1 @vst1_u8(pu1_src_cpy, au1_cur_row)
+
+ ADD r4,r6,r1
+ VST1.8 D19,[r10] @vst1_u8(pu1_src_cpy, au1_cur_row)
+ ADD r5,r4,r1
+
+ BNE HEIGHT_LOOP
+
+ SUBS r7,r7,#8 @Decrement the width loop by 8
+ ADD r0,r0,#8
+ BNE SWITCH_BREAK
+
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class0.s b/common/arm/ihevc_sao_edge_offset_class0.s
new file mode 100644
index 0000000..a9fe046
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class0.s
@@ -0,0 +1,344 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_edge_offset_class0.s
+@*
+@* @brief
+@* Contains function definitions for SAO edge offset for the 0-degree
+@* (horizontal) edge class.
+@* Functions are coded using NEON intrinsics and can be compiled using ARM
+@* RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_edge_offset_class0_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset,
+@ WORD32 wd,
+@ WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r7 => *pu1_avail
+@r8 => *pi1_sao_offset
+@r9 => wd
+@r10=> ht
+
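+@ Scalar reference for the 0-degree edge class (a sketch; names follow the
+@ intrinsic comments below): each pixel is compared against its left and
+@ right neighbours and nudged by one of the four signalled offsets:
+@
+@ sign_left  = SIGN(pu1_src[x] - pu1_src[x - 1]);   /* -1, 0 or +1 */
+@ sign_right = SIGN(pu1_src[x] - pu1_src[x + 1]);
+@ edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
+@ pu1_src[x] = CLIP3(pu1_src[x] + pi1_sao_offset[edge_idx], 0, 255);
+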
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class0_a9q:
+
+
+ STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ LDR r9,[sp,#60] @Loads wd
+
+ LDR r4,[sp,#40] @Loads pu1_src_top_left
+ VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2)
+ ADD r11,r3,r9 @pu1_src_top[wd]
+
+ LDR r10,[sp,#64] @Loads ht
+ VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0)
+ LDRB r12,[r11,#-1] @pu1_src_top[wd - 1]
+
+ LDR r7,[sp,#52] @Loads pu1_avail
+ VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ LDR r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+ add r14,r14,pc
+
+ LDR r8,[sp,#56] @Loads pi1_sao_offset
+ VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
+ STRB r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
+
+ MOV r6,r0 @pu1_src_org
+ VLD1.8 D10,[r14] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SUB r4,r10,#1 @(ht - 1)
+
+ MOV r12,r9 @Move wd to r12 for loop count
+ VLD1.8 D11,[r8] @offset_tbl = vld1_s8(pi1_sao_offset)
+ MUL r4,r4,r1 @(ht - 1) * src_strd
+
+ ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd]
+
+SRC_TOP_LOOP: @wd is always multiple of 8
+ VLD1.8 D0,[r4]! @Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS r12,r12,#8 @Decrement the loop counter by 8
+ VST1.8 D0,[r3]! @Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+ ADD r6,r6,#15 @pu1_src_org[16 - 1]
+
+ CMP r9,#16 @Compare wd with 16
+ MOV r3,r2 @pu1_src_left backup to reload later
+    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, where the loop handles the 8-pixel case
+
+ MOV r8,r9 @move wd to r8 for loop count
+
+WIDTH_LOOP_16:
+ CMP r8,r9 @if(col == wd)
+ BNE AU1_MASK_FF @jump to else part
+ LDRB r12,[r7] @pu1_avail[0]
+ VMOV.8 D8[0],r12 @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ B SKIP_AU1_MASK_FF @Skip the else part
+
+AU1_MASK_FF:
+ MOV r12,#0xFF @move -1 to r12
+ VMOV.8 D8[0],r12 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+ CMP r8,#16 @If col == 16
+ BNE SKIP_MASKING_IF_NOT16 @If not skip masking
+ LDRB r12,[r7,#1] @pu1_avail[1]
+ VMOV.8 D9[7],r12 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+ MOV r12,r0 @pu1_src_cpy = pu1_src
+ MOV r4,r10 @move ht to r4 for loop count
+
+PU1_SRC_LOOP:
+    LDRB        r11,[r2]                    @load pu1_src_left (ht - row = 0 on the first pass; the pointer is advanced as rows are stored)
+ VLD1.8 D12,[r12]! @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D13,[r12], r1 @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+ SUB r5,r9,r8 @wd - col
+
+ SUB r14,r10,r4 @ht - row
+ VMOV.8 D15[7],r11 @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ MUL r14,r14,r1 @(ht - row) * src_strd
+
+ VLD1.8 D26,[r12]! @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D27,[r12] @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+ VEXT.8 Q7,Q7,Q6,#15 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ ADD r5,r14,r5 @(ht - row) * src_strd + (wd - col)
+
+ LDRB r11,[r2, #1] @II Iteration load pu1_src_left since ht - row + 1 =1
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ LDRB r14,[r6,r5] @pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+
+ SUB r4,r4,#1
+ VMOV.8 D29[7],r11 @II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ SUB r12,r12,r1 @Decrement the pu1_src pointer by src_strd
+ VSUB.I8 Q10,Q9,Q8 @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ STRB r14,[r2],#1 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ LDRB r11,[r12,#16] @pu1_src_cpy[16]
+ VEXT.8 Q14,Q14,Q13,#15 @II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ SUB r5,r9,r8 @II wd - col
+
+ ADD r12,r12,r1 @Increment the pu1_src pointer by src_strd
+ VMOV.8 D14[0],r11 @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ VCGT.U8 Q15,Q13,Q14 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ LDRB r11,[r12,#16] @II pu1_src_cpy[16]
+ VEXT.8 Q7,Q6,Q7,#1 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+ SUB r14,r10,r4 @II ht - row
+
+ VCLT.U8 Q0,Q13,Q14 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VMOV.8 D28[0],r11 @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ SUB r12,r12,r1 @Decrement the pu1_src pointer by src_strd
+
+ MUL r14,r14,r1 @II (ht - row) * src_strd
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ ADD r5,r14,r5 @II (ht - row) * src_strd + (wd - col)
+
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VEXT.8 Q14,Q13,Q14,#1 @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+ LDRB r14,[r6,r5] @II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+ VSUB.I8 Q11,Q9,Q8 @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUBS r4,r4,#1 @Decrement row by 1
+
+ VADD.I8 Q7,Q1,Q10 @edge_idx = vaddq_s8(const_2, sign_left)
+ STRB r14,[r2],#1 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ VADD.I8 Q7,Q7,Q11 @edge_idx = vaddq_s8(edge_idx, sign_right)
+ VMOVL.U8 Q9,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VSUB.I8 Q10,Q0,Q15 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D14,{D10},D14 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VCGT.U8 Q15,Q13,Q14 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ VCLT.U8 Q0,Q13,Q14 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VTBL.8 D15,{D10},D15 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VSUB.I8 Q11,Q0,Q15 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VAND Q7,Q7,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+ VTBL.8 D16,{D11},D14 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q0,D26 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADD.I8 Q14,Q1,Q10 @II edge_idx = vaddq_s8(const_2, sign_left)
+ VADD.I8 Q14,Q14,Q11 @II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ VADDW.S8 Q9,Q9,D16 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D28,{D10},D28 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q9,Q9,Q2 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VTBL.8 D29,{D10},D29 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VMIN.U16 Q9,Q9,Q3 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VAND Q14,Q14,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+ VTBL.8 D17,{D11},D15 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+ VMOVL.U8 Q7,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VTBL.8 D30,{D11},D28 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VADDW.S8 Q7,Q7,D17 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q7,Q7,Q2 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VTBL.8 D31,{D11},D29 @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMIN.U16 Q7,Q7,Q3 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D18,Q9 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VADDW.S8 Q0,Q0,D30 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMOVN.I16 D19,Q7 @vmovn_s16(pi2_tmp_cur_row.val[1])
+ VMAX.S16 Q0,Q0,Q2 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMOVL.U8 Q14,D27 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VMIN.U16 Q0,Q0,Q3 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D0,Q0 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+ VADDW.S8 Q14,Q14,D31 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VST1.8 {D18,D19},[r12],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMIN.U16 Q14,Q14,Q3 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D1,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {D0,D1},[r12],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP @If not equal jump to the inner loop
+
+ ADD r0,r0,#16 @pu1_src += 16
+
+ SUBS r8,r8,#16 @Decrement column by 16
+ CMP r8,#8 @Check whether residue remains
+ MOV r2,r3 @Reload pu1_src_left
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 @If not equal jump to width_loop
+ BLT END_LOOPS @Jump to end function
+
+WIDTH_RESIDUE:
+ SUB r6,r6,#15
+ AND r8,r9,#0xF @wd_rem = wd & 0xF
+ CMP r8,#0 @Residue check
+ BEQ END_LOOPS @No Residue jump to end function
+
+ CMP r8,r9 @if(wd_rem == wd)
+ BNE AU1_MASK_FF_RESIDUE @jump to else part
+ LDRB r12,[r7] @pu1_avail[0]
+ VMOV.8 D8[0],r12 @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ B SKIP_AU1_MASK_FF_RESIDUE @Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+    MOV         r12,#0xFF                   @move -1 to r12
+ VMOV.8 D8[0],r12 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+ LDRB r11,[r7,#1] @pu1_avail[1]
+ SUB r5,r9,#1 @wd - 1
+
+ MOV r4,r10 @move ht to r4 for loop count
+ VMOV.8 D8[7],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+ MOV r12,r0 @pu1_src_cpy = pu1_src
+
+PU1_SRC_LOOP_RESIDUE:
+ VLD1.8 D12,[r12]! @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D13,[r12] @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+ LDRB r11,[r2] @load pu1_src_left
+ VMOV.8 D15[7],r11 @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ VEXT.8 Q7,Q7,Q6,#15 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VSUB.I8 Q10,Q9,Q8 @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ LDRB r11,[r12,#16] @pu1_src_cpy[16]
+ VMOV.8 D14[0],r11 @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ VEXT.8 Q7,Q6,Q7,#1 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VSUB.I8 Q11,Q9,Q8 @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q12,Q1,Q10 @edge_idx = vaddq_s8(const_2, sign_left)
+ VADD.I8 Q12,Q12,Q11 @edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ VTBL.8 D24,{D10},D24 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D25,{D10},D25 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q12,Q12,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q10,Q11 @sign_left = vnegq_s8(sign_right)
+ VEXT.8 Q10,Q10,Q11,#15 @sign_left = vextq_s8(sign_left, sign_left, 15)
+
+ VTBL.8 D26,{D11},D24 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D26 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q3 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SUB r14,r10,r4 @ht - row
+ MUL r14,r14,r1 @(ht - row) * src_strd
+ ADD r11,r14,r5 @(ht - row) * src_strd + (wd - 1)
+ LDRB r14,[r6, r11] @pu1_src_org[(ht - row) * src_strd + (wd - 1)]
+ STRB r14,[r2],#1 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ VST1.8 {D28},[r12],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ SUBS r4,r4,#1 @Decrement row by 1
+ BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop
+
+END_LOOPS:
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class0_chroma.s b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
new file mode 100644
index 0000000..1dd56f6
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class0_chroma.s
@@ -0,0 +1,431 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_edge_offset_class0_chroma.s
+@*
+@* @brief
+@* Contains function definitions for SAO edge offset for the 0-degree
+@* (horizontal) edge class for chroma (interleaved CbCr).
+@* Functions are coded using NEON intrinsics and can be compiled using ARM
+@* RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_edge_offset_class0_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset_u,
+@ WORD8 *pi1_sao_offset_v,
+@                              WORD32 wd,
+@                              WORD32 ht)
+@
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r7 => *pu1_avail
+@r8 => *pi1_sao_offset_u
+@r5 => *pi1_sao_offset_v
+@r9 => wd
+@r10=> ht
+
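+@ Same 0-degree pattern as the luma version, but the source is interleaved
+@ CbCr, so (a sketch; names illustrative) the same-component neighbours sit
+@ two bytes away and U and V use separate offset tables:
+@
+@ edge_idx = gi1_table_edge_idx[2 + SIGN(p[x] - p[x - 2])
+@                                 + SIGN(p[x] - p[x + 2])];
+@ offset   = (x & 1) ? pi1_sao_offset_v[edge_idx]
+@                    : pi1_sao_offset_u[edge_idx];
+@ p[x]     = CLIP3(p[x] + offset, 0, 255);
+@
+@ The VUZP/VZIP pairs below de-interleave the edge indices so the U lanes can
+@ index pi1_sao_offset_u and the V lanes pi1_sao_offset_v, then re-interleave.
+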
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_chroma_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class0_chroma_a9q:
+
+
+ STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ LDR r9,[sp,#64] @Loads wd
+
+ LDR r4,[sp,#40] @Loads pu1_src_top_left
+ ADD r11,r3,r9 @pu1_src_top[wd]
+
+ LDR r10,[sp,#68] @Loads ht
+ VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2)
+ LDRH r12,[r11,#-2] @pu1_src_top[wd - 1]
+
+ LDR r7,[sp,#52] @Loads pu1_avail
+ VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0)
+ STRH r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
+
+ LDR r8,[sp,#56] @Loads pi1_sao_offset_u
+ VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ SUB r4,r10,#1 @(ht - 1)
+
+ LDR r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+ add r14,r14,pc
+ VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
+ MUL r4,r4,r1 @(ht - 1) * src_strd
+
+ LDR r5,[sp,#60] @Loads pi1_sao_offset_v
+ VLD1.8 D11,[r8] @offset_tbl = vld1_s8(pi1_sao_offset_u)
+ ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd]
+
+ MOV r6,r0 @pu1_src_org
+ VLD1.8 D10,[r14] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOV r12,r9 @Move wd to r12 for loop count
+
+SRC_TOP_LOOP: @wd is always multiple of 8
+ VLD1.8 D0,[r4]! @Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS r12,r12,#8 @Decrement the loop counter by 8
+ VST1.8 D0,[r3]! @Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+ ADD r6,r6,#14 @pu1_src_org[14]
+
+ MOV r3,r2 @pu1_src_left backup to reload later
+ VLD1.8 D0,[r5] @offset_tbl = vld1_s8(pi1_sao_offset_v)
+ CMP r9,#16 @Compare wd with 16
+
+    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, where the loop handles the 8-pixel case
+
+ MOV r8,r9 @move wd to r8 for loop count
+
+WIDTH_LOOP_16:
+ CMP r8,r9 @if(col == wd)
+ BNE AU1_MASK_FF @jump to else part
+ LDRB r12,[r7] @pu1_avail[0]
+ VMOV.8 D8[0],r12 @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ VMOV.8 D8[1],r12 @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+ B SKIP_AU1_MASK_FF @Skip the else part
+
+AU1_MASK_FF:
+ MOV r12,#-1 @move -1 to r12
+ VMOV.16 D8[0],r12 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+ CMP r8,#16 @If col == 16
+ BNE SKIP_MASKING_IF_NOT16 @If not skip masking
+ LDRB r12,[r7,#1] @pu1_avail[1]
+ VMOV.8 D9[6],r12 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ VMOV.8 D9[7],r12 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+ MOV r12,r0 @pu1_src_cpy = pu1_src
+ MOV r4,r10 @move ht to r4 for loop count
+
+PU1_SRC_LOOP:
+    LDRH        r11,[r2]                    @load pu1_src_left (ht - row = 0 on the first pass; the pointer is advanced as rows are stored)
+ VLD1.8 D12,[r12]! @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D13,[r12],r1 @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+ SUB r5,r9,r8 @wd - col
+
+ SUB r14,r10,r4 @ht - row
+ VMOV.16 D15[3],r11 @vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+ MUL r14,r14,r1 @(ht - row) * src_strd
+
+ VLD1.8 D30,[r12]! @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D31,[r12] @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+ VEXT.8 Q7,Q7,Q6,#14 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+ SUB r12,r12,r1
+
+ LDRH r11,[r2,#2] @II load pu1_src_left since ht - row =0
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ ADD r5,r14,r5 @(ht - row) * src_strd + (wd - col)
+
+ VMOV.16 D29[3],r11 @II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ LDRH r14,[r6,r5] @pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+ VSUB.U8 Q10,Q9,Q8 @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB r4,r4,#1
+
+ LDRB r11,[r12,#16] @pu1_src_cpy[16]
+ VEXT.8 Q14,Q14,Q15,#14 @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+ VMOV.8 D14[0],r11 @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ VCGT.U8 Q13,Q15,Q14 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ LDRB r11,[r12,#17] @pu1_src_cpy[17]
+ VCLT.U8 Q12,Q15,Q14 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ STRH r14,[r2],#2 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ ADD r12,r12,r1
+ VMOV.8 D14[1],r11 @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ LDRB r11,[r12,#16] @II pu1_src_cpy[16]
+
+ VEXT.8 Q7,Q6,Q7,#2 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+ VMOV.8 D28[0],r11 @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+ LDRB r11,[r12,#17] @II pu1_src_cpy[17]
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ SUB r12,r12,r1
+
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VMOV.8 D28[1],r11 @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+
+ VSUB.U8 Q11,Q9,Q8 @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VEXT.8 Q14,Q15,Q14,#2 @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+ VADD.U8 Q7,Q1,Q10 @edge_idx = vaddq_s8(const_2, sign_left)
+
+ VADD.U8 Q7,Q7,Q11 @edge_idx = vaddq_s8(edge_idx, sign_right)
+ VTBL.8 D14,{D10},D14 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VSUB.U8 Q10,Q12,Q13 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VCGT.U8 Q13,Q15,Q14 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VTBL.8 D15,{D10},D15 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VCLT.U8 Q12,Q15,Q14 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ VAND Q7,Q7,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+ VUZP.8 D14,D15
+
+ VSUB.U8 Q11,Q12,Q13 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D16,{D11},D14 @offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ VADD.U8 Q12,Q1,Q10 @II edge_idx = vaddq_s8(const_2, sign_left)
+
+ VMOVL.U8 Q9,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D17,{D0},D15
+ VADD.U8 Q12,Q12,Q11 @II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ VZIP.S8 D16,D17
+ VTBL.8 D24,{D10},D24 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q6,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VADDW.S8 Q9,Q9,D16 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D25,{D10},D25 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VMAX.S16 Q9,Q9,Q2 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VAND Q12,Q12,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+ VMIN.U16 Q9,Q9,Q3 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VUZP.8 D24,D25 @II
+
+ VADDW.S8 Q6,Q6,D17 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VTBL.8 D26,{D11},D24 @II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ VMAX.S16 Q6,Q6,Q2 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMIN.U16 Q6,Q6,Q3 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ VTBL.8 D27,{D0},D25 @II
+ VMOVN.I16 D14,Q9 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VMOVN.I16 D15,Q6 @vmovn_s16(pi2_tmp_cur_row.val[1])
+ VZIP.S8 D26,D27 @II
+
+ SUB r5,r9,r8 @II wd - col
+ VMOVL.U8 Q14,D30 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SUB r14,r10,r4 @II ht - row
+
+ MUL r14,r14,r1 @II (ht - row) * src_strd
+ VADDW.S8 Q14,Q14,D26 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ ADD r5,r14,r5 @II (ht - row) * src_strd + (wd - col)
+
+ LDRH r14,[r6,r5] @II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+ VMAX.S16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ STRH r14,[r2],#2 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+ VMIN.U16 Q14,Q14,Q3 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVL.U8 Q15,D31 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VADDW.S8 Q15,Q15,D27 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VST1.8 {D14,D15},[r12],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMAX.S16 Q15,Q15,Q2 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SUBS r4,r4,#1 @Decrement row by 1
+ VMIN.U16 Q15,Q15,Q3 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D29,Q15 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {D28,D29},[r12],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP @If not equal jump to the inner loop
+
+ ADD r0,r0,#16 @pu1_src += 16
+
+ SUBS r8,r8,#16 @Decrement column by 16
+ CMP r8,#8 @Check whether residue remains
+ MOV r2,r3 @Reload pu1_src_left
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 @If not equal jump to width_loop
+ BLT END_LOOPS @Jump to end function
+
+WIDTH_RESIDUE:
+ SUB r6,r6,#14
+ AND r8,r9,#0xF @wd_rem = wd & 0xF
+ CMP r8,#0 @Residue check
+ BEQ END_LOOPS @No Residue jump to end function
+
+ CMP r8,r9 @if(wd_rem == wd)
+ BNE AU1_MASK_FF_RESIDUE @jump to else part
+ LDRB r12,[r7] @pu1_avail[0]
+ VMOV.8 D8[0],r12 @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+ B SKIP_AU1_MASK_FF_RESIDUE @Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+ MOV r12,#-1 @move -1 to r12
+ VMOV.16 D8[0],r12 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+ LDRB r12,[r7,#1] @pu1_avail[1]
+    VMOV.8      D8[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    VMOV.8      D8[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+ MOV r12,r0 @pu1_src_cpy = pu1_src
+ MOV r4,r10 @move ht to r4 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+ LDRH r11,[r2] @load pu1_src_left
+ VLD1.8 D12,[r12]! @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D13,[r12],r1 @pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+ SUB r5,r9,#2 @wd - 2
+
+ SUB r14,r10,r4 @(ht - row)
+    VMOV.16     D15[3],r11                  @vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+ LSL r14,r14,#1 @(ht - row) * 2
+
+ VLD1.8 D30,[r12]! @II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ VLD1.8 D31,[r12] @II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB r12,#8
+    VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+ SUB r12,r12,r1
+
+ LDRH r11,[r2,#2] @II load pu1_src_left
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ MUL r14,r14,r1 @(ht - row) * 2 * src_strd
+
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    VMOV.16     D29[3],r11                  @II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
+
+ LDRB r11,[r12,#16] @pu1_src_cpy[16]
+ VSUB.U8 Q10,Q9,Q8 @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ ADD r5,r14,r5 @(ht - row) * 2 * src_strd + (wd - 2)
+
+ VMOV.8 D14[0],r11 @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+ LDRB r11,[r12,#17] @pu1_src_cpy[17]
+ VCGT.U8 Q13,Q15,Q14 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ LDRH r14,[r6, r5] @pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)]
+
+ VMOV.8 D14[1],r11 @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ VCLT.U8 Q12,Q15,Q14 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ ADD r12,r12,r1
+
+ STRH r14,[r2],#2 @pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+    VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+ LDRB r11,[r12,#16] @II pu1_src_cpy[16]
+
+ VCGT.U8 Q8,Q6,Q7 @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VMOV.8 D28[0],r11 @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+ LDRB r11,[r12,#17] @II pu1_src_cpy[17]
+ VCLT.U8 Q9,Q6,Q7 @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ SUB r4,r4,#1 @II Decrement row by 1
+
+ VSUB.U8 Q11,Q9,Q8 @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOV.8 D28[1],r11 @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ SUB r12,r12,r1
+
+ VADD.U8 Q7,Q1,Q10 @edge_idx = vaddq_s8(const_2, sign_left)
+    VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+ VADD.U8 Q7,Q7,Q11 @edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ VSUB.U8 Q10,Q12,Q13 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D14,{D10},D14 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VCGT.U8 Q13,Q15,Q14 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ VCLT.U8 Q12,Q15,Q14 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ VTBL.8 D15,{D10},D15 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VSUB.U8 Q11,Q12,Q13 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VAND Q7,Q7,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+ VUZP.8 D14,D15
+
+ VADD.U8 Q14,Q1,Q10 @II edge_idx = vaddq_s8(const_2, sign_left)
+ VTBL.8 D16,{D11},D14 @offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ VADD.U8 Q14,Q14,Q11 @II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ VMOVL.U8 Q9,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D17,{D0},D15
+ VMOVL.U8 Q12,D30 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VZIP.S8 D16,D17
+ VTBL.8 D28,{D10},D28 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VADDW.S8 Q9,Q9,D16 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q9,Q9,Q2 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VTBL.8 D29,{D10},D29 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VMIN.U16 Q9,Q9,Q3 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D18,Q9 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VAND Q14,Q14,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ SUB r5,r9,#2 @II wd - 2
+ VUZP.8 D28,D29 @II
+ SUB r14,r10,r4 @II (ht - row)
+
+ LSL r14,r14,#1 @II (ht - row) * 2
+ VTBL.8 D26,{D11},D28 @II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ MUL r14,r14,r1 @II (ht - row) * 2 * src_strd
+
+ ADD r5,r14,r5 @II (ht - row) * 2 * src_strd + (wd - 2)
+ VTBL.8 D27,{D0},D29 @II
+ LDRH r14,[r6, r5] @II pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)]
+
+ VZIP.S8 D26,D27 @II
+ VST1.8 {D18},[r12],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ STRH r14,[r2],#2 @II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+ VADDW.S8 Q12,Q12,D26 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SUBS r4,r4,#1 @Decrement row by 1
+
+ VMAX.S16 Q12,Q12,Q2 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q12,Q12,Q3 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D28,Q12 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D28},[r12],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop
+
+END_LOOPS:
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class1.s b/common/arm/ihevc_sao_edge_offset_class1.s
new file mode 100644
index 0000000..aa1337f
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class1.s
@@ -0,0 +1,371 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_edge_offset_class1.s
+@*
+@* @brief
+@* Contains the function definition for SAO (sample adaptive offset) edge
+@* offset, class 1 (vertical). The function is coded in ARM NEON assembly,
+@* with the equivalent NEON intrinsics noted in the comments, and can be
+@* compiled using RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_edge_offset_class1_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset,
+@ WORD32 wd,
+@ WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => *pu1_avail
+@r6 => *pi1_sao_offset
+@r7 => wd
+@r8 => ht
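+@
+@ A scalar reference sketch (illustration only; SIGN() and CLIP3() are
+@ assumed helper macros, bit_depth is taken as 8 and the pu1_avail border
+@ handling is omitted) of the class 1 (vertical) edge offset that the NEON
+@ code below vectorises:
+@
+@ for (row = 0; row < ht; row++)
+@     for (col = 0; col < wd; col++) {
+@         UWORD8 *p = pu1_src + row * src_strd + col;
+@         WORD32 e = 2 + SIGN(p[0] - p[-src_strd])
+@                      + SIGN(p[0] - p[src_strd]);
+@         e = gi1_table_edge_idx[e];
+@         if (0 != e)
+@             p[0] = CLIP3(p[0] + pi1_sao_offset[e], 0, 255);
+@     }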
+
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class1_a9q:
+
+
+ STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ LDR r7,[sp,#60] @Loads wd
+ LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r5,[sp,#52] @Loads pu1_avail
+ LDR r6,[sp,#56] @Loads pi1_sao_offset
+ LDR r8,[sp,#64] @Loads ht
+
+ SUB r9,r7,#1 @wd - 1
+ LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
+ STRB r10,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1]
+ ADD r10,r0,r9 @pu1_src[row * src_strd + wd - 1]
+ MOV r11,r2 @Move pu1_src_left pointer to r11
+ MOV r12,r8 @Move ht to r12 for loop count
+SRC_LEFT_LOOP:
+ LDRB r14,[r10],r1 @Load pu1_src[row * src_strd + wd - 1]
+ STRB r14,[r11],#1 @pu1_src_left[row]
+ SUBS r12,#1 @Decrement the loop count
+ BNE SRC_LEFT_LOOP @If not equal to 0 jump to the src_left_loop
+
+ SUB r12,r8,#1 @ht - 1
+ MUL r12,r12,r1 @(ht - 1) * src_strd
+ ADD r12,r12,r0 @pu1_src[(ht - 1) * src_strd]
+
+ LDRB r4,[r5,#2] @pu1_avail[2]
+ CMP r4,#0 @0 == pu1_avail[2]
+ ADDEQ r0,r0,r1 @pu1_src += src_strd
+ SUBEQ r8,r8,#1 @ht--
+
+ LDRB r4,[r5,#3] @pu1_avail[3]
+ CMP r4,#0 @0 == pu1_avail[3]
+ SUBEQ r8,r8,#1 @ht--
+
+ VMOV.I8 Q0,#2 @const_2 = vdupq_n_s8(2)
+ VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
+ VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ LDR r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+ add r14,r14,pc
+ VLD1.8 D6,[r14] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VLD1.8 D7,[r6] @offset_tbl = vld1_s8(pi1_sao_offset)
+
+ CMP r7,#16 @Compare wd with 16
+ BLT WIDTH_RESIDUE @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+ LDRB r4,[r5,#2] @pu1_avail[2]
+ CMP r4,#0 @0 == pu1_avail[2]
+ SUBEQ r9,r0,r1 @pu1_src -= src_strd
+ MOVNE r9,r3 @*pu1_src_top
+
+ MOV r10,r0 @*pu1_src
+
+ VLD1.8 D8,[r9]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D9,[r9]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D10,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D11,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+
+ VLD1.8 D30,[r12]! @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ VLD1.8 D31,[r12]! @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ VCGT.U8 Q6,Q5,Q4 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ VST1.8 {Q15},[r3]! @vst1q_u8(pu1_src_top[col])
+ VCLT.U8 Q7,Q5,Q4 @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ VSUB.U8 Q8,Q7,Q6 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r11,r8 @move ht to r11 for loop count
+
+PU1_SRC_LOOP:
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ ADD r6,r10,r1 @II Iteration *pu1_src + src_strd
+
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VLD1.8 D30,[r6]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r6] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r6,#8
+
+ VCLT.U8 Q7,Q5,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB r10,r10,r1
+
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOVL.U8 Q13,D18 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADD.I8 Q6,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VMOVL.U8 Q14,D19 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VADD.I8 Q6,Q6,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VCGT.U8 Q11,Q9,Q15 @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ VNEG.S8 Q8,Q10 @sign_up = vnegq_s8(sign_down)
+ VTBL.8 D12,{D6},D12 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VCLT.U8 Q12,Q9,Q15 @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ VSUB.U8 Q4,Q12,Q11 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D13,{D6},D13 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VADD.I8 Q11,Q0,Q8 @II edge_idx = vaddq_s8(const_2, sign_up)
+
+
+ VNEG.S8 Q8,Q4 @II sign_up = vnegq_s8(sign_down)
+ VTBL.8 D12,{D7},D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VADD.I8 Q11,Q11,Q4 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+
+ VMOVL.U8 Q10,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D22,{D6},D22 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VADDW.S8 Q10,Q10,D12 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VTBL.8 D23,{D6},D23 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+
+ VMOVL.U8 Q4,D11 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VTBL.8 D13,{D7},D13 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMOV Q5,Q15 @II pu1_cur_row = pu1_next_row
+
+ VADDW.S8 Q4,Q4,D13 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VTBL.8 D24,{D7},D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q4,Q4,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMIN.U16 Q4,Q4,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ VTBL.8 D25,{D7},D23 @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VADDW.S8 Q13,Q13,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMOVN.I16 D21,Q4 @vmovn_s16(pi2_tmp_cur_row.val[1])
+ VADDW.S8 Q14,Q14,D25 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMAX.S16 Q14,Q14,Q1 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ VST1.8 {Q10},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMOVN.I16 D30,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUBS r11,r11,#2 @II Decrement the ht loop count by 2
+ VMOVN.I16 D31,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q15},[r10],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BEQ PU1_SRC_LOOP_END @Exit the row loop when the count reaches 0 (ht was reduced when pu1_avail[2] or pu1_avail[3] is 0)
+ CMP r11,#1 @checking any residue remains
+ BGT PU1_SRC_LOOP @If more than one row remains, jump to PU1_SRC_LOOP
+
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VCLT.U8 Q7,Q5,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB r10,r10,r1
+
+ VADD.I8 Q11,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q11,Q11,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D22,{D6},D22 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D23,{D6},D23 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VTBL.8 D24,{D7},D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q13,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q13,Q13,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q13,Q13,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VTBL.8 D25,{D7},D23 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMOVL.U8 Q14,D11 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D30,Q13 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D31,Q14 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q15},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+ VMOV Q5,Q9 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#16 @Decrement the wd loop count by 16
+ CMP r7,#8 @Check whether residue remains
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 @If more 16-pixel columns remain, jump to WIDTH_LOOP_16
+ BLT END_LOOPS @Jump to end function
+
+
+WIDTH_RESIDUE:
+ LDRB r4,[r5,#2] @pu1_avail[2]
+ CMP r4,#0 @0 == pu1_avail[2]
+ SUBEQ r9,r0,r1 @pu1_src -= src_strd
+ MOVNE r9,r3 @*pu1_src_top
+ MOV r10,r0
+
+ VLD1.8 D8,[r9]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D9,[r9]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D10,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D11,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+
+ VLD1.8 D30,[r12] @vld1_u8(pu1_src[(ht - 1) * src_strd])
+ VST1.8 {D30},[r3] @vst1_u8(pu1_src_top[col])
+
+ VCGT.U8 Q6,Q5,Q4 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ VCLT.U8 Q7,Q5,Q4 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q8,Q7,Q6 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r11,r8 @move ht to r11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ ADD r6,r10,r1 @II Iteration *pu1_src + src_strd
+
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VLD1.8 D30,[r6]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r6] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r6,#8
+
+ VCLT.U8 Q7,Q5,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB r10,r10,r1
+
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOVL.U8 Q13,D18 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADD.I8 Q6,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VCGT.U8 Q11,Q9,Q15 @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ VADD.I8 Q6,Q6,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VCLT.U8 Q12,Q9,Q15 @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ VNEG.S8 Q8,Q10 @sign_up = vnegq_s8(sign_down)
+ VTBL.8 D12,{D6},D12 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VSUB.U8 Q10,Q12,Q11 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q11,Q0,Q8 @II edge_idx = vaddq_s8(const_2, sign_up)
+ VTBL.8 D12,{D7},D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q8,Q10 @II sign_up = vnegq_s8(sign_down)
+
+ VADD.I8 Q11,Q11,Q10 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+ VMOVL.U8 Q10,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADDW.S8 Q10,Q10,D12 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D22,{D6},D22 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VTBL.8 D24,{D7},D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VADDW.S8 Q13,Q13,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOV Q5,Q15 @II pu1_cur_row = pu1_next_row
+ VST1.8 {D20},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMOVN.I16 D30,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SUBS r11,r11,#2 @Decrement the ht loop count by 2
+ VST1.8 {D30},[r10],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BEQ END_LOOPS
+ CMP r11,#1
+ BGT PU1_SRC_LOOP_RESIDUE @If more than one row remains, jump to PU1_SRC_LOOP_RESIDUE
+
+
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VCGT.U8 Q7,Q9,Q5 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB r10,r10,r1
+
+ VADD.I8 Q11,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q11,Q11,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D22,{D6},D22 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ VTBL.8 D24,{D7},D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q13,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q13,Q13,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q13,Q13,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D30,Q13 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class1_chroma.s b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
new file mode 100644
index 0000000..09d925f
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class1_chroma.s
@@ -0,0 +1,407 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_edge_offset_class1_chroma.s
+@*
+@* @brief
+@* Contains the function definition for SAO (sample adaptive offset) edge
+@* offset, class 1 (vertical), for interleaved chroma (UV). The function is
+@* coded in ARM NEON assembly, with the equivalent NEON intrinsics noted in
+@* the comments, and can be compiled using RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_edge_offset_class1_chroma_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset_u,
+@ WORD8 *pi1_sao_offset_v,
+@ WORD32 wd,
+@ WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => *pu1_avail
+@r6 => *pi1_sao_offset_u
+@r7 => *pi1_sao_offset_v
+@r8 => wd
+@r9 => ht
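+@
+@ A scalar reference sketch (illustration only; SIGN() and CLIP3() are
+@ assumed helpers, bit_depth is taken as 8 and the pu1_avail border
+@ handling is omitted) of the chroma variant: U and V samples are
+@ interleaved, so even bytes take offsets from pi1_sao_offset_u and odd
+@ bytes from pi1_sao_offset_v, which is why the NEON code below VUZPs the
+@ edge indices, does two VTBL lookups and VZIPs the offsets back together:
+@
+@ for (row = 0; row < ht; row++)
+@     for (col = 0; col < wd; col++) {
+@         UWORD8 *p = pu1_src + row * src_strd + col;
+@         WORD32 e = 2 + SIGN(p[0] - p[-src_strd])
+@                      + SIGN(p[0] - p[src_strd]);
+@         e = gi1_table_edge_idx[e];
+@         if (0 != e) {
+@             WORD8 ofs = (col & 1) ? pi1_sao_offset_v[e]
+@                                   : pi1_sao_offset_u[e];
+@             p[0] = CLIP3(p[0] + ofs, 0, 255);
+@         }
+@     }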
+
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_chroma_a9q
+
+gi1_table_edge_idx_addr:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+ihevc_sao_edge_offset_class1_chroma_a9q:
+
+
+ STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments
+ LDR r4,[sp,#40] @Loads pu1_src_top_left
+ LDR r5,[sp,#52] @Loads pu1_avail
+ LDR r6,[sp,#56] @Loads pi1_sao_offset_u
+ LDR r7,[sp,#60] @Loads pi1_sao_offset_v
+ LDR r8,[sp,#64] @Loads wd
+ LDR r9,[sp,#68] @Loads ht
+
+ SUB r10,r8,#2 @wd - 2
+ LDRH r11,[r3,r10] @pu1_src_top[wd - 2]
+ STRH r11,[r4] @*pu1_src_top_left = pu1_src_top[wd - 2]
+ ADD r11,r0,r10 @pu1_src[row * src_strd + wd - 2]
+ MOV r12,r2 @Move pu1_src_left pointer to r12
+ MOV r14,r9 @Move ht to r14 for loop count
+SRC_LEFT_LOOP:
+ LDRH r10,[r11],r1 @Load pu1_src[row * src_strd + wd - 2]
+ STRH r10,[r12],#2 @pu1_src_left[row]
+ SUBS r14,#1 @Decrement the loop count
+ BNE SRC_LEFT_LOOP @If not equal to 0 jump to the src_left_loop
+
+ SUB r12,r9,#1 @ht - 1
+ MUL r12,r12,r1 @(ht - 1) * src_strd
+ ADD r12,r12,r0 @pu1_src[(ht - 1) * src_strd]
+
+ LDRB r4,[r5,#2] @pu1_avail[2]
+ CMP r4,#0 @0 == pu1_avail[2]
+ ADDEQ r0,r0,r1 @pu1_src += src_strd
+ SUBEQ r9,r9,#1 @ht--
+
+ LDRB r4,[r5,#3] @pu1_avail[3]
+ CMP r4,#0 @0 == pu1_avail[3]
+ SUBEQ r9,r9,#1 @ht--
+
+ VMOV.I8 Q0,#2 @const_2 = vdupq_n_s8(2)
+ VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
+ VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ LDR r14, gi1_table_edge_idx_addr @table pointer
+ulbl1:
+ add r14,r14,pc
+ VLD1.8 D6,[r14] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VLD1.8 D7,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+ VLD1.8 D8,[r7] @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+
+ CMP r8,#16 @Compare wd with 16
+ BLT WIDTH_RESIDUE @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+ LDRB r4,[r5,#2] @pu1_avail[2]
+ CMP r4,#0 @0 == pu1_avail[2]
+ SUBEQ r11,r0,r1 @pu1_src -= src_strd
+ MOVNE r11,r3 @*pu1_src_top
+
+ MOV r10,r0 @*pu1_src
+
+ VLD1.8 D28,[r11]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D29,[r11]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D10,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D11,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+
+ VLD1.8 D30,[r12]! @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ VLD1.8 D31,[r12]! @vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ VCGT.U8 Q6,Q5,Q14 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ VST1.8 {Q15},[r3]! @vst1q_u8(pu1_src_top[col])
+ VCLT.U8 Q7,Q5,Q14 @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ VSUB.U8 Q8,Q7,Q6 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r11,r9 @move ht to r11 for loop count
+
+PU1_SRC_LOOP:
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ ADD r6,r10,r1 @II Iteration *pu1_src + src_strd
+
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VLD1.8 D30,[r6]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r6] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r6,#8
+
+ VCLT.U8 Q7,Q5,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB r10,r10,r1
+
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOVL.U8 Q13,D18 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADD.I8 Q6,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VMOVL.U8 Q14,D19 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VADD.I8 Q6,Q6,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VCGT.U8 Q11,Q9,Q15 @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ VNEG.S8 Q8,Q10 @sign_up = vnegq_s8(sign_down)
+ VTBL.8 D12,{D6},D12 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VCLT.U8 Q12,Q9,Q15 @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ VSUB.U8 Q14,Q12,Q11 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D13,{D6},D13 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VADD.I8 Q11,Q0,Q8 @II edge_idx = vaddq_s8(const_2, sign_up)
+
+
+ VUZP.8 D12,D13
+ VNEG.S8 Q8,Q14 @II sign_up = vnegq_s8(sign_down)
+ VTBL.8 D12,{D7},D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VADD.I8 Q11,Q11,Q14 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VMOVL.U8 Q10,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D13,{D8},D13
+ VZIP.8 D12,D13
+
+ VADDW.S8 Q10,Q10,D12 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D22,{D6},D22 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VTBL.8 D23,{D6},D23 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VUZP.8 D22,D23
+
+ VMOVL.U8 Q14,D11 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ @VTBL.8 D13,D7,D13 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMOV Q5,Q15 @II pu1_cur_row = pu1_next_row
+
+ VADDW.S8 Q14,Q14,D13 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VTBL.8 D24,{D7},D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VTBL.8 D25,{D8},D23
+ VZIP.8 D24,D25
+ @VTBL.8 D24,D7,D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ @VTBL.8 D25,D7,D23 @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VADDW.S8 Q13,Q13,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMOVN.I16 D21,Q14 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VMOVL.U8 Q14,D19 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D25 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMAX.S16 Q14,Q14,Q1 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ VST1.8 {Q10},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMOVN.I16 D30,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUBS r11,r11,#2 @II Decrement the ht loop count by 2
+ VMOVN.I16 D31,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q15},[r10],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BEQ PU1_SRC_LOOP_END @Exit the row loop when the count reaches 0 (ht was reduced when pu1_avail[2] or pu1_avail[3] is 0)
+ CMP r11,#1 @checking any residue remains
+ BGT PU1_SRC_LOOP @If more than one row remains, jump to PU1_SRC_LOOP
+
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VCLT.U8 Q7,Q5,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB r10,r10,r1
+
+ VADD.I8 Q11,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q11,Q11,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D22,{D6},D22 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D23,{D6},D23 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VUZP.8 D22,D23
+ VTBL.8 D24,{D7},D22
+ VTBL.8 D25,{D8},D23
+ VZIP.8 D24,D25
+
+ @VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q13,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q13,Q13,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q13,Q13,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ @VTBL.8 D25,D7,D23 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMOVL.U8 Q14,D11 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D30,Q13 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D31,Q14 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q15},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+ VMOV Q5,Q9 @pu1_cur_row = pu1_next_row
+ SUBS r8,r8,#16 @Decrement the wd loop count by 16
+ CMP r8,#8 @Check whether residue remains
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 @If more 16-pixel columns remain, jump to WIDTH_LOOP_16
+ BLT END_LOOPS @Jump to end function
+
+
+WIDTH_RESIDUE:
+ LDRB r4,[r5,#2] @pu1_avail[2]
+ CMP r4,#0 @0 == pu1_avail[2]
+ SUBEQ r11,r0,r1 @pu1_src -= src_strd
+ MOVNE r11,r3 @*pu1_src_top
+ MOV r10,r0
+
+ VLD1.8 D28,[r11]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D29,[r11]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ VLD1.8 D10,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D11,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+
+ VLD1.8 D30,[r12] @vld1_u8(pu1_src[(ht - 1) * src_strd])
+ VST1.8 {D30},[r3] @vst1_u8(pu1_src_top[col])
+
+ VCGT.U8 Q6,Q5,Q14 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ VCLT.U8 Q7,Q5,Q14 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q8,Q7,Q6 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r11,r9 @move ht to r11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ ADD r6,r10,r1 @II Iteration *pu1_src + src_strd
+
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VLD1.8 D30,[r6]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r6] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r6,#8
+
+ VCLT.U8 Q7,Q5,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB r10,r10,r1
+
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOVL.U8 Q13,D18 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADD.I8 Q6,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VCGT.U8 Q11,Q9,Q15 @II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ VADD.I8 Q6,Q6,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VCLT.U8 Q12,Q9,Q15 @II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ VNEG.S8 Q8,Q10 @sign_up = vnegq_s8(sign_down)
+ VTBL.8 D12,{D6},D12 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VSUB.U8 Q10,Q12,Q11 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VUZP.8 D12,D13
+
+ VADD.I8 Q11,Q0,Q8 @II edge_idx = vaddq_s8(const_2, sign_up)
+ VTBL.8 D12,{D7},D12
+ VNEG.S8 Q8,Q10 @II sign_up = vnegq_s8(sign_down)
+
+ VTBL.8 D13,{D8},D13
+ VZIP.8 D12,D13
+
+ @VTBL.8 D12,D7,D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ VADD.I8 Q11,Q11,Q10 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+ VMOVL.U8 Q10,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VADDW.S8 Q10,Q10,D12 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D22,{D6},D22 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VUZP.8 D22,D23
+
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VTBL.8 D24,{D7},D22
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VTBL.8 D25,{D8},D23
+ VZIP.8 D24,D25
+ @VTBL.8 D24,D7,D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ VADDW.S8 Q13,Q13,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOV Q5,Q15 @II pu1_cur_row = pu1_next_row
+ VST1.8 {D20},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMOVN.I16 D30,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SUBS r11,r11,#2 @Decrement the ht loop count by 2
+ VST1.8 {D30},[r10],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BEQ END_LOOPS
+ CMP r11,#1
+ BGT PU1_SRC_LOOP_RESIDUE @If more than one row remains, jump to PU1_SRC_LOOP_RESIDUE
+
+
+ ADD r10,r10,r1 @*pu1_src + src_strd
+ VLD1.8 D18,[r10]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D19,[r10] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r10,#8
+ VCGT.U8 Q6,Q5,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row)
+ VCGT.U8 Q7,Q9,Q5 @vcltq_u8(pu1_cur_row, pu1_next_row)
+ VSUB.U8 Q10,Q7,Q6 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB r10,r10,r1
+
+ VADD.I8 Q11,Q0,Q8 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q11,Q11,Q10 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D22,{D6},D22 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ VUZP.8 D22,D23
+ VTBL.8 D24,{D7},D22
+ VTBL.8 D25,{D8},D23
+ VZIP.8 D24,D25
+
+ @VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q13,D10 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q13,Q13,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q13,Q13,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D30,Q13 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s
new file mode 100644
index 0000000..33b4961
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class2.s
@@ -0,0 +1,811 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_edge_offset_class2.s
+@*
+@* @brief
+@* Contains the function definition for SAO (sample adaptive offset) edge
+@* offset, class 2 (135-degree diagonal). The function is coded in ARM NEON
+@* assembly, with the equivalent NEON intrinsics noted in the comments, and
+@* can be compiled using RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@* ihevc_sao_edge_offset_class2_a9q()
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset,
+@ WORD32 wd,
+@ WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => *pu1_avail
+@r6 => *pi1_sao_offset
+@r7 => wd
+@r8 => ht
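+@
+@ A scalar reference sketch (illustration only; SIGN() and CLIP3() are
+@ assumed helpers, bit_depth is taken as 8 and the pu1_avail corner/border
+@ handling is omitted) of the class 2 (135-degree diagonal) edge offset:
+@ each pixel is compared against its top-left and bottom-right neighbours,
+@ which is why the code below carries sign_up across rows and rotates it
+@ with VEXT:
+@
+@ for (row = 0; row < ht; row++)
+@     for (col = 0; col < wd; col++) {
+@         UWORD8 *p = pu1_src + row * src_strd + col;
+@         WORD32 e = 2 + SIGN(p[0] - p[-src_strd - 1])
+@                      + SIGN(p[0] - p[src_strd + 1]);
+@         e = gi1_table_edge_idx[e];
+@         if (0 != e)
+@             p[0] = CLIP3(p[0] + pi1_sao_offset[e], 0, 255);
+@     }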
+
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+ihevc_sao_edge_offset_class2_a9q:
+
+
+ STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+ LDR r7,[sp,#0x3C] @Loads wd
+
+ LDR r8,[sp,#0x40] @Loads ht
+ SUB r9,r7,#1 @wd - 1
+
+ LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
+
+ STR r0,[sp,#0x2C] @Store pu1_src in sp
+ MOV r9,r7 @Move width to r9 for loop count
+
+ STR r2,[sp,#0x30] @Store pu1_src_left in sp
+ LDR r5,[sp,#0x34] @Loads pu1_avail
+ LDR r6,[sp,#0x38] @Loads pi1_sao_offset
+ STR r3,[sp,#0x38] @Store pu1_src_top in sp
+
+ SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values
+
+ STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1]
+ SUB r10,r8,#1 @ht-1
+ MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
+ ADD r12,sp,#0x02 @temp array
+
+AU1_SRC_TOP_LOOP:
+ VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
+ SUBS r9,r9,#8 @Decrement the loop count by 8
+ VST1.8 D0,[r12]! @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP:
+ LDRB r10,[r5,#4] @pu1_avail[4]
+ CMP r10,#0
+ LDRB r9,[r0] @u1_pos_0_0_tmp = pu1_src[0]
+ BEQ PU1_AVAIL_7_LOOP
+
+ LDRB r11,[r4] @pu1_src_top_left[0]
+ ADD r14,r0,r1 @pu1_src + src_strd
+
+ SUBS r12,r9,r11 @pu1_src[0] - pu1_src_top_left[0]
+ LDRB r4,[r14,#1] @pu1_src[1 + src_strd]
+
+ MVNLT r12,#0
+ MOVGT r12,#1 @SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+ LDR r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+ add r14,r14,pc
+ SUBS r11,r9,r4 @pu1_src[0] - pu1_src[1 + src_strd]
+
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+ ADD r4,r12,r11 @SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+ ADD r4,r4,#2 @edge_idx
+
+ LDRSB r12,[r14,r4] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0 @0 != edge_idx
+ BEQ PU1_AVAIL_7_LOOP
+ LDRSB r10,[r6,r12] @pi1_sao_offset[edge_idx]
+ ADD r9,r9,r10 @pu1_src[0] + pi1_sao_offset[edge_idx]
+ USAT r9,#8,r9 @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP:
+ LDRB r14,[r5,#7] @pu1_avail[7]
+ CMP r14,#0
+ SUB r10,r7,#1 @wd - 1
+ SUB r11,r8,#1 @ht - 1
+ MLA r12,r11,r1,r10 @wd - 1 + (ht - 1) * src_strd
+ ADD r12,r12,r0 @pu1_src[wd - 1 + (ht - 1) * src_strd]
+ LDRB r10,[r12] @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
+ BEQ PU1_AVAIL
+
+ SUB r4,r12,r1 @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+ LDRB r11,[r4,#-1] @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
+ ADD r14,r12,r1 @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+
+ SUBS r11,r10,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
+ LDRB r4,[r14,#1] @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
+
+ SUBS r4,r10,r4 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+ MVNLT r4,#0
+ MOVGT r4,#1 @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
+
+ ADD r11,r11,r4 @Add 2 sign value
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+ add r14,r14,pc
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0
+ BEQ PU1_AVAIL
+ LDRSB r11,[r6,r12] @pi1_sao_offset[edge_idx]
+ ADD r10,r10,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ USAT r10,#8,r10 @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL:
+ MOV r12,r8 @Move ht
+ VMOV.I8 Q0,#2 @const_2 = vdupq_n_s8(2)
+ LDRB r11,[r5,#3] @pu1_avail[3]
+
+ MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
+ VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
+ CMP r11,#0
+
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ SUBEQ r12,r12,#1 @ht_tmp--
+
+ CMP r5,#0
+ VLD1.8 D7,[r6] @offset_tbl = vld1_s8(pi1_sao_offset)
+ LDR r11, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+ add r11,r11,pc
+
+ ADDEQ r0,r0,r1 @pu1_src += src_strd
+ VLD1.8 D6,[r11] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SUBEQ r12,r12,#1 @ht_tmp--
+
+ MOV r6,r7 @move wd to r6 loop_count
+ VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
+ ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1
+
+ STR r0,[sp,#0x90] @Store pu1_src in sp
+ CMP r7,#16 @Compare wd with 16
+
+ BLT WIDTH_RESIDUE @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+ CMP r8,#4 @Compare ht with 4
+ BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ LDR r7,[sp,#0xD0] @Loads wd
+
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ CMP r6,r7 @col == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+ MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ CMP r6,#16 @if(col == 16)
+ BNE SKIP_AU1_MASK_VAL
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 d9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+ LDRB r11,[r5,#2] @pu1_avail[2]
+ CMP r11,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3 @pu1_src_top_cpy
+ SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+ LDR r7,[sp,#0xD0] @Loads wd
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ SUB r8,#8
+ ADD r3,r3,#16
+
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ LDR r4,[sp,#0xD4] @Loads ht
+
+ SUB r7,r7,r6 @(wd - col)
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LDR r8,[sp,#0xC0] @Loads *pu1_src
+
+ ADD r7,r7,#15 @15 + (wd - col)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
+
+ SUB r5,r5,#1
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP:
+ LDRB r8,[r7],r1 @load the value and increment by src_strd
+ STRB r8,[r5,#1]! @store it in the stack pointer
+ SUBS r4,r4,#1 @decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP
+
+ ADD r8,r0,r1 @I Iteration *pu1_src + src_strd
+ VMOV.I8 Q9,#0
+ LDR r4,[sp,#0xC8] @I Loads pu1_avail
+
+ MOV r7,r12 @row count, move ht_tmp to r7
+ VLD1.8 D16,[r8]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ LDRB r4,[r4,#2] @I pu1_avail[2]
+
+ LDRB r5,[r8,#16] @I pu1_src_cpy[src_strd + 16]
+ VMOV.8 D18[0],r5 @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+ VEXT.8 Q9,Q8,Q9,#1 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ CMP r4,#0 @I
+ BNE SIGN_UP_CHANGE_DONE @I
+
+SIGN_UP_CHANGE:
+ SUB r2,r12,r7 @I ht_tmp - row
+ LDRB r11,[r0] @I pu1_src_cpy[0]
+ ADD r2,r14,r2 @I pu1_src_left_cpy[ht_tmp - row]
+
+ LDRB r5,[r2,#-1] @I load the value
+ SUBS r4,r11,r5 @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ MVNLT r4,#0 @I
+ MOVGT r4,#1 @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+ VMOV.8 D14[0],r4 @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE:
+ VCGT.U8 Q5,Q6,Q9 @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VADD.I8 Q12,Q0,Q7 @I edge_idx = vaddq_s8(const_2, sign_up)
+
+ VCLT.U8 Q9,Q6,Q9 @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q5,Q9,Q5 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q12,Q12,Q5 @I edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D18,{D6},D24 @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D19,{D6},D25 @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q9,Q9,Q4 @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q5 @I sign_up = vnegq_s8(sign_down)
+ VTBL.8 D10,{D7},D18 @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#15 @I sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ VMOVL.U8 Q10,D12 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D11,{D7},D19 @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VADDW.S8 Q10,Q10,D10 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q10,Q10,Q1 @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMOVL.U8 Q11,D13 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VMIN.U16 Q10,Q10,Q2 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VMOV Q6,Q8 @I pu1_cur_row = pu1_next_row
+
+ VADDW.S8 Q11,Q11,D11 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMOVN.I16 D20,Q10 @I vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VMAX.S16 Q11,Q11,Q1 @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SUB r7,r7,#1 @I Decrement the ht_tmp loop count by 1
+
+ VMIN.U16 Q11,Q11,Q2 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D21,Q11 @I vmovn_s16(pi2_tmp_cur_row.val[1])
+
+PU1_SRC_LOOP:
+
+ VST1.8 {Q10},[r0],r1 @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ ADD r8,r0,r1 @II iteration *pu1_src + src_strd
+
+ VLD1.8 D16,[r8]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ ADD r11,r8,r1 @III iteration *pu1_src + src_strd
+
+ LDRB r5,[r8,#16] @II pu1_src_cpy[src_strd + 16]
+ VLD1.8 D30,[r11]! @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r11] @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r11,#8
+ LDRB r4,[r0] @II pu1_src_cpy[0]
+
+ LDRB r8,[r11,#16] @III pu1_src_cpy[src_strd + 16]
+ VMOV.8 D28[0],r5 @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+ SUB r5,r12,r7 @II ht_tmp - row
+ VEXT.8 Q11,Q8,Q14,#1 @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ ADD r5,r14,r5 @II pu1_src_left_cpy[ht_tmp - row]
+
+ LDRB r5,[r5,#-1] @II load the value
+ VMOV.8 D18[0],r8 @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ SUB r7,r7,#1 @II Decrement the ht_tmp loop count by 1
+
+ SUBS r4,r4,r5 @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ VEXT.8 Q9,Q15,Q9,#1 @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ LDRB r2,[r0,r1] @III pu1_src_cpy[0]
+
+ VCGT.U8 Q12,Q6,Q11 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB r5,r12,r7 @III ht_tmp - row
+
+ MVNLT r4,#0 @II
+ VCLT.U8 Q11,Q6,Q11 @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD r5,r14,r5 @III pu1_src_left_cpy[ht_tmp - row]
+
+ MOVGT r4,#1 @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+ VSUB.U8 Q12,Q11,Q12 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ LDRB r5,[r5,#-1] @III load the value
+
+ SUBS r2,r2,r5 @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ VMOV.8 D14[0],r4 @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+ MVNLT r2,#0 @III
+ VCGT.U8 Q5,Q8,Q9 @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ MOVGT r2,#1 @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+
+ VADD.I8 Q11,Q0,Q7 @II edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q11,Q11,Q12 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VCLT.U8 Q9,Q8,Q9 @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VTBL.8 D22,{D6},D22 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q12 @II sign_up = vnegq_s8(sign_down)
+
+ VSUB.U8 Q5,Q9,Q5 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D23,{D6},D23 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#15 @II sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ VAND Q11,Q11,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+ VMOV.8 D14[0],r2 @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+ VADD.I8 Q9,Q0,Q7 @III edge_idx = vaddq_s8(const_2, sign_up)
+ VTBL.8 D24,{D7},D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VADD.I8 Q9,Q9,Q5 @III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VMOVL.U8 Q13,D12 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D18,{D6},D18 @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q5 @III sign_up = vnegq_s8(sign_down)
+
+ VADDW.S8 Q13,Q13,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D19,{D6},D19 @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#15 @III sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ VAND Q9,Q9,Q4 @III edge_idx = vandq_s8(edge_idx, au1_mask)
+ VMOVL.U8 Q10,D16 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VTBL.8 D10,{D7},D18 @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VADDW.S8 Q10,Q10,D10 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VTBL.8 D25,{D7},D23 @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMAX.S16 Q10,Q10,Q1 @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMOVL.U8 Q14,D13 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VMIN.U16 Q10,Q10,Q2 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q14,Q14,D25 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VTBL.8 D11,{D7},D19 @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMAX.S16 Q14,Q14,Q1 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMIN.U16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ VMOVL.U8 Q9,D17 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VMOV Q6,Q15 @III pu1_cur_row = pu1_next_row
+ VMOVN.I16 D26,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VMOVN.I16 D27,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+ VADDW.S8 Q9,Q9,D11 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q9,Q9,Q1 @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SUB r7,r7,#1 @III Decrement the ht_tmp loop count by 1
+ VMIN.U16 Q9,Q9,Q2 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ CMP r7,#1 @III
+
+ VST1.8 {Q13},[r0],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ BGT PU1_SRC_LOOP @III If more than one row remains, jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+ VST1.8 {Q10},[r0],r1 @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ ADD r8,r0,r1 @*pu1_src + src_strd
+
+ LDRB r2,[r0] @pu1_src_cpy[0]
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ LDRB r5,[r8,#16] @pu1_src_cpy[src_strd + 16]
+
+ SUB r11,r12,r7 @ht_tmp - row
+ VMOV.8 D18[0],r5 @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ ADD r11,r14,r11 @pu1_src_left_cpy[ht_tmp - row]
+
+ LDRB r5,[r11,#-1] @load the value
+ VEXT.8 Q9,Q8,Q9,#1 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ SUBS r4,r2,r5 @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+
+ VCGT.U8 Q5,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ MVNLT r4,#0
+
+ MOVGT r4,#1 @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+ VCLT.U8 Q9,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VMOV.8 D14[0],r4 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ VSUB.U8 Q5,Q9,Q5 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q9,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q9,Q9,Q5 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VTBL.8 D18,{D6},D18 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q5 @sign_up = vnegq_s8(sign_down)
+
+ VTBL.8 D19,{D6},D19 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#15 @sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ VAND Q9,Q9,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VTBL.8 D10,{D7},D18 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ VMOVL.U8 Q10,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D11,{D7},D19 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VADDW.S8 Q10,Q10,D10 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMOVL.U8 Q6,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VADDW.S8 Q6,Q6,D11 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q6,Q6,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VMIN.U16 Q6,Q6,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ VMOVN.I16 D21,Q6 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+INNER_LOOP_DONE:
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+
+ LDR r8,[sp,#0xD4] @Loads ht
+ SUB r5,r5,#1
+
+ SUB r2,r2,#1
+SRC_LEFT_LOOP:
+ LDRB r7,[r5,#1]! @au1_src_left_tmp[row]
+ SUBS r8,r8,#1
+ STRB r7,[r2,#1]! @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUB r6,r6,#16 @Decrement the wd loop count by 16
+ CMP r6,#8 @Check whether residue remains
+ BLT RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r0,[sp,#0x90] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
+ BGT WIDTH_LOOP_16 @If more 16-pixel columns remain, jump to WIDTH_LOOP_16
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+
+
+WD_16_HT_4_LOOP:
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ CMP r6,r7 @col == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+ MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ CMP r6,#16 @if(col == 16)
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 d9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB r8,[r5,#2] @pu1_avail[2]
+ CMP r8,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3
+ SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+ LDR r7,[sp,#0xD0] @Loads wd
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ SUB r8,#8
+ ADD r3,r3,#16
+
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ LDR r4,[sp,#0xD4] @Loads ht
+
+ SUB r7,r7,r6 @(wd - col)
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LDR r8,[sp,#0xC0] @Loads *pu1_src
+
+ ADD r7,r7,#15 @15 + (wd - col)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
+
+ SUB r5,r5,#1
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRB r8,[r7],r1 @load the value and increment by src_strd
+ SUBS r4,r4,#1 @decrement the loop count
+ STRB r8,[r5,#1]! @store it in the stack pointer
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ VMOV.I8 Q9,#0
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ ADD r8,r0,r1 @*pu1_src + src_strd
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+
+ LDRB r5,[r8,#16] @pu1_src_cpy[src_strd + 16]
+ VMOV.8 D18[0],r5 @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ VEXT.8 Q9,Q8,Q9,#1 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+ CMP r7,r12
+ BLT SIGN_UP_CHANGE_WD_16_HT_4
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB r8,[r0] @pu1_src_cpy[0]
+ SUB r5,r12,r7 @ht_tmp - row
+ ADD r5,r14,r5 @pu1_src_left_cpy[ht_tmp - row]
+ LDRB r5,[r5,#-1] @load the value
+ SUBS r8,r8,r5 @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ MVNLT r8,#0
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+ VMOV.8 d14[0],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ VCGT.U8 Q10,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q11,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q11,Q10 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D26,{D6},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D6},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q7,Q7,Q7,#15 @sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ VTBL.8 D24,{D7},D26 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VTBL.8 D25,{D7},D27 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMOVL.U8 Q15,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q15,Q15,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q15,Q15,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q15,Q15,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D29,Q15 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ LDR r8,[sp,#0xD4] @Loads ht
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ SUB r5,r5,#1
+ SUB r2,r2,#1
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRB r7,[r5,#1]! @au1_src_left_tmp[row]
+ STRB r7,[r2,#1]! @pu1_src_left[row] = au1_src_left_tmp[row]
+ SUBS r8,r8,#1
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+
+
+WIDTH_RESIDUE:
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ CMP r6,r7 @wd_residue == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+
+ MOVNE r8,#-1
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ LDRB r8,[r5,#1] @pu1_avail[1]
+    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+PU1_AVAIL_2_RESIDUE:
+ LDRB r11,[r5,#2] @pu1_avail[2]
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ CMP r11,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3
+
+ SUB r8,r8,#1
+
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+ VLD1.8 D11,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+ LDR r7,[sp,#0xD0] @Loads wd
+
+ LDR r4,[sp,#0xD4] @Loads ht
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ SUB r7,r7,#1 @(wd - 1)
+
+ LDR r8,[sp,#0xC0] @Loads *pu1_src
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB r5,r5,#1
+
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 1)]
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRB r8,[r7],r1 @load the value and increment by src_strd
+ SUBS r4,r4,#1 @decrement the loop count
+ STRB r8,[r5,#1]! @store it in the stack pointer
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+ VMOV.I8 Q9,#0
+ ADD r8,r0,r1 @*pu1_src + src_strd
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+
+ LDRB r8,[r8,#16] @pu1_src_cpy[src_strd + 16]
+ VMOV.8 d18[0],r8 @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ VEXT.8 Q9,Q8,Q9,#1 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+ CMP r7,r12
+ BLT SIGN_UP_CHANGE_RESIDUE
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB r8,[r0] @pu1_src_cpy[0]
+ SUB r5,r12,r7 @ht_tmp - row
+
+ ADD r5,r14,r5
+ LDRB r5,[r5,#-1] @load the value
+ SUBS r8,r8,r5 @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ MVNLT r8,#0
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ VCGT.U8 Q10,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q11,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q11,Q10 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D26,{D6},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D6},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q7,Q7,Q7,#15 @sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ VTBL.8 D24,{D7},D26 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D30,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D30},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#1
+ BNE PU1_SRC_LOOP_RESIDUE
+
+ LDR r8,[sp,#0xD4] @Loads ht
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+
+ LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ SUB r5,r5,#1
+
+ SUB r2,r2,#1
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDRB r7,[r5,#1]! @au1_src_left_tmp[row]
+ SUBS r8,r8,#1
+ STRB r7,[r2,#1]! @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ LDR r8,[sp,#0xD4] @Loads ht
+ LDR r7,[sp,#0xD0] @Loads wd
+
+ LDR r0,[sp,#0xC0] @Loads *pu1_src
+ SUB r8,r8,#1 @ht - 1
+
+ MLA r6,r8,r1,r7 @wd - 1 + (ht - 1) * src_strd
+ STRB r9,[r0] @pu1_src_org[0] = u1_pos_0_0_tmp
+
+ LDR r4,[sp,#0xBC] @Loads pu1_src_top_left
+ ADD r6,r0,r6 @pu1_src[wd - 1 + (ht - 1) * src_strd]
+
+ ADD r12,sp,#0x02
+ STRB r10,[r6,#-1] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+
+ LDRB r11,[sp] @load u1_src_top_left_tmp from stack pointer
+ LDR r3,[sp,#0xCC] @Loads pu1_src_top
+
+ STRB r11,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
+
+SRC_TOP_LOOP:
+ VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS r7,r7,#8 @Decrement the width
+ VST1.8 D0,[r3]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0x94
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
new file mode 100644
index 0000000..c6fb391
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s
@@ -0,0 +1,1001 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class2_chroma.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset, class 2 (135-degree
+@*  diagonal), on interleaved chroma. Functions are coded using NEON
+@*  intrinsics and can be compiled using ARM RVCT.
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*  - ihevc_sao_edge_offset_class2_chroma_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset_u,
+@ WORD8 *pi1_sao_offset_v,
+@ WORD32 wd,
+@ WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => *pu1_avail
+@r6 => *pi1_sao_offset_u
+@r9 => *pi1_sao_offset_v
+@r7 => wd
+@r8=> ht
+
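+@ For reference, a scalar sketch (not part of the original source) of the
+@ per-sample rule this routine vectorizes. Class 2 compares each sample with
+@ its top-left and bottom-right neighbours; chroma is interleaved, so the
+@ diagonal neighbour of a sample sits 2 bytes away, and U and V share the
+@ edge index computation but use separate offset tables:
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < wd; col++)   /* even col = U, odd col = V */
+@       {
+@           WORD32 sign_up   = SIGN(src[row][col] - src[row - 1][col - 2]);
+@           WORD32 sign_down = SIGN(src[row][col] - src[row + 1][col + 2]);
+@           WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
+@           WORD8 *pi1_off   = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
+@           if(0 != edge_idx)
+@               src[row][col] = CLIP3(src[row][col] + pi1_off[edge_idx],
+@                                     0, (1 << bit_depth) - 1);
+@       }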
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_chroma_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+gi1_table_edge_idx_addr_4:
+.long gi1_table_edge_idx - ulbl4 - 8
+
+gi1_table_edge_idx_addr_5:
+.long gi1_table_edge_idx - ulbl5 - 8
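+
+@ NOTE: each literal above holds "gi1_table_edge_idx - ulblN - 8". The pairing
+@ "LDR rX, gi1_table_edge_idx_addr_N" / "ADD rX, rX, pc" at label ulblN then
+@ produces the absolute table address, since PC reads as the address of the
+@ ADD plus 8 in ARM state; this keeps the code position-independent.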
+
+ihevc_sao_edge_offset_class2_chroma_a9q:
+
+
+ STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+
+ LDR r7,[sp,#0x40] @Loads wd
+ LDR r8,[sp,#0x44] @Loads ht
+ SUB r9,r7,#2 @wd - 2
+
+ LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDRH r10,[r3,r9] @pu1_src_top[wd - 2]
+
+ STR r0,[sp,#0x2C] @Store pu1_src in sp
+ MOV r9,r7 @Move width to r9 for loop count
+
+ STR r2,[sp,#0x30] @Store pu1_src_left in sp
+ LDR r5,[sp,#0x34] @Loads pu1_avail
+ LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u
+
+ STR r3,[sp,#0x38] @Store pu1_src_top in sp
+ SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values
+
+ STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2]
+ SUB r10,r8,#1 @ht-1
+ MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
+ ADD r12,sp,#10 @temp array
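+@ Scratch frame layout (0xD4 bytes), as used by the sp-relative offsets below
+@ (a summary, not in the original source):
+@   sp+0x00: u1_src_top_left_tmp (U,V)    sp+0x02: pu1_src block pointer
+@   sp+0x06: u1_pos_0_0_tmp_u/_v          sp+0x08: u1_pos_wd_ht_tmp_u/_v
+@   sp+0x0A: au1_src_top_tmp[]            sp+0x4B: au1_src_left_tmp[]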
+
+AU1_SRC_TOP_LOOP:
+ VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
+ SUBS r9,r9,#8 @Decrement the loop count by 8
+ VST1.8 D0,[r12]! @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP_U:
+ LDRB r9,[r5,#4] @pu1_avail[4]
+ CMP r9,#0
+ LDRB r9,[r0] @u1_pos_0_0_tmp_u = pu1_src[0]
+ LDRB r10,[r0,#1] @u1_pos_0_0_tmp_v = pu1_src[1]
+ BEQ PU1_AVAIL_7_LOOP_U
+
+ LDRB r11,[r4] @pu1_src_top_left[0]
+ ADD r14,r0,r1 @pu1_src + src_strd
+
+ SUB r12,r9,r11 @pu1_src[0] - pu1_src_top_left[0]
+
+ LDRB r14,[r14,#2] @pu1_src[2 + src_strd]
+ CMP r12,#0
+
+ MVNLT r12,#0
+ SUB r11,r9,r14 @pu1_src[0] - pu1_src[2 + src_strd]
+
+ MOVGT r12,#1 @SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+ CMP r11,#0
+ MVNLT r11,#0
+ LDR r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+ add r14,r14,pc
+ MOVGT r11,#1 @SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+
+ ADD r11,r12,r11 @SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+ ADD r11,r11,#2 @edge_idx
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0 @0 != edge_idx
+ BEQ PU1_AVAIL_4_LOOP_V
+ LDRSB r11,[r6,r12] @pi1_sao_offset_u[edge_idx]
+ ADD r9,r9,r11 @pu1_src[0] + pi1_sao_offset_u[edge_idx]
+ USAT r9,#8,r9 @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
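+@ NOTE: USAT r9,#8,r9 saturates to the unsigned 8-bit range, i.e. it is the
+@ CLIP3(x, 0, 255) of the comment, for bit_depth = 8, in one instruction.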
+
+PU1_AVAIL_4_LOOP_V:
+
+ LDRB r11,[r4,#1] @pu1_src_top_left[1]
+ ADD r14,r0,r1 @pu1_src + src_strd
+
+ SUB r12,r10,r11 @pu1_src[1] - pu1_src_top_left[1]
+ LDRB r14,[r14,#3] @pu1_src[3 + src_strd]
+
+ CMP r12,#0
+ MVNLT r12,#0
+ SUB r11,r10,r14 @pu1_src[1] - pu1_src[3 + src_strd]
+    MOVGT       r12,#1                      @SIGN(pu1_src[1] - pu1_src_top_left[1])
+
+ CMP r11,#0
+ MVNLT r11,#0
+ LDR r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+ add r14,r14,pc
+    MOVGT       r11,#1                      @SIGN(pu1_src[1] - pu1_src[3 + src_strd])
+
+    ADD         r11,r12,r11                 @SIGN(pu1_src[1] - pu1_src_top_left[1]) + SIGN(pu1_src[1] - pu1_src[3 + src_strd])
+ ADD r11,r11,#2 @edge_idx
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0 @0 != edge_idx
+ BEQ PU1_AVAIL_7_LOOP_U
+ LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx]
+    ADD         r10,r10,r11                 @pu1_src[1] + pi1_sao_offset_v[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_U:
+ STRB r10,[sp,#7]
+ STRB r9,[sp,#6]
+
+ LDRB r10,[r5,#7] @pu1_avail[7]
+ CMP r10,#0
+ SUB r10,r7,#2 @wd - 2
+ SUB r11,r8,#1 @ht - 1
+ MLA r12,r11,r1,r10 @wd - 2 + (ht - 1) * src_strd
+ ADD r12,r12,r0 @pu1_src[wd - 2 + (ht - 1) * src_strd]
+ LDRB r10,[r12] @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
+    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd + 1]
+ BEQ PU1_AVAIL_3_LOOP
+
+ SUB r11,r12,r1 @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
+ SUB r11,r11,#2 @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+ LDRB r11,[r11] @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+ SUB r11,r10,r11 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
+
+ ADD r14,r12,r1 @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
+ ADD r14,r14,#2 @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+ LDRB r14,[r14] @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    SUB         r14,r10,r14                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+ CMP r14,#0
+ MVNLT r14,#0
+ MOVGT r14,#1 @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
+
+ ADD r11,r11,r14 @Add 2 sign value
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+ add r14,r14,pc
+
+ LDRSB r14,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r14,#0
+ BEQ PU1_AVAIL_7_LOOP_V
+ LDRSB r11,[r6,r14] @pi1_sao_offset_u[edge_idx]
+ ADD r10,r10,r11 @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_V:
+ ADD r12,r12,#1
+ SUB r11,r12,r1 @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+ SUB r11,r11,#2 @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+ LDRB r11,[r11] @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+ SUB r11,r9,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
+
+ ADD r14,r12,r1 @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+ ADD r14,r14,#2 @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+ LDRB r14,[r14] @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+ SUB r14,r9,r14 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+ CMP r14,#0
+ MVNLT r14,#0
+    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])
+
+ ADD r11,r11,r14 @Add 2 sign value
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_4 @table pointer
+ulbl4:
+ add r14,r14,pc
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0
+ BEQ PU1_AVAIL_3_LOOP
+ LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx]
+ ADD r9,r9,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+ STRB r10,[sp,#8]
+ VMOV.I8 Q0,#2 @const_2 = vdupq_n_s8(2)
+ STRB r9,[sp,#9]
+
+ MOV r12,r8 @Move ht
+ VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
+ MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
+
+ LDRB r11,[r5,#3] @pu1_avail[3]
+ VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ CMP r11,#0
+
+ SUBEQ r12,r12,#1 @ht_tmp--
+ LDRB r5,[r5,#2] @pu1_avail[2]
+
+ CMP r5,#0
+
+ ADDEQ r0,r0,r1 @pu1_src += src_strd
+ VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+ SUBEQ r12,r12,#1 @ht_tmp--
+
+ LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v
+ ADDEQ r14,r14,#2 @pu1_src_left_cpy += 2
+
+ STR r0,[sp,#2] @Store pu1_src in sp
+ VLD1.8 D7,[r6] @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+ LDR r2, gi1_table_edge_idx_addr_5 @table pointer
+ulbl5:
+ add r2,r2,pc
+
+ MOV r6,r7 @move wd to r6 loop_count
+ VMOV.S8 Q4,#0XFF @au1_mask = vdupq_n_s8(-1)
+ CMP r7,#16 @Compare wd with 16
+
+ BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
+ CMP r8,#4 @Compare ht with 4
+ BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r7,[sp,#0x114] @Loads wd
+ CMP r6,r7 @col == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+
+ MOVNE r8,#-1
+ VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP r6,#16 @if(col == 16)
+    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ BNE SKIP_AU1_MASK_VAL
+ LDRB r8,[r5,#1] @pu1_avail[1]
+    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ VMOV.8 D9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+ LDRB r9,[r5,#2] @pu1_avail[2]
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ CMP r9,#0
+
+ LDR r4,[sp,#0x118] @Loads ht
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+
+ LDR r7,[sp,#0x114] @Loads wd
+ MOVNE r8,r3 @pu1_src_top_cpy
+
+ SUB r8,r8,#2 @pu1_src - src_strd - 2
+ ADD r3,r3,#16
+
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ SUB r8,#8
+ SUB r7,r7,r6 @(wd - col)
+
+    ADD         r7,r7,#14                   @14 + (wd - col)
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LDR r8,[sp,#0x100] @Loads *pu1_src
+
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+AU1_SRC_LEFT_LOOP:
+ LDRH r8,[r7] @load the value and increment by src_strd
+ SUBS r4,r4,#1 @decrement the loop count
+
+ STRH r8,[r5],#2 @store it in the stack pointer
+ ADD r7,r7,r1
+
+ BNE AU1_SRC_LEFT_LOOP
+
+ ADD r8,r0,r1 @I *pu1_src + src_strd
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+ VLD1.8 D16,[r8]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+
+ ADD r8,r8,#16 @I
+ VMOV.I8 Q9,#0
+ LDRH r5,[r8] @I pu1_src_cpy[src_strd + 16]
+
+ LDR r10,[sp,#0x108] @I Loads pu1_avail
+ VMOV.16 D18[0],r5 @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ LDRB r10,[r10,#2] @I pu1_avail[2]
+
+ CMP r10,#0 @I
+ VEXT.8 Q9,Q8,Q9,#2 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+ BNE SIGN_UP_CHANGE_DONE @I
+
+ LDRB r11,[r0] @I pu1_src_cpy[0]
+ SUB r4,r12,r7 @I ht_tmp - row
+
+    LDRB        r10,[r0,#1]                 @I pu1_src_cpy[1]
+ LSL r4,r4,#1 @I (ht_tmp - row) * 2
+
+ ADD r9,r14,r4 @I pu1_src_left_cpy[(ht_tmp - row) * 2]
+ LDRB r5,[r9,#-2] @I load the value
+
+ SUB r8,r11,r5 @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+ LDRB r5,[r9,#-1] @I load the value
+
+ CMP r8,#0 @I
+ SUB r4,r10,r5 @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+
+ MVNLT r8,#0 @I
+ MOVGT r8,#1 @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+ CMP r4,#0 @I
+    VMOV.8      D14[0],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+    MVNLT       r4,#0                       @I
+
+    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+    VMOV.8      D14[1],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE:
+ VLD1.8 D30,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VCGT.U8 Q10,Q6,Q9 @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VCLT.U8 Q11,Q6,Q9 @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q11,Q11,Q10 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q9,Q0,Q7 @I edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q9,Q9,Q11 @I edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VTBL.8 D18,{D30},D18 @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q11 @I sign_up = vnegq_s8(sign_down)
+
+ VTBL.8 D19,{D30},D19 @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#14 @I sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ VMOVL.U8 Q10,D12 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q11,Q9,Q4 @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VMOVL.U8 Q9,D13 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VUZP.8 D22,D23 @I
+
+ VTBL.8 D22,{D6},D22 @I
+ VTBL.8 D23,{D7},D23 @I
+ VZIP.8 D22,D23 @I
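+@ NOTE: the VUZP/VTBL/VTBL/VZIP sequence above de-interleaves the UVUV edge
+@ indices so U bytes index offset_tbl_u (D6) and V bytes index offset_tbl_v
+@ (D7), then re-interleaves the looked-up offsets back into UVUV order.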
+
+ VMOV Q6,Q8 @I pu1_cur_row = pu1_next_row
+ VADDW.S8 Q10,Q10,D22 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q10,Q10,Q1 @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q10,Q10,Q2 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q9,Q9,D23 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q9,Q9,Q1 @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMIN.U16 Q9,Q9,Q2 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ SUB r7,r7,#1 @I Decrement the ht_tmp loop count by 1
+
+
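+@ The @I/@II/@III tags in this loop mark three software-pipelined iterations
+@ of the row loop: loads and sign computations of rows II and III are
+@ interleaved with the table lookups and stores of row I to hide load and
+@ NEON pipeline latency.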
+PU1_SRC_LOOP:
+ ADD r8,r0,r1,LSL #1 @II *pu1_src + src_strd
+ VMOVN.I16 D20,Q10 @I vmovn_s16(pi2_tmp_cur_row.val[0])
+ ADD r11,r8,r1 @III *pu1_src + src_strd
+
+ VLD1.8 D16,[r8]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ VLD1.8 D30,[r11]! @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r11] @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r11,#8
+
+ ADD r8,r8,#16 @II
+ VMOVN.I16 D21,Q9 @I vmovn_s16(pi2_tmp_cur_row.val[1])
+ LDRH r5,[r8] @II pu1_src_cpy[src_strd + 16]
+
+ ADD r11,r11,#16 @III
+ VMOV.16 D28[0],r5 @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ LDRH r4,[r11] @III pu1_src_cpy[src_strd + 16]
+
+ LDRB r8,[r0,r1] @II pu1_src_cpy[0]
+ VEXT.8 Q14,Q8,Q14,#2 @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+ SUB r5,r12,r7 @II ht_tmp - row
+
+ LSL r5,r5,#1 @II (ht_tmp - row) * 2
+ VMOV.16 D18[0],r4 @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ ADD r9,r14,r5 @II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ LDRB r11,[r9,#-2] @II load the value
+ VST1.8 {Q10},[r0],r1 @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ SUB r8,r8,r11 @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+ CMP r8,#0 @II
+ VEXT.8 Q9,Q15,Q9,#2 @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    LDRB        r11,[r0,#1]                 @II pu1_src_cpy[1]
+
+ MVNLT r8,#0 @II
+ VCGT.U8 Q11,Q6,Q14 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ MOVGT r8,#1 @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+ LDRB r5,[r9,#-1] @II load the value
+    VMOV.8      D14[0],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ SUB r7,r7,#1 @II Decrement the ht_tmp loop count by 1
+
+ SUB r11,r11,r5 @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ VCLT.U8 Q12,Q6,Q14 @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ CMP r11,#0 @II
+
+ MVNLT r11,#0 @II
+ VSUB.U8 Q12,Q12,Q11 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+ LDRB r4,[r0,r1] @III pu1_src_cpy[0]
+ VLD1.8 D22,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SUB r5,r12,r7 @III ht_tmp - row
+
+ ADD r10,r0,r1
+ VMOV.8 D14[1],r11 @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ LSL r5,r5,#1 @III (ht_tmp - row) * 2
+
+ ADD r9,r14,r5 @III pu1_src_left_cpy[(ht_tmp - row) * 2]
+ VADD.I8 Q13,Q0,Q7 @II edge_idx = vaddq_s8(const_2, sign_up)
+    LDRB        r10,[r10,#1]                @III pu1_src_cpy[1]
+
+ LDRB r5,[r9,#-2] @III load the value
+ VADD.I8 Q13,Q13,Q12 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+ SUB r4,r4,r5 @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+ CMP r4,#0 @III
+ LDRB r9,[r9,#-1] @III load the value
+ VTBL.8 D26,{D22},D26 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q12 @II sign_up = vnegq_s8(sign_down)
+
+ MVNLT r4,#0 @III
+ SUB r10,r10,r9 @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ VTBL.8 D27,{D22},D27 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#14 @II sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ MOVGT r4,#1 @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+ VAND Q13,Q13,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+ CMP r10,#0 @III
+
+ VUZP.8 D26,D27 @II
+    VMOV.8      d14[0],r4                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+ MVNLT r10,#0 @III
+    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ VTBL.8 D24,{D6},D26 @II
+ VCGT.U8 Q10,Q8,Q9 @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VCLT.U8 Q11,Q8,Q9 @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VTBL.8 D25,{D7},D27 @II
+ VSUB.U8 Q11,Q11,Q10 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VMOV.8 D14[1],r10 @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ VZIP.8 D24,D25 @II
+
+ VMOVL.U8 Q14,D12 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADD.I8 Q9,Q0,Q7 @III edge_idx = vaddq_s8(const_2, sign_up)
+
+ VLD1.8 D20,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VADDW.S8 Q14,Q14,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VADD.I8 Q9,Q9,Q11 @III edge_idx = vaddq_s8(edge_idx, sign_down)
+ VMAX.S16 Q14,Q14,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMIN.U16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VTBL.8 D18,{D20},D18 @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q11 @III sign_up = vnegq_s8(sign_down)
+
+ VTBL.8 D19,{D20},D19 @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#14 @III sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ VMOVL.U8 Q13,D13 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VAND Q9,Q9,Q4 @III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VUZP.8 D18,D19 @III
+ VTBL.8 D22,{D6},D18 @III
+ VADDW.S8 Q13,Q13,D25 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMOV Q6,Q15 @III pu1_cur_row = pu1_next_row
+ VTBL.8 D23,{D7},D19 @III
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMOVL.U8 Q10,D16 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VZIP.8 D22,D23 @III
+ VMOVN.I16 D28,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VMOVN.I16 D29,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+ VADDW.S8 Q10,Q10,D22 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMOVL.U8 Q9,D17 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VMAX.S16 Q10,Q10,Q1 @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMIN.U16 Q10,Q10,Q2 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ VADDW.S8 Q9,Q9,D23 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SUB r7,r7,#1 @III Decrement the ht_tmp loop count by 1
+ VMAX.S16 Q9,Q9,Q1 @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ CMP r7,#1
+
+ VST1.8 {Q14},[r0],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMIN.U16 Q9,Q9,Q2 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ BGT PU1_SRC_LOOP @If not equal jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+ ADD r8,r0,r1,LSL #1 @*pu1_src + src_strd
+ VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ LDRB r11,[r0,r1] @pu1_src_cpy[0]
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ SUB r4,r12,r7 @ht_tmp - row
+
+ ADD r8,r8,#16
+ VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1])
+ LDRH r5,[r8] @pu1_src_cpy[src_strd + 16]
+
+ LSL r4,r4,#1 @(ht_tmp - row) * 2
+ VMOV.16 D18[0],r5 @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ ADD r9,r14,r4 @pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ LDRB r5,[r9,#-2] @load the value
+ VEXT.8 Q9,Q8,Q9,#2 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+ SUB r8,r11,r5 @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+ CMP r8,#0
+ VST1.8 {Q10},[r0],r1 @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ MVNLT r8,#0
+
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+ VLD1.8 D30,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+
+    LDRB        r11,[r0,#1]                 @pu1_src_cpy[1]
+    VMOV.8      D14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ LDRB r5,[r9,#-1] @load the value
+
+ SUB r4,r11,r5 @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ VCGT.U8 Q11,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ CMP r4,#0
+
+ MVNLT r4,#0
+ VCLT.U8 Q12,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+ VMOV.8 D14[1],r4 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ VSUB.U8 Q12,Q12,Q11 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VTBL.8 D26,{D30},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D30},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VMOVL.U8 Q10,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VMOVL.U8 Q9,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VUZP.8 D26,D27
+
+ VTBL.8 D24,{D6},D26
+ VTBL.8 D25,{D7},D27
+ VZIP.8 D24,D25
+
+ VADDW.S8 Q10,Q10,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q9,Q9,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q9,Q9,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q9,Q9,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+ LDR r8,[sp,#0x118] @Loads ht
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+ LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ VMOVN.I16 D21,Q9 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+SRC_LEFT_LOOP:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#2
+ STR r7,[r11],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ CMP r6,#8 @Check whether residue remains
+
+ BLT RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#0x114] @Loads wd
+ LDR r0,[sp,#0x02] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
+ BGT WIDTH_LOOP_16 @If not equal jump to width_loop
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+
+
+WD_16_HT_4_LOOP:
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDR r7,[sp,#0x114] @Loads wd
+ CMP r6,r7 @col == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+
+ MOVNE r8,#-1
+ VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ CMP r6,#16 @if(col == 16)
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB r8,[r5,#1] @pu1_avail[1]
+    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ VMOV.8 D9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB r8,[r5,#2] @pu1_avail[2]
+ CMP r8,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3 @pu1_src_top_cpy
+ SUB r8,r8,#2 @pu1_src - src_strd - 2
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ SUB r8,#8
+
+ ADD r3,r3,#16
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r4,[sp,#0x118] @Loads ht
+ LDR r7,[sp,#0x114] @Loads wd
+ SUB r7,r7,r6 @(wd - col)
+    ADD         r7,r7,#14                   @14 + (wd - col)
+ LDR r8,[sp,#0x100] @Loads *pu1_src
+    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRH r8,[r7] @load the value and increment by src_strd
+ STRH r8,[r5],#2 @store it in the stack pointer
+ ADD r7,r7,r1
+
+ SUBS r4,r4,#1 @decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOV.I8 Q9,#0
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ VMOV.I8 Q9,#0
+ ADD r8,r0,r1 @*pu1_src + src_strd
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+
+ ADD r8,r8,#16
+ LDRH r5,[r8] @pu1_src_cpy[src_strd + 16]
+ VMOV.16 D18[0],r5 @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ VEXT.8 Q9,Q8,Q9,#2 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+ CMP r7,r12
+ BLT SIGN_UP_CHANGE_WD_16_HT_4
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB r8,[r0] @pu1_src_cpy[0]
+ SUB r5,r12,r7 @ht_tmp - row
+ LSL r5,r5,#1 @(ht_tmp - row) * 2
+ ADD r9,r14,r5 @pu1_src_left_cpy[(ht_tmp - row) * 2]
+ LDRB r5,[r9,#-2] @load the value
+ SUB r8,r8,r5 @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+ CMP r8,#0
+ MVNLT r8,#0
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
+ LDRB r5,[r9,#-1] @load the value
+ SUB r8,r8,r5 @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ CMP r8,#0
+ MVNLT r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ VMOV.8 d14[1],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ VCGT.U8 Q11,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q12,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q12,Q11 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VLD1.8 D22,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VTBL.8 D26,{D22},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D22},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q7,Q7,Q7,#14 @sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ VUZP.8 D26,D27
+ VTBL.8 D24,{D6},D26
+ VTBL.8 D25,{D7},D27
+ VZIP.8 D24,D25
+
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVL.U8 Q13,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q13,Q13,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q13,Q13,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q13,Q13,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D29,Q13 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ LDR r8,[sp,#0x118] @Loads ht
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r11,[sp,#0x104] @Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ STR r7,[r11],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+
+ SUBS r8,r8,#2
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+ BGT WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+ LDR r7,[sp,#0x114] @Loads wd
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ CMP r6,r7 @wd_residue == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+
+ MOVNE r8,#-1
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ LDRB r8,[r5,#1] @pu1_avail[1]
+    VMOV.8      d8[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+ LDRB r8,[r5,#2] @pu1_avail[2]
+ CMP r8,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3
+ SUB r8,r8,#2 @pu1_src - src_strd - 2
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+ SUB r8,#8
+
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r4,[sp,#0x118] @Loads ht
+ LDR r7,[sp,#0x114] @Loads wd
+ LDR r8,[sp,#0x100] @Loads *pu1_src
+ SUB r7,r7,#2 @(wd - 2)
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRH r8,[r7] @load the value and increment by src_strd
+ STRH r8,[r5],#2 @store it in the stack pointer
+ ADD r7,r7,r1
+ SUBS r4,r4,#1 @decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+ VMOV.I8 Q9,#0
+ ADD r8,r0,r1 @*pu1_src + src_strd
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+
+ ADD r8,r8,#16
+ LDRH r5,[r8] @pu1_src_cpy[src_strd + 16]
+ VMOV.16 D18[0],r5 @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ VEXT.8 Q9,Q8,Q9,#2 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+ CMP r7,r12
+ BLT SIGN_UP_CHANGE_RESIDUE
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB r8,[r0] @pu1_src_cpy[0]
+ SUB r5,r12,r7 @ht_tmp - row
+ LSL r5,r5,#1 @(ht_tmp - row) * 2
+    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
+ LDRB r5,[r9,#-2] @load the value
+ SUB r8,r8,r5 @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+ CMP r8,#0
+ MVNLT r8,#0
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+ VMOV.8 d14[0],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
+    LDRB        r5,[r9,#-1]                 @load the value
+    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+    CMP         r8,#0
+    MVNLT       r8,#0
+    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ VMOV.8 d14[1],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ VCGT.U8 Q11,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q12,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q12,Q11 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VLD1.8 D22,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VTBL.8 D26,{D22},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D22},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q7,Q7,Q7,#14 @sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ VUZP.8 D26,D27
+ VTBL.8 D24,{D6},D26
+ VTBL.8 D25,{D7},D27
+ VZIP.8 D24,D25
+
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D28},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP
+
+ LDR r8,[sp,#0x118] @Loads ht
+ LDR r11,[sp,#0x104] @Loads *pu1_src_left
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#2
+ STR r7,[r11],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ LDR r8,[sp,#0x118] @Loads ht
+
+ LDR r0,[sp,#0x100] @Loads *pu1_src
+ SUB r8,r8,#1 @ht - 1
+
+ LDR r7,[sp,#0x114] @Loads wd
+
+ LDRH r9,[sp,#6]
+ MLA r6,r8,r1,r7 @wd - 2 + (ht - 1) * src_strd
+
+    STRH        r9,[r0]                     @pu1_src_org[0], pu1_src_org[1] = u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v
+ ADD r6,r0,r6 @pu1_src[wd - 2 + (ht - 1) * src_strd]
+
+ LDRH r9,[sp,#8]
+ ADD r12,sp,#10
+    STRH        r9,[r6,#-2]                 @pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v
+
+ LDR r4,[sp,#0xFC] @Loads pu1_src_top_left
+ LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer
+ STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
+ LDR r3,[sp,#0x10C] @Loads pu1_src_top
+
+SRC_TOP_LOOP:
+ VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS r7,r7,#8 @Decrement the width
+ VST1.8 D0,[r3]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0xD4
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s
new file mode 100644
index 0000000..268d4d8
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class3.s
@@ -0,0 +1,854 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@*  ihevc_sao_edge_offset_class3.s
+@*
+@* @brief
+@*  Contains function definitions for SAO edge offset, class 3 (45-degree
+@*  diagonal). Functions are coded using NEON intrinsics and can be compiled
+@*  using ARM RVCT.
+@*
+@* @author
+@*  Parthiban V
+@*
+@* @par List of Functions:
+@*  - ihevc_sao_edge_offset_class3_a9q()
+@*
+@* @remarks
+@*  None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset,
+@ WORD32 wd,
+@ WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => *pu1_avail
+@r6 => *pi1_sao_offset
+@r7 => wd
+@r8=> ht
+
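+@ For reference, a scalar sketch (not part of the original source) of the
+@ per-sample rule this routine vectorizes. Class 3 is the 45-degree diagonal,
+@ comparing each sample with its top-right and bottom-left neighbours (hence
+@ the pu1_src_top_right / pu1_src_bot_left arguments for the corner cases):
+@
+@   for(row = 0; row < ht; row++)
+@       for(col = 0; col < wd; col++)
+@       {
+@           WORD32 sign_up   = SIGN(src[row][col] - src[row - 1][col + 1]);
+@           WORD32 sign_down = SIGN(src[row][col] - src[row + 1][col - 1]);
+@           WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
+@           if(0 != edge_idx)
+@               src[row][col] = CLIP3(src[row][col] + pi1_sao_offset[edge_idx],
+@                                     0, (1 << bit_depth) - 1);
+@       }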
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+ihevc_sao_edge_offset_class3_a9q:
+
+
+ STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+ LDR r7,[sp,#0x3C] @Loads wd
+
+ LDR r8,[sp,#0x40] @Loads ht
+ SUB r9,r7,#1 @wd - 1
+
+ LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDRB r10,[r3,r9] @pu1_src_top[wd - 1]
+
+ MOV r9,r7 @Move width to r9 for loop count
+
+ LDR r5,[sp,#0x34] @Loads pu1_avail
+ LDR r6,[sp,#0x38] @Loads pi1_sao_offset
+ STR r3,[sp,#0x38] @Store pu1_src_top in sp
+
+ SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values
+
+ STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1]
+ SUB r10,r8,#1 @ht-1
+ MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
+ ADD r12,sp,#0x02 @temp array
+
+AU1_SRC_TOP_LOOP:
+ VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
+ SUBS r9,r9,#8 @Decrement the loop count by 8
+ VST1.8 D0,[r12]! @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP:
+ LDRB r9,[r5,#5] @pu1_avail[5]
+ CMP r9,#0
+ SUB r10,r7,#1 @[wd - 1]
+ LDRB r9,[r0,r10] @u1_pos_0_0_tmp = pu1_src[wd - 1]
+ BEQ PU1_AVAIL_6_LOOP
+
+ LDR r11,[sp,#0xC0] @Load pu1_src_top_right from sp
+ SUB r10,r10,#1 @[wd - 1 - 1]
+
+ LDRB r11,[r11] @pu1_src_top_right[0]
+ SUB r12,r9,r11 @pu1_src[wd - 1] - pu1_src_top_right[0]
+
+ ADD r11,r0,r1 @pu1_src + src_strd
+
+ LDRB r14,[r11,r10] @pu1_src[wd - 1 - 1 + src_strd]
+ CMP r12,#0
+ MVNLT r12,#0
+ SUB r11,r9,r14 @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
+
+ MOVGT r12,#1 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+ LDR r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+ add r14,r14,pc
+ ADD r11,r12,r11 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+ ADD r11,r11,#2 @edge_idx
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0 @0 != edge_idx
+ BEQ PU1_AVAIL_6_LOOP
+ LDRSB r10,[r6,r12] @pi1_sao_offset[edge_idx]
+ ADD r9,r9,r10 @pu1_src[0] + pi1_sao_offset[edge_idx]
+ USAT r9,#8,r9 @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP:
+ LDRB r10,[r5,#6] @pu1_avail[6]
+ SUB r11,r8,#1 @ht - 1
+
+ CMP r10,#0
+ STR r0,[sp,#0xC0] @Store pu1_src in sp
+ MLA r12,r11,r1,r0 @pu1_src[(ht - 1) * src_strd]
+
+ LDRB r10,[r12] @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
+ BEQ PU1_AVAIL_3_LOOP
+
+ LDR r14,[sp,#0xC4] @Load pu1_src_bot_left from sp
+ SUB r11,r12,r1 @pu1_src[(ht - 1) * src_strd) - src_strd]
+
+ LDRB r14,[r14] @Load pu1_src_bot_left[0]
+ ADD r11,r11,#1 @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+
+ LDRB r11,[r11] @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+ SUB r14,r10,r14 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+
+ SUB r11,r10,r11 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
+
+ CMP r14,#0
+ MVNLT r14,#0
+ MOVGT r14,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+ ADD r11,r11,r14 @Add 2 sign value
+
+ LDR r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+ add r14,r14,pc
+ ADD r11,r11,#2 @edge_idx
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0
+ BEQ PU1_AVAIL_3_LOOP
+ LDRSB r11,[r6,r12] @pi1_sao_offset[edge_idx]
+ ADD r10,r10,r11 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ USAT r10,#8,r10 @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+ STR r2,[sp,#0xC4] @Store pu1_src_left in sp
+ MOV r12,r8 @Move ht
+
+ MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
+ VMOV.I8 Q0,#2 @const_2 = vdupq_n_s8(2)
+ LDRB r11,[r5,#3] @pu1_avail[3]
+
+ CMP r11,#0
+ VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
+ SUBEQ r12,r12,#1 @ht_tmp--
+
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ CMP r5,#0
+
+ ADDEQ r0,r0,r1 @pu1_src += src_strd
+ VLD1.8 D7,[r6] @offset_tbl = vld1_s8(pi1_sao_offset)
+ SUBEQ r12,r12,#1 @ht_tmp--
+
+ LDR r6, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+ add r6,r6,pc
+ VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
+ ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1
+
+ STR r0,[sp,#0x90] @Store pu1_src in sp
+ VLD1.8 D6,[r6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOV r6,r7 @move wd to r6 loop_count
+
+ CMP r7,#16 @Compare wd with 16
+ BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
+ CMP r8,#4 @Compare ht with 4
+ BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ LDR r7,[sp,#0xD0] @Loads wd
+
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ CMP r6,r7 @col == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+ MOVNE r8,#-1
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP r6,#16 @if(col == 16)
+ BNE SKIP_AU1_MASK_VAL
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 d9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+ LDRB r8,[r5,#2] @pu1_avail[2]
+ CMP r8,#0
+
+ LDR r4,[sp,#0xD4] @Loads ht
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+
+ MOVNE r8,r3
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+
+ LDR r7,[sp,#0xD0] @Loads wd
+ ADD r8,r8,#1 @pu1_src - src_strd + 1
+
+ SUB r7,r7,r6 @(wd - col)
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ SUB r8,#8
+ ADD r3,r3,#16
+
+ LDR r8,[sp,#0xC0] @Loads *pu1_src
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ ADD r7,r7,#15 @15 + (wd - col)
+
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ SUB r5,r5,#1
+
+AU1_SRC_LEFT_LOOP:
+ LDRB r8,[r7],r1 @load the value and increment by src_strd
+ SUBS r4,r4,#1 @decrement the loop count
+ STRB r8,[r5,#1]! @store it in the stack pointer
+ BNE AU1_SRC_LEFT_LOOP
+
+ VMOV.I8 Q9,#0
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ ADD r8,r0,r1 @I *pu1_src + src_strd
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+ SUB r5,r12,r7 @I ht_tmp - row
+ VLD1.8 D16,[r8]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ ADD r8,r14,r5 @I pu1_src_left_cpy[ht_tmp - row]
+
+ ADD r8,r8,#1 @I pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB r8,[r8]
+
+ LDR r5,[sp,#0xC8] @I Loads pu1_avail
+ VMOV.8 D19[7],r8 @I vsetq_lane_u8
+ LDRB r5,[r5,#2] @I pu1_avail[2]
+
+ VEXT.8 Q9,Q9,Q8,#15 @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+ CMP r5,#0 @I
+ BNE SIGN_UP_CHANGE_DONE @I
+
+SIGN_UP_CHANGE:
+ LDRB r8,[r0,#15] @I pu1_src_cpy[15]
+ SUB r5,r0,r1 @I pu1_src_cpy[16 - src_strd]
+
+ LDRB r5,[r5,#16] @I load the value
+ SUB r8,r8,r5 @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ CMP r8,#0 @I
+ MVNLT r8,#0 @I
+ MOVGT r8,#1 @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ VMOV.8 D15[7],r8 @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE:
+ VCGT.U8 Q5,Q6,Q9 @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q9,Q6,Q9 @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q5,Q9,Q5 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q9,Q0,Q7 @I edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q9,Q9,Q5 @I edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D18,{D6},D18 @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q5 @I sign_up = vnegq_s8(sign_down)
+
+ VEXT.8 Q7,Q7,Q7,#1 @I sign_up = vextq_s8(sign_up, sign_up, 1)
+ VTBL.8 D19,{D6},D19 @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VMOVL.U8 Q10,D12 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q9,Q9,Q4 @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VTBL.8 D10,{D7},D18 @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ VMOVL.U8 Q11,D13 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q10,Q10,D10 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q10,Q10,Q1 @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VTBL.8 D11,{D7},D19 @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMIN.U16 Q10,Q10,Q2 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOV Q6,Q8
+ VADDW.S8 Q11,Q11,D11 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q11,Q11,Q1 @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q11,Q11,Q2 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ SUB r7,r7,#1 @I Decrement the ht_tmp loop count by 1
+
+PU1_SRC_LOOP:
+ ADD r8,r0,r1,LSL #1 @II *pu1_src + src_strd
+ VMOVN.I16 D20,Q10 @I vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUB r5,r12,r7 @II ht_tmp - row
+
+ ADD r4,r0,r1 @II pu1_src_cpy[16 - src_strd]
+ VMOVN.I16 D21,Q11 @I vmovn_s16(pi2_tmp_cur_row.val[1])
+ ADD r2,r8,r1 @III *pu1_src + src_strd
+
+ LDRB r11,[r4,#15] @II pu1_src_cpy[15]
+ VLD1.8 D16,[r8]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ SUB r7,r7,#1 @II Decrement the ht_tmp loop count by 1
+
+ ADD r8,r14,r5 @II pu1_src_left_cpy[ht_tmp - row]
+ VLD1.8 D30,[r2]! @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r2] @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r2,#8
+ LDRB r8,[r8,#1]
+
+ LDRB r4,[r0,#16] @II load the value
+ VMOV.8 D19[7],r8 @II vsetq_lane_u8
+ SUB r11,r11,r4 @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+ CMP r11,#0 @II
+ VST1.8 {Q10},[r0],r1 @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ SUB r5,r12,r7 @III ht_tmp - row
+
+ MVNLT r11,#0 @II
+ VEXT.8 Q9,Q9,Q8,#15 @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+ MOVGT r11,#1 @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+ ADD r8,r14,r5 @III pu1_src_left_cpy[ht_tmp - row]
+ VMOV.8 D15[7],r11 @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ CMP r7,#1 @III
+
+ BNE NEXT_ROW_ELSE_2 @III
+ LDR r5,[sp,#0xC8] @III Loads pu1_avail
+ LDRB r5,[r5,#3] @III pu1_avail[3]
+ CMP r5,#0 @III
+ SUBNE r8,r2,#2 @III pu1_src_cpy[src_strd - 1]
+
+NEXT_ROW_ELSE_2:
+ LDRB r8,[r8,#1] @III
+ VCGT.U8 Q12,Q6,Q9 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD r5,r0,r1
+
+ LDRB r2,[r5,#15] @III pu1_src_cpy[15]
+ VCLT.U8 Q13,Q6,Q9 @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ LDRB r5,[r0,#16] @III load the value
+
+ SUB r2,r2,r5 @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ VSUB.U8 Q12,Q13,Q12 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ CMP r2,#0 @III
+
+ MVNLT r2,#0 @III
+ VMOV.8 D19[7],r8 @III vsetq_lane_u8
+ MOVGT r2,#1 @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+ SUB r7,r7,#1 @III Decrement the ht_tmp loop count by 1
+ VADD.I8 Q13,Q0,Q7 @II edge_idx = vaddq_s8(const_2, sign_up)
+
+ VNEG.S8 Q7,Q12 @II sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q9,Q9,Q15,#15 @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+ VADD.I8 Q13,Q13,Q12 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VEXT.8 Q7,Q7,Q7,#1 @II sign_up = vextq_s8(sign_up, sign_up, 1)
+ VTBL.8 D26,{D6},D26 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VCGT.U8 Q5,Q8,Q9 @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VMOV.8 D15[7],r2 @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ VTBL.8 D27,{D6},D27 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VCLT.U8 Q9,Q8,Q9 @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VMOVL.U8 Q14,D12 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q13,Q13,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VSUB.U8 Q5,Q9,Q5 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VTBL.8 D24,{D7},D26 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VADD.I8 Q9,Q0,Q7 @III edge_idx = vaddq_s8(const_2, sign_up)
+
+ VADD.I8 Q9,Q9,Q5 @III edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D25,{D7},D27 @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VNEG.S8 Q7,Q5 @III sign_up = vnegq_s8(sign_down)
+
+ VADDW.S8 Q14,Q14,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D18,{D6},D18 @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q14,Q14,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VEXT.8 Q7,Q7,Q7,#1 @III sign_up = vextq_s8(sign_up, sign_up, 1)
+ VTBL.8 D19,{D6},D19 @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VMIN.U16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVL.U8 Q13,D13 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VAND Q9,Q9,Q4 @III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VADDW.S8 Q13,Q13,D25 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VTBL.8 D10,{D7},D18 @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMOVL.U8 Q10,D16 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VADDW.S8 Q10,Q10,D10 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D11,{D7},D19 @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMAX.S16 Q10,Q10,Q1 @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMOVL.U8 Q11,D17 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VMIN.U16 Q10,Q10,Q2 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+ VADDW.S8 Q11,Q11,D11 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMOVN.I16 D29,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+ VMAX.S16 Q11,Q11,Q1 @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMOV Q6,Q15 @II pu1_cur_row = pu1_next_row
+ VMIN.U16 Q11,Q11,Q2 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ CMP r7,#1 @III
+ VST1.8 {Q14},[r0],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ BGT PU1_SRC_LOOP @If row count is still greater than 1 jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+ ADD r8,r0,r1,LSL #1 @*pu1_src + 2 * src_strd
+ VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+
+ LDRB r5,[r5,#3] @pu1_avail[3]
+ VMOVN.I16 D21,Q11 @III vmovn_s16(pi2_tmp_cur_row.val[1])
+ CMP r5,#0
+
+ ADD r4,r0,r1 @pu1_src_cpy[16 - src_strd]
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ LDRB r5,[r0,#16] @load the value
+
+ BEQ NEXT_ROW_ELSE_3
+ LDRB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
+ B NEXT_ROW_POINTER_ASSIGNED_3
+NEXT_ROW_ELSE_3:
+ SUB r11,r12,r7 @ht_tmp - row
+ ADD r8,r14,r11 @pu1_src_left_cpy[ht_tmp - row]
+ ADD r8,r8,#1 @pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB r8,[r8]
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+ LDRB r11,[r4,#15] @pu1_src_cpy[15]
+ VMOV.8 D19[7],r8 @vsetq_lane_u8
+ SUB r8,r11,r5 @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+ CMP r8,#0
+ VEXT.8 Q9,Q9,Q8,#15 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+ MVNLT r8,#0
+
+ VST1.8 {Q10},[r0],r1 @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VCGT.U8 Q12,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ VCLT.U8 Q13,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VMOV.8 D15[7],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ VSUB.U8 Q12,Q13,Q12 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VMOVL.U8 Q10,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+
+ VMOVL.U8 Q11,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VTBL.8 D26,{D6},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D6},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VTBL.8 D24,{D7},D26 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ VADDW.S8 Q10,Q10,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D25,{D7},D27 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q11,Q11,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q11,Q11,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q11,Q11,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+INNER_LOOP_DONE:
+ VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ LDR r8,[sp,#0xD4] @Loads ht
+
+ VMOVN.I16 D21,Q11 @vmovn_s16(pi2_tmp_cur_row.val[1])
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+
+ VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+SRC_LEFT_LOOP:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#4
+ STR r7,[r2],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ CMP r6,#8 @Check whether residue remains
+ BLT RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r0,[sp,#0x90] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
+ BGT WIDTH_LOOP_16 @If more than 8 pixels remain jump to WIDTH_LOOP_16
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+
+
+
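+@ The path below repeats the same edge-offset kernel for the case the
+@ earlier dispatch sends here (blocks at least 16 pixels wide but only a
+@ few rows high, as the WD_16_HT_4 label suggests); only the row
+@ bookkeeping differs from the main WIDTH_LOOP_16 path.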
+WD_16_HT_4_LOOP:
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDR r7,[sp,#0xD0] @Loads wd
+ CMP r6,r7 @col == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+ MOVNE r8,#-1
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP r6,#16 @if(col == 16)
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 d9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB r8,[r5,#2] @pu1_avail[2]
+ CMP r8,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3
+ ADD r8,r8,#1 @pu1_src - src_strd + 1
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ SUB r8,#8
+
+ ADD r3,r3,#16
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r4,[sp,#0xD4] @Loads ht
+ LDR r7,[sp,#0xD0] @Loads wd
+ SUB r7,r7,r6 @(wd - col)
+ ADD r7,r7,#15 @15 + (wd - col)
+ LDR r8,[sp,#0xC0] @Loads *pu1_src
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)]
+ SUB r5,r5,#1
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRB r8,[r7],r1 @load the value and increment by src_strd
+ STRB r8,[r5,#1]! @store it in the temp array on the stack
+ SUBS r4,r4,#1 @decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ VMOV.I8 Q9,#0
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ ADD r8,r0,r1 @*pu1_src + src_strd
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDRB r5,[r5,#3] @pu1_avail[3]
+ CMP r5,#0
+ BEQ NEXT_ROW_ELSE_WD_16_HT_4
+ CMP r7,#1
+ LDREQB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
+ BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+NEXT_ROW_ELSE_WD_16_HT_4:
+ SUB r5,r12,r7 @ht_tmp - row
+ ADD r8,r14,r5 @pu1_src_left_cpy[ht_tmp - row]
+ ADD r8,r8,#1 @pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB r8,[r8]
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+ VMOV.8 D19[7],r8 @vsetq_lane_u8
+ VEXT.8 Q9,Q9,Q8,#15 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+ CMP r7,r12
+ BNE SIGN_UP_CHANGE_WD_16_HT_4
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB r8,[r0,#15] @pu1_src_cpy[15]
+ ADD r5,r0,#16 @pu1_src_cpy[16]
+ SUB r5,r5,r1 @pu1_src_cpy[16 - src_strd]
+ LDRB r5,[r5] @load the value
+ SUB r8,r8,r5 @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ CMP r8,#0
+ MVNLT r8,#0
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ VMOV.8 D15[7],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ VCGT.U8 Q10,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q11,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q11,Q10 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D26,{D6},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D6},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q7,Q7,Q7,#1 @sign_up = vextq_s8(sign_up, sign_up, 1)
+
+ VTBL.8 D24,{D7},D26 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VTBL.8 D25,{D7},D27 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ VMOVL.U8 Q15,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q15,Q15,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q15,Q15,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q15,Q15,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D29,Q15 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ LDR r8,[sp,#0xD4] @Loads ht
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ STR r7,[r2],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ SUBS r8,r8,#4
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+ BGT WD_16_HT_4_LOOP @If more width remains jump to WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ CMP r6,r7 @wd_residue == wd
+ LDREQB r8,[r5] @pu1_avail[0]
+
+ MOVNE r8,#-1
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 d8[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+PU1_AVAIL_2_RESIDUE:
+ LDRB r8,[r5,#2] @pu1_avail[2]
+ CMP r8,#0
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ MOVNE r8,r3
+ ADD r8,r8,#1 @pu1_src - src_strd + 1
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ SUB r8,#8
+
+
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+ LDR r4,[sp,#0xD4] @Loads ht
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r8,[sp,#0xC0] @Loads *pu1_src
+ SUB r7,r7,#1 @(wd - 1)
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 1)]
+ SUB r5,r5,#1
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRB r8,[r7],r1 @load the value and increment by src_strd
+ STRB r8,[r5,#1]! @store it in the temp array on the stack
+ SUBS r4,r4,#1 @decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+ VMOV.I8 Q9,#0
+ ADD r8,r0,r1 @*pu1_src + src_strd
+ VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r8,#8
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDRB r5,[r5,#3] @pu1_avail[3]
+ CMP r5,#0
+ BEQ NEXT_ROW_ELSE_RESIDUE
+ CMP r7,#1
+ LDREQB r8,[r8,#-1] @pu1_src_cpy[src_strd - 1]
+ BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+NEXT_ROW_ELSE_RESIDUE:
+ SUB r5,r12,r7 @ht_tmp - row
+ ADD r8,r14,r5 @pu1_src_left_cpy[ht_tmp - row]
+ ADD r8,r8,#1 @pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB r8,[r8]
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+ VMOV.8 D19[7],r8 @vsetq_lane_u8
+ VEXT.8 Q9,Q9,Q8,#15 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+ CMP r7,r12
+ BNE SIGN_UP_CHANGE_RESIDUE
+ LDR r5,[sp,#0xC8] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB r8,[r0,#15] @pu1_src_cpy[15]
+ ADD r5,r0,#16 @pu1_src_cpy[16]
+ SUB r5,r5,r1 @pu1_src_cpy[16 - src_strd]
+ LDRB r5,[r5] @load the value
+ SUB r8,r8,r5 @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ CMP r8,#0
+ MVNLT r8,#0
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ VMOV.8 D15[7],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ VCGT.U8 Q10,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VCLT.U8 Q11,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q11,Q10 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D26,{D6},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D27,{D6},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VEXT.8 Q7,Q7,Q7,#1 @sign_up = vextq_s8(sign_up, sign_up, 1)
+
+ VTBL.8 D24,{D7},D26 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOVN.I16 D30,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D30},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ SUBS r7,r7,#1
+ BNE PU1_SRC_LOOP_RESIDUE
+
+ LDR r8,[sp,#0xD4] @Loads ht
+ LDR r2,[sp,#0xC4] @Loads *pu1_src_left
+ ADD r5,sp,#0x42 @*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#4
+ STR r7,[r2],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ LDR r7,[sp,#0xD0] @Loads wd
+ LDR r0,[sp,#0xC0] @Loads *pu1_src
+
+ LDR r11,[sp,#0xD4] @Loads ht
+ ADD r8,r0,r7 @pu1_src[wd]
+
+ LDR r4,[sp,#0xBC] @Loads pu1_src_top_left
+ SUB r11,r11,#1 @ht - 1
+
+ STRB r9,[r8,#-1] @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
+ MLA r6,r11,r1,r0 @pu1_src_org[(ht - 1) * src_strd]
+
+ LDRB r8,[sp] @load u1_src_top_left_tmp from stack pointer
+ ADD r12,sp,#0x02
+
+ STRB r10,[r6] @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+ STRB r8,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
+ LDR r3,[sp,#0xCC] @Loads pu1_src_top
+
+SRC_TOP_LOOP:
+ VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS r7,r7,#8 @Decrement the width
+ VST1.8 D0,[r3]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0x94
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
new file mode 100644
index 0000000..2ecabe9
--- /dev/null
+++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s
@@ -0,0 +1,1052 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_sao_edge_offset_class3_chroma.s
+@*
+@* @brief
+@* Contains function definitions for the class 3 SAO edge offset applied
+@* to interleaved chroma. Functions are coded using NEON intrinsics and
+@* can be compiled using ARM RVCT
+@*
+@* @author
+@* Parthiban V
+@*
+@* @par List of Functions:
+@*
+@*
+@* @remarks
+@* None
+@*
+@*******************************************************************************
+@*/
+@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
+@ WORD32 src_strd,
+@ UWORD8 *pu1_src_left,
+@ UWORD8 *pu1_src_top,
+@ UWORD8 *pu1_src_top_left,
+@ UWORD8 *pu1_src_top_right,
+@ UWORD8 *pu1_src_bot_left,
+@ UWORD8 *pu1_avail,
+@ WORD8 *pi1_sao_offset_u,
+@ WORD8 *pi1_sao_offset_v,
+@ WORD32 wd,
+@ WORD32 ht)
+@**************Variables Vs Registers*****************************************
+@r0 => *pu1_src
+@r1 => src_strd
+@r2 => *pu1_src_left
+@r3 => *pu1_src_top
+@r4 => *pu1_src_top_left
+@r5 => *pu1_avail
+@r6 => *pi1_sao_offset_u
+@r9 => *pi1_sao_offset_v
+@r7 => wd
+@r8 => ht
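+
+@ Reference sketch of the per-pixel operation this routine vectorises
+@ (illustrative C assuming 8-bit depth; none of it appears in this source).
+@ Class 3 uses the 135-degree diagonal neighbours, and chroma is stored
+@ interleaved (Cb,Cr), so the diagonal step is 2 bytes and U and V take
+@ their offsets from separate tables:
+@
+@     for (row = 0; row < ht; row++)
+@         for (col = 0; col < wd; col++) {
+@             WORD32 c = pu1_src[row * src_strd + col];
+@             WORD32 up = pu1_src[(row - 1) * src_strd + col + 2];
+@             WORD32 dn = pu1_src[(row + 1) * src_strd + col - 2];
+@             WORD32 e = gi1_table_edge_idx[2 + SIGN(c - up) + SIGN(c - dn)];
+@             WORD8 *off = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
+@             pu1_src[row * src_strd + col] = CLIP3(c + off[e], 0, 255);
+@         }
+@
+@ Border rows and columns use pu1_src_left/top/top_left/top_right/bot_left
+@ together with the pu1_avail flags instead, which is what most of the
+@ code below deals with.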
+
+.text
+.p2align 2
+
+.extern gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_chroma_a9q
+
+gi1_table_edge_idx_addr_1:
+.long gi1_table_edge_idx - ulbl1 - 8
+
+gi1_table_edge_idx_addr_2:
+.long gi1_table_edge_idx - ulbl2 - 8
+
+gi1_table_edge_idx_addr_3:
+.long gi1_table_edge_idx - ulbl3 - 8
+
+gi1_table_edge_idx_addr_4:
+.long gi1_table_edge_idx - ulbl4 - 8
+
+gi1_table_edge_idx_addr_5:
+.long gi1_table_edge_idx - ulbl5 - 8
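+
+@ Each literal above stores gi1_table_edge_idx as an offset relative to a
+@ load site (ulbl1..ulbl5), keeping the code position independent; the
+@ "- 8" accounts for the ARM pipeline, where pc reads as the current
+@ instruction address + 8. The paired "add rX, rX, pc" after each ulblN
+@ label therefore rebuilds the absolute address:
+@
+@     /* illustrative: rX = (sym - ulblN - 8) + (ulblN + 8) = sym */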
+
+ihevc_sao_edge_offset_class3_chroma_a9q:
+
+
+ STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments
+
+ LDR r7,[sp,#0x40] @Loads wd
+ LDR r8,[sp,#0x44] @Loads ht
+ SUB r9,r7,#2 @wd - 2
+
+ LDR r4,[sp,#0x28] @Loads pu1_src_top_left
+ LDRH r10,[r3,r9] @pu1_src_top[wd - 2]
+
+ MOV r9,r7 @Move width to r9 for loop count
+
+ LDR r5,[sp,#0x34] @Loads pu1_avail
+ LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u
+
+ STR r3,[sp,#0x38] @Store pu1_src_top in sp
+ SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp array values
+
+ STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2]
+ SUB r10,r8,#1 @ht-1
+ MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col]
+ ADD r12,sp,#10 @temp array
+
+AU1_SRC_TOP_LOOP:
+ VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col]
+ SUBS r9,r9,#8 @Decrement the loop count by 8
+ VST1.8 D0,[r12]! @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP_U:
+ LDRB r9,[r5,#5] @pu1_avail[5]
+ CMP r9,#0
+ SUB r14,r7,#2 @[wd - 2]
+ LDRB r9,[r0,r14] @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
+ SUB r11,r7,#1 @[wd - 1]
+ LDRB r10,[r0,r11] @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
+ BEQ PU1_AVAIL_6_LOOP_U
+
+ LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp
+ LDRB r11,[r11] @pu1_src_top_right[0]
+ SUB r12,r9,r11 @pu1_src[wd - 2] - pu1_src_top_right[0]
+ CMP r12,#0
+ MVNLT r12,#0
+ MOVGT r12,#1 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
+ ADD r11,r0,r1 @pu1_src + src_strd
+ SUB r14,r14,#2 @[wd - 2 - 2]
+ LDRB r14,[r11,r14] @pu1_src[wd - 2 - 2 + src_strd]
+ SUB r11,r9,r14 @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+ ADD r11,r12,r11 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_1 @table pointer
+ulbl1:
+ add r14,r14,pc
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0 @0 != edge_idx
+ BEQ PU1_AVAIL_5_LOOP_V
+ LDRSB r11,[r6,r12] @pi1_sao_offset_u[edge_idx]
+ ADD r9,r9,r11 @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
+ USAT r9,#8,r9 @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
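+@ (USAT #8 saturates the signed sum into [0, 255], which implements the
+@ CLIP3 above for the assumed 8-bit depth.)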
+
+PU1_AVAIL_5_LOOP_V:
+
+ LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp
+ LDRB r11,[r11,#1] @pu1_src_top_right[1]
+ SUB r12,r10,r11 @pu1_src[wd - 1] - pu1_src_top_right[1]
+ CMP r12,#0
+ MVNLT r12,#0
+ MOVGT r12,#1 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
+ ADD r11,r0,r1 @pu1_src + src_strd
+ SUB r14,r7,#3 @[wd - 1 - 2]
+ LDRB r14,[r11,r14] @pu1_src[wd - 1 - 2 + src_strd]
+ SUB r11,r10,r14 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+ ADD r11,r12,r11 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_2 @table pointer
+ulbl2:
+ add r14,r14,pc
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0 @0 != edge_idx
+ BEQ PU1_AVAIL_6_LOOP_U
+ LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx]
+ ADD r10,r10,r11 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
+ USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_U:
+ STRB r9,[sp,#6]
+ STRB r10,[sp,#7]
+ STR r0,[sp,#0x100] @Store pu1_src in sp
+
+ LDRB r10,[r5,#6] @pu1_avail[6]
+ CMP r10,#0
+ SUB r11,r8,#1 @ht - 1
+ MLA r12,r11,r1,r0 @pu1_src[(ht - 1) * src_strd]
+ LDRB r10,[r12] @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
+ LDRB r9,[r12,#1] @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
+ BEQ PU1_AVAIL_3_LOOP
+
+ SUB r11,r12,r1 @pu1_src[(ht - 1) * src_strd - src_strd]
+ ADD r11,r11,#2 @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ LDRB r11,[r11] @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ SUB r11,r10,r11 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])
+
+ LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp
+ LDRB r14,[r14] @Load pu1_src_bot_left[0]
+ SUB r14,r10,r14 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+ CMP r14,#0
+ MVNLT r14,#0
+ MOVGT r14,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+ ADD r11,r11,r14 @Add 2 sign value
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_3 @table pointer
+ulbl3:
+ add r14,r14,pc
+
+ LDRSB r14,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r14,#0
+ BEQ PU1_AVAIL_6_LOOP_V
+ LDRSB r11,[r6,r14] @pi1_sao_offset_u[edge_idx]
+ ADD r10,r10,r11 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ USAT r10,#8,r10 @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_V:
+ ADD r12,r12,#1 @pu1_src[(ht - 1) * src_strd + 1]
+ SUB r11,r12,r1 @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+ ADD r11,r11,#2 @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ LDRB r11,[r11] @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ SUB r11,r9,r11 @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
+ CMP r11,#0
+ MVNLT r11,#0
+ MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
+
+ LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp
+ LDRB r14,[r14,#1] @Load pu1_src_bot_left[1]
+ SUB r14,r9,r14 @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
+ CMP r14,#0
+ MVNLT r14,#0
+ MOVGT r14,#1 @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
+
+ ADD r11,r11,r14 @Add 2 sign value
+ ADD r11,r11,#2 @edge_idx
+ LDR r14, gi1_table_edge_idx_addr_4 @table pointer
+ulbl4:
+ add r14,r14,pc
+
+ LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP r12,#0
+ BEQ PU1_AVAIL_3_LOOP
+ LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v
+ LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx]
+ ADD r9,r9,r11 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+ STRB r10,[sp,#8]
+ STRB r9,[sp,#9]
+ STR r2,[sp,#0x104] @Store pu1_src_left in sp
+
+ MOV r12,r8 @Move ht
+ MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy
+ LDRB r11,[r5,#3] @pu1_avail[3]
+ CMP r11,#0
+ BNE PU1_AVAIL_2_LOOP
+ SUB r12,r12,#1 @ht_tmp--
+
+PU1_AVAIL_2_LOOP:
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE PU1_AVAIL_2_LOOP_END
+
+ ADD r0,r0,r1 @pu1_src += src_strd
+ SUB r12,r12,#1 @ht_tmp--
+ ADD r14,r14,#2 @pu1_src_left_cpy += 2
+
+PU1_AVAIL_2_LOOP_END:
+ STR r0,[sp,#2] @Store pu1_src in sp
+ VMOV.I8 Q0,#2 @const_2 = vdupq_n_s8(2)
+ VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0)
+ VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+ LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v
+ VLD1.8 D7,[r6] @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+ LDR r2, gi1_table_edge_idx_addr_5 @table pointer
+ulbl5:
+ add r2,r2,pc
+ @VLD1.8 D6,[r6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1)
+ MOV r6,r7 @move wd to r6 loop_count
+
+ CMP r7,#16 @Compare wd with 16
+ BLT WIDTH_RESIDUE @If wd < 16 jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+ CMP r8,#4 @Compare ht with 4
+ BLE WD_16_HT_4_LOOP @If ht <= 4 jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ LDR r7,[sp,#0x114] @Loads wd
+ CMP r6,r7 @col == wd
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+
+ LDREQB r8,[r5] @pu1_avail[0]
+ MOVNE r8,#-1
+
+ VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ LDRB r11,[r5,#2] @pu1_avail[2]
+
+ CMP r6,#16 @if(col == 16)
+ VMOV.8 D8[1],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ BNE SKIP_AU1_MASK_VAL
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 D9[6],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ VMOV.8 D9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+ CMP r11,#0
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+ VMOV.I8 Q9,#0
+ MOVNE r8,r3
+
+ ADD r8,r8,#2 @pu1_src - src_strd + 2
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ SUB r8,#8
+ ADD r3,r3,#16
+
+ LDR r4,[sp,#0x118] @Loads ht
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LDR r7,[sp,#0x114] @Loads wd
+
+ SUB r7,r7,r6 @(wd - col)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD r7,r7,#14 @14 + (wd - col)
+
+ LDR r8,[sp,#0x100] @Loads *pu1_src
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP:
+ LDRH r8,[r7] @load the value and increment by src_strd
+ SUBS r4,r4,#1 @decrement the loop count
+
+ STRH r8,[r5],#2 @store it in the temp array on the stack
+ ADD r7,r7,r1
+ BNE AU1_SRC_LEFT_LOOP
+
+
+ MOV r7,r12 @row count, move ht_tmp to r7
+ VMOV.I8 Q9,#0 @I
+ ADD r11,r0,r1 @I *pu1_src + src_strd
+
+ SUB r5,r12,r7 @I ht_tmp - row
+ VLD1.8 D16,[r11]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r11] @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r11,#8
+ ADD r8,r14,r5,LSL #1 @I pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ LDRH r5,[r8,#2] @I
+ VMOV.16 D19[3],r5 @I vsetq_lane_u8
+ LDR r11,[sp,#0x108] @I Loads pu1_avail
+
+ LDRB r11,[r11,#2] @I pu1_avail[2]
+ VEXT.8 Q9,Q9,Q8,#14 @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ CMP r11,#0 @I
+ BNE SIGN_UP_CHANGE_DONE @I
+
+ LDRB r8,[r0,#14] @I pu1_src_cpy[14]
+ SUB r5,r0,r1 @I
+
+ LDRB r11,[r5,#16] @I load the value pu1_src_cpy[16 - src_strd]
+
+ LDRB r9,[r0,#15] @I pu1_src_cpy[15]
+ SUB r8,r8,r11 @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ LDRB r10,[r5,#17] @I load the value pu1_src_cpy[17 - src_strd]
+ CMP r8,#0 @I
+
+ MVNLT r8,#0 @I
+ SUB r9,r9,r10 @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MOVGT r8,#1 @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+ CMP r9,#0 @I
+
+ MVNLT r9,#0 @I
+ VMOV.8 D15[6],r8 @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ MOVGT r9,#1 @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+ VMOV.8 D15[7],r9 @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE:
+ VLD1.8 D28,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VCGT.U8 Q10,Q6,Q9 @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VCLT.U8 Q11,Q6,Q9 @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q11,Q11,Q10 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q9,Q0,Q7 @I edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q9,Q9,Q11 @I edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D18,{D28},D18 @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q11 @I sign_up = vnegq_s8(sign_down)
+
+ VTBL.8 D19,{D28},D19 @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#2 @I sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ VMOVL.U8 Q10,D12 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q9,Q9,Q4 @I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VUZP.8 D18,D19 @I
+ VTBL.8 D22,{D6},D18 @I
+ VTBL.8 D23,{D7},D19 @I
+ VZIP.8 D22,D23 @I
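+
+@ U and V share the edge-index vector but take offsets from different
+@ tables (D6 = offset_tbl_u, D7 = offset_tbl_v), so the interleaved
+@ indices are split with VUZP, looked up per plane, and re-interleaved
+@ with VZIP. Intrinsic sketch (illustrative only):
+@
+@     int8x8x2_t uv = vuzp_s8(edge_idx_lo, edge_idx_hi);
+@     int8x8_t o_u = vtbl1_s8(offset_tbl_u, uv.val[0]);
+@     int8x8_t o_v = vtbl1_s8(offset_tbl_v, uv.val[1]);
+@     int8x8x2_t off = vzip_s8(o_u, o_v);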
+
+ VMOVL.U8 Q9,D13 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q10,Q10,D22 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q10,Q10,Q1 @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q10,Q10,Q2 @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOV Q6,Q8 @I pu1_cur_row = pu1_next_row
+ VADDW.S8 Q9,Q9,D23 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SUB r7,r7,#1 @I Decrement the ht_tmp loop count by 1
+ VMAX.S16 Q9,Q9,Q1 @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMIN.U16 Q9,Q9,Q2 @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+PU1_SRC_LOOP:
+ ADD r11,r0,r1,LSL #1 @II *pu1_src + 2 * src_strd
+ VMOVN.I16 D20,Q10 @I vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUB r5,r12,r7 @II ht_tmp - row
+
+ ADD r4,r0,r1 @III *pu1_src + src_strd
+ VMOVN.I16 D21,Q9 @I vmovn_s16(pi2_tmp_cur_row.val[1])
+ ADD r8,r14,r5,LSL #1 @II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ LDRH r9,[r8,#2]
+ VLD1.8 D16,[r11]! @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r11] @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r11,#8
+ LDRB r10,[r4,#14] @II pu1_src_cpy[14]
+
+ LDRB r8,[r4,#15] @II pu1_src_cpy[15]
+ VMOV.16 D29[3],r9 @II vsetq_lane_u8
+ ADD r4,r11,r1 @III *pu1_src + src_strd
+
+ LDRB r5,[r0,#17] @II load the value pu1_src_cpy[17 - src_strd]
+ VLD1.8 D30,[r4]! @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D31,[r4] @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r4,#8
+ LDRB r11,[r0,#16] @II load the value pu1_src_cpy[16 - src_strd]
+
+ SUB r7,r7,#1 @II Decrement the ht_tmp loop count by 1
+ VST1.8 {Q10},[r0],r1 @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ SUB r10,r10,r11 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ CMP r10,#0 @II
+ VEXT.8 Q14,Q14,Q8,#14 @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ SUB r8,r8,r5 @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MVNLT r10,#0 @II
+ VLD1.8 D21,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOVGT r10,#1 @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP r8,#0 @II
+ VMOV.8 D15[6],r10 @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ MVNLT r8,#0 @II
+
+ MOVGT r8,#1 @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+ SUB r10,r12,r7 @III ht_tmp - row
+ VMOV.8 D15[7],r8 @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+ ADD r11,r14,r10,LSL #1 @III pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ CMP r7,#1 @III
+ VCGT.U8 Q11,Q6,Q14 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ BNE NEXT_ROW_POINTER_ASSIGNED_2 @III
+
+ LDR r5,[sp,#0x108] @III Loads pu1_avail
+ LDRB r5,[r5,#3] @III pu1_avail[3]
+ CMP r5,#0 @III
+ SUBNE r11,r4,#4 @III pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_2:
+ LDRH r5,[r11,#2] @III
+ VCLT.U8 Q12,Q6,Q14 @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD r11,r0,r1 @III
+
+ LDRB r9,[r11,#14] @III pu1_src_cpy[14]
+ VMOV.16 D19[3],r5 @III vsetq_lane_u8
+ LDRB r8,[r11,#15] @III pu1_src_cpy[15]
+
+ LDRB r11,[r0,#16] @III load the value pu1_src_cpy[16 - src_strd]
+ VSUB.U8 Q12,Q12,Q11 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ LDRB r10,[r0,#17] @III load the value pu1_src_cpy[17 - src_strd]
+
+ SUB r9,r9,r11 @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+ VEXT.8 Q9,Q9,Q15,#14 @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ SUB r10,r8,r10 @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ CMP r9,#0 @III
+ VADD.I8 Q13,Q0,Q7 @II edge_idx = vaddq_s8(const_2, sign_up)
+ MVNLT r9,#0 @III
+
+ MOVGT r9,#1 @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+ VADD.I8 Q13,Q13,Q12 @II edge_idx = vaddq_s8(edge_idx, sign_down)
+ CMP r10,#0 @III
+
+ VNEG.S8 Q7,Q12 @II sign_up = vnegq_s8(sign_down)
+ VTBL.8 D26,{D21},D26 @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ MVNLT r10,#0 @III
+ MOVGT r10,#1 @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+ VEXT.8 Q7,Q7,Q7,#2 @II sign_up = vextq_s8(sign_up, sign_up, 2)
+ VTBL.8 D27,{D21},D27 @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VCGT.U8 Q11,Q8,Q9 @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VMOV.8 D15[6],r9 @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ VAND Q13,Q13,Q4 @II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VMOV.8 D15[7],r10 @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+ VUZP.8 D26,D27 @II
+
+ VCLT.U8 Q10,Q8,Q9 @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VTBL.8 D24,{D6},D26 @II
+ VSUB.U8 Q11,Q10,Q11 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q9,Q0,Q7 @III edge_idx = vaddq_s8(const_2, sign_up)
+ VTBL.8 D25,{D7},D27 @II
+ VADD.I8 Q9,Q9,Q11 @III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VLD1.8 D20,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VZIP.8 D24,D25 @II
+
+ VMOVL.U8 Q14,D12 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VTBL.8 D18,{D20},D18 @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VNEG.S8 Q7,Q11 @III sign_up = vnegq_s8(sign_down)
+
+ VADDW.S8 Q14,Q14,D24 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VTBL.8 D19,{D20},D19 @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#2 @III sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ VMOVL.U8 Q13,D13 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VAND Q9,Q9,Q4 @III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VMOVL.U8 Q10,D16 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VUZP.8 D18,D19 @III
+
+ VMAX.S16 Q14,Q14,Q1 @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VTBL.8 D22,{D6},D18 @III
+ VMIN.U16 Q14,Q14,Q2 @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q13,Q13,D25 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VTBL.8 D23,{D7},D19 @III
+ VMAX.S16 Q13,Q13,Q1 @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VMOVL.U8 Q9,D17 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VZIP.8 D22,D23 @III
+
+ VMOVN.I16 D28,Q14 @II vmovn_s16(pi2_tmp_cur_row.val[0])
+ VADDW.S8 Q10,Q10,D22 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMOV Q6,Q15 @III pu1_cur_row = pu1_next_row
+ VMIN.U16 Q13,Q13,Q2 @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ SUB r7,r7,#1 @III Decrement the ht_tmp loop count by 1
+ VMAX.S16 Q10,Q10,Q1 @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ CMP r7,#1 @III
+
+ VMOVN.I16 D29,Q13 @II vmovn_s16(pi2_tmp_cur_row.val[1])
+ VMIN.U16 Q10,Q10,Q2 @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q9,Q9,D23 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q9,Q9,Q1 @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ VST1.8 {Q14},[r0],r1 @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ VMIN.U16 Q9,Q9,Q2 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ BGT PU1_SRC_LOOP @If row count is still greater than 1 jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+
+ ADD r11,r0,r1,LSL #1 @*pu1_src + 2 * src_strd
+ VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUB r5,r12,r7 @ht_tmp - row
+
+ ADD r8,r14,r5,LSL #1 @pu1_src_left_cpy[(ht_tmp - row) * 2]
+ VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1])
+ CMP r7,#1
+
+ LDRB r4,[r0,#16] @load the value pu1_src_cpy[16 - src_strd]
+ VLD1.8 D16,[r11]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r11] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r11,#8
+ LDRB r9,[r0,#17] @load the value pu1_src_cpy[17 - src_strd]
+
+ BNE NEXT_ROW_POINTER_ASSIGNED_3
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDRB r5,[r5,#3] @pu1_avail[3]
+ CMP r5,#0
+ SUBNE r8,r11,#4 @pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+ LDRH r5,[r8,#2]
+ VST1.8 {Q10},[r0],r1 @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ LDRB r8,[r0,#14] @pu1_src_cpy[14]
+
+ SUB r8,r8,r4 @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+ VMOV.16 D19[3],r5 @vsetq_lane_u8
+ LDRB r10,[r0,#15] @pu1_src_cpy[15]
+
+ CMP r8,#0
+ VEXT.8 Q9,Q9,Q8,#14 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ SUB r10,r10,r9 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MVNLT r8,#0
+ VLD1.8 D28,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP r10,#0
+ VMOV.8 D15[6],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ MVNLT r10,#0
+
+ MOVGT r10,#1 @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+ VMOV.8 D15[7],r10 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+ VCGT.U8 Q10,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VCLT.U8 Q11,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q11,Q11,Q10 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q9,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q9,Q9,Q11 @edge_idx = vaddq_s8(edge_idx, sign_down)
+ VTBL.8 D18,{D28},D18 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ VTBL.8 D19,{D28},D19 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ VAND Q9,Q9,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ VMOVL.U8 Q10,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VUZP.8 D18,D19
+
+ VTBL.8 D22,{D6},D18
+ VTBL.8 D23,{D7},D19
+
+ VMOVL.U8 Q9,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VZIP.8 D22,D23
+
+ VADDW.S8 Q10,Q10,D22 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ VMAX.S16 Q10,Q10,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q10,Q10,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VADDW.S8 Q9,Q9,D23 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ VMAX.S16 Q9,Q9,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q9,Q9,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+
+ LDR r8,[sp,#0x118] @Loads ht
+ VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0])
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+ LSL r8,r8,#1
+ VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1])
+ LDR r11,[sp,#0x104] @Loads *pu1_src_left
+
+SRC_LEFT_LOOP:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#4
+ STR r7,[r11],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ VST1.8 {Q10},[r0],r1 @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ CMP r6,#8 @Check whether residue remains
+
+ BLT RE_ASSINING_LOOP @Jump to re-assigning loop
+ LDR r7,[sp,#0x114] @Loads wd
+ LDR r0,[sp,#0x02] @Loads *pu1_src
+ SUB r7,r7,r6
+ ADD r0,r0,r7
+ BGT WIDTH_LOOP_16 @If more than 8 pixels remain jump to WIDTH_LOOP_16
+ BEQ WIDTH_RESIDUE @If residue remains jump to residue loop
+
+WD_16_HT_4_LOOP:
+ LDR r7,[sp,#0x114] @Loads wd
+
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ CMP r6,r7 @col == wd
+
+ LDREQB r8,[r5] @pu1_avail[0]
+ MOVNE r8,#-1
+ VMOV.8 D8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP r6,#16 @if(col == 16)
+ VMOV.8 D8[1],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB r8,[r5,#1] @pu1_avail[1]
+ VMOV.8 D9[6],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ VMOV.8 D9[7],r8 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB r11,[r5,#2] @pu1_avail[2]
+ SUBEQ r8,r0,r1 @pu1_src - src_strd
+
+ CMP r11,#0
+ MOVNE r8,r3
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+ ADD r8,r8,#2 @pu1_src - src_strd + 2
+
+ ADD r3,r3,#16
+ VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ SUB r8,#8
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+ LDR r4,[sp,#0x118] @Loads ht
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LDR r7,[sp,#0x114] @Loads wd
+
+ SUB r7,r7,r6 @(wd - col)
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD r7,r7,#14 @14 + (wd - col)
+
+ LDR r8,[sp,#0x100] @Loads *pu1_src
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRH r8,[r7] @load the value and increment by src_strd
+ SUBS r4,r4,#1 @decrement the loop count
+
+ STRH r8,[r5],#2 @store it in the temp array on the stack
+ ADD r7,r7,r1
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ VMOV.I8 Q9,#0
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ ADD r9,r0,r1 @*pu1_src + src_strd
+
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r9,#8
+ LDRB r5,[r5,#3] @pu1_avail[3]
+
+ SUB r11,r12,r7 @ht_tmp - row
+ ADD r8,r14,r11,LSL #1 @pu1_src_left_cpy[(ht_tmp - row) * 2]
+ ADD r8,r8,#2 @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+ CMP r5,#0
+ BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+ CMP r7,#1
+ SUBEQ r8,r9,#2 @pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+ LDRH r5,[r8]
+ VMOV.16 D19[3],r5 @vsetq_lane_u8
+ VEXT.8 Q9,Q9,Q8,#14 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+ CMP r7,r12
+ BLT SIGN_UP_CHANGE_WD_16_HT_4
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB r8,[r0,#14] @pu1_src_cpy[14]
+ SUB r9,r0,r1
+
+ LDRB r5,[r9,#16] @load the value pu1_src_cpy[16 - src_strd]
+
+ LDRB r10,[r0,#15] @pu1_src_cpy[15]
+ SUB r8,r8,r5 @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ LDRB r11,[r9,#17] @load the value pu1_src_cpy[17 - src_strd]
+ CMP r8,#0
+
+ MVNLT r8,#0
+ SUB r10,r10,r11 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP r10,#0
+ VMOV.8 D15[6],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ MVNLT r10,#0
+
+ MOVGT r10,#1 @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+ VMOV.8 D15[7],r10 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ VLD1.8 D20,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VCGT.U8 Q11,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VCLT.U8 Q12,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q12,Q11 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VTBL.8 D26,{D20},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ VTBL.8 D27,{D20},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#2 @sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+
+ VUZP.8 D26,D27
+ VTBL.8 D24,{D6},D26
+ VTBL.8 D25,{D7},D27
+ VZIP.8 D24,D25
+
+ VMOVL.U8 Q15,D13 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ VADDW.S8 Q15,Q15,D25 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ VMAX.S16 Q15,Q15,Q1 @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ VMIN.U16 Q15,Q15,Q2 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ VMOVN.I16 D28,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+ VMOVN.I16 D29,Q15 @vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
+ VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ LDR r8,[sp,#0x118] @Loads ht
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+ LDR r11,[sp,#0x104] @Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#2
+ STR r7,[r11],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+ SUBS r6,r6,#16 @Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP @Jump to re-assigning loop
+ BGT WD_16_HT_4_LOOP @If more width remains jump to WD_16_HT_4_LOOP
+
+WIDTH_RESIDUE:
+ LDR r7,[sp,#0x114] @Loads wd
+
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ CMP r6,r7 @wd_residue == wd
+
+ LDREQB r8,[r5] @pu1_avail[0]
+
+ MOVNE r8,#-1
+ LDRB r11,[r5,#1] @pu1_avail[1]
+
+ LDRB r9,[r5,#2] @pu1_avail[2]
+ VMOV.8 d8[0],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ CMP r9,#0
+
+ SUBEQ r10,r0,r1 @pu1_src - src_strd
+ VMOV.8 d8[1],r8 @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+ MOVNE r10,r3
+
+ ADD r10,r10,#2 @pu1_src - src_strd + 2
+ VMOV.8 d8[6],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+ LDR r4,[sp,#0x118] @Loads ht
+ VMOV.8 d8[7],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+ LDR r7,[sp,#0x114] @Loads wd
+
+ LDR r8,[sp,#0x100] @Loads *pu1_src
+ VLD1.8 D10,[r10]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ VLD1.8 D11,[r10] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ SUB r10,#8
+ SUB r7,r7,#2 @(wd - 2)
+
+ ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRH r8,[r7] @load the value and increment by src_strd
+ ADD r7,r7,r1
+ STRH r8,[r5],#2 @store it in the temp array on the stack
+ SUBS r4,r4,#1 @decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+ VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src)
+ VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src)
+ SUB r0,#8
+
+ VMOV.I8 Q9,#0
+ VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row)
+ VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV r7,r12 @row count, move ht_tmp to r7
+
+PU1_SRC_LOOP_RESIDUE:
+ ADD r9,r0,r1 @*pu1_src + src_strd
+
+ SUB r11,r12,r7 @ht_tmp - row
+ VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB r9,#8
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+
+ LDRB r5,[r5,#3] @pu1_avail[3]
+ ADD r8,r14,r11,LSL #1 @pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ CMP r5,#0
+ ADD r8,r8,#2 @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+ BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+ CMP r7,#1
+ SUBEQ r8,r9,#2 @pu1_src[src_strd - 2]
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+ LDRB r5,[r8]
+
+ LDRB r8,[r8,#1]
+ VMOV.8 D19[6],r5 @vsetq_lane_u8
+ CMP r7,r12
+
+ VMOV.8 D19[7],r8 @vsetq_lane_u8
+ VEXT.8 Q9,Q9,Q8,#14 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+ BLT SIGN_UP_CHANGE_RESIDUE
+ LDR r5,[sp,#0x108] @Loads pu1_avail
+ LDRB r5,[r5,#2] @pu1_avail[2]
+ CMP r5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB r8,[r0,#14] @pu1_src_cpy[14]
+ SUB r9,r0,r1
+
+ LDRB r5,[r9,#16] @load the value pu1_src_cpy[16 - src_strd]
+
+ LDRB r10,[r0,#15] @pu1_src_cpy[15]
+ SUB r8,r8,r5 @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ LDRB r11,[r9,#17] @load the value pu1_src_cpy[17 - src_strd]
+ CMP r8,#0
+
+ MVNLT r8,#0
+ SUB r10,r10,r11 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MOVGT r8,#1 @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP r10,#0
+ VMOV.8 D15[6],r8 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ MVNLT r10,#0
+
+ MOVGT r10,#1 @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+ VMOV.8 D15[7],r10 @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ VLD1.8 D20,[r2] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ VCGT.U8 Q11,Q6,Q9 @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ VCLT.U8 Q12,Q6,Q9 @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ VSUB.U8 Q12,Q12,Q11 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ VADD.I8 Q13,Q0,Q7 @edge_idx = vaddq_s8(const_2, sign_up)
+ VADD.I8 Q13,Q13,Q12 @edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ VNEG.S8 Q7,Q12 @sign_up = vnegq_s8(sign_down)
+ VTBL.8 D26,{D20},D26 @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ VTBL.8 D27,{D20},D27 @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ VEXT.8 Q7,Q7,Q7,#2 @sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ VMOVL.U8 Q14,D12 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ VAND Q13,Q13,Q4 @edge_idx = vandq_s8(edge_idx, au1_mask)
+
+
+ VUZP.8 D26,D27
+ VTBL.8 D24,{D6},D26
+ VTBL.8 D25,{D7},D27
+ VZIP.8 D24,D25
+
+ VMOV Q6,Q8 @pu1_cur_row = pu1_next_row
+ VADDW.S8 Q14,Q14,D24 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ VMAX.S16 Q14,Q14,Q1 @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ VMIN.U16 Q14,Q14,Q2 @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1
+ VMOVN.I16 D30,Q14 @vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ VST1.8 {D30},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP_RESIDUE
+
+ LDR r8,[sp,#0x118] @Loads ht
+ ADD r5,sp,#0x4B @*au1_src_left_tmp
+
+ LDR r11,[sp,#0x104] @Loads *pu1_src_left
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDR r7,[r5],#4 @au1_src_left_tmp[row]
+ SUBS r8,r8,#2
+ STR r7,[r11],#4 @pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ LDR r7,[sp,#0x114] @Loads wd
+ LDR r8,[sp,#0x118] @Loads ht
+
+ LDR r0,[sp,#0x100] @Loads *pu1_src
+ SUB r10,r7,#2 @wd - 2
+
+ LDRH r9,[sp,#6]
+ SUB r8,r8,#1 @ht - 1
+
+ STRH r9,[r0,r10] @pu1_src_org[wd - 2] = u1_pos_0_0_tmp
+ MLA r6,r8,r1,r0 @pu1_src[(ht - 1) * src_strd]
+
+ LDR r4,[sp,#0xFC] @Loads pu1_src_top_left
+
+ LDRH r9,[sp,#8]
+ ADD r12,sp,#10
+
+ STRH r9,[r6] @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
+
+ LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer
+ STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp
+ LDR r3,[sp,#0x10C] @Loads pu1_src_top
+
+SRC_TOP_LOOP:
+ VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS r7,r7,#8 @Decrement the width
+ VST1.8 D0,[r3]! @pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0xD4
+ LDMFD sp!,{r4-r12,r15} @Reload the registers from SP
+
+
+
diff --git a/common/arm/ihevc_weighted_pred_bi.s b/common/arm/ihevc_weighted_pred_bi.s
new file mode 100644
index 0000000..5308423
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_bi.s
@@ -0,0 +1,269 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_weighted_pred_bi.s
+@*
+@* @brief
+@* contains function definitions for weighted prediction used in inter
+@* prediction
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_weighted_pred_bi()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* does bi-weighted prediction on the arrays pointed by pi2_src1 and
+@* pi2_src2 and stores the result at the location pointed by pu1_dst.
+@* assumptions: the function is optimized assuming width and height are
+@* multiples of 2.
+@*
+@* @par description:
+@* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+@* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+@*
+@* @param[in] pi2_src1
+@* pointer to source 1
+@*
+@* @param[in] pi2_src2
+@* pointer to source 2
+@*
+@* @param[out] pu1_dst
+@* pointer to destination
+@*
+@* @param[in] src_strd1
+@* source stride 1
+@*
+@* @param[in] src_strd2
+@* source stride 2
+@*
+@* @param[in] dst_strd
+@* destination stride
+@*
+@* @param[in] wgt0
+@* weight to be multiplied to source 1
+@*
+@* @param[in] off0
+@* offset 0
+@*
+@* @param[in] wgt1
+@* weight to be multiplied to source 2
+@*
+@* @param[in] off1
+@* offset 1
+@*
+@* @param[in] shift
+@* (14 - bit depth) + log2_weight_denominator
+@*
+@* @param[in] lvl_shift1
+@* added before shift and offset
+@*
+@* @param[in] lvl_shift2
+@* added before shift and offset
+@*
+@* @param[in] ht
+@* height of the source
+@*
+@* @param[in] wd
+@* width of the source
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_weighted_pred_bi(word16 *pi2_src1,
+@ word16 *pi2_src2,
+@ uword8 *pu1_dst,
+@ word32 src_strd1,
+@ word32 src_strd2,
+@ word32 dst_strd,
+@ word32 wgt0,
+@ word32 off0,
+@ word32 wgt1,
+@ word32 off1,
+@ word32 shift,
+@ word32 lvl_shift1,
+@ word32 lvl_shift2,
+@ word32 ht,
+@ word32 wd)
+
+@**************variables vs registers*****************************************
+@ r0 => *pi2_src1
+@ r1 => *pi2_src2
+@ r2 => *pu1_dst
+@ r3 => src_strd1
+@ r4 => src_strd2
+@ r5 => dst_strd
+@ r6 => wgt0
+@ r7 => off0
+@ r8 => wgt1
+@ r9 => off1
+@ r10 => shift
+@ r11 => lvl_shift1
+@ r12 => lvl_shift2
+@ r14 => ht
+@ r7 => wd (r7 is reused once off0 has been consumed)
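+
+@ for reference, a scalar sketch of the computation implemented below
+@ (illustrative only; clip_u8 denotes clipping to [0, 255], which the
+@ neon code gets from its saturating narrows; per-row pointer stepping
+@ is omitted):
+@
+@ for(row = 0; row < ht; row++)
+@ for(col = 0; col < wd; col++)
+@ pu1_dst[col] = clip_u8(((pi2_src1[col] + lvl_shift1) * wgt0 +
+@ (pi2_src2[col] + lvl_shift2) * wgt1 +
+@ ((off0 + off1 + 1) << (shift - 1))) >> shift);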
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_weighted_pred_bi_a9q
+
+.type ihevc_weighted_pred_bi_a9q, %function
+
+ihevc_weighted_pred_bi_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r6,[sp,#48] @load wgt0
+ ldr r11,[sp,#68] @load lvl_shift1
+ ldr r12,[sp,#72] @load lvl_shift2
+ vmov.s16 d7[0],r6 @moved for scalar multiplication
+ mul r4,r11,r6 @lvl_shift1 * wgt0
+ ldr r8,[sp,#56] @load wgt1
+ ldr r7,[sp,#52] @load off0
+ vmov.s16 d7[1],r8 @moved for scalar multiplication
+ mla r4,r12,r8,r4 @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
+ ldr r9,[sp,#60] @load off1
+ add r5,r7,r9 @off0 + off1
+ ldr r10,[sp,#64] @load shift
+ add r5,r5,#1 @off0 + off1 + 1
+ sub r14,r10,#1 @shift - 1
+ ldr r7,[sp,#80] @load wd
+ lsl r5,r5,r14 @((off0 + off1 + 1) << (shift - 1))
+ vdup.u32 q14,r10 @vmovq_n_s32(0-shift)
+ add r4,r4,r5 @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
+ vdup.u32 q15,r4 @vmovq_n_s32(tmp_lvl_shift)
+ vneg.s32 q14,q14
+ ldr r4,[sp,#40] @load src_strd2
+ lsl r9,r7,#1
+ ldr r5,[sp,#44] @load dst_strd
+ lsl r3,r3,#1
+ ldr r14,[sp,#76] @load ht
+ lsl r4,r4,#1
+
+ cmp r14,#0 @check ht == 0
+ beq end_loops @if equal, then end the function
+
+outer_loop:
+ cmp r7,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+
+core_loop:
+ add r6,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r8,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ vld1.s16 {d0},[r0]! @load and increment the pi2_src1
+ add r10,r2,r5 @pu1_dst_tmp = pu1_dst + dst_strd
+ vld1.s16 {d1},[r1]! @load and increment the pi2_src2
+ vmull.s16 q2,d0,d7[0] @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
+ vld1.s16 {d2},[r6],r3 @load and increment the pi2_src_tmp1 ii iteration
+ vmull.s16 q4,d1,d7[1] @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+ vld1.s16 {d3},[r8],r4 @load and increment the pi2_src_tmp1 ii iteration
+ vadd.s32 q2,q2,q4 @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+
+ vld1.s16 {d0},[r6],r3 @load and increment the pi2_src1 iii iteration
+ vmull.s16 q5,d2,d7[0] @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+
+ vld1.s16 {d1},[r8],r4 @load and increment the pi2_src2 iii iteration
+ vadd.s32 q2,q2,q15 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ vmull.s16 q7,d0,d7[0] @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+
+ vld1.s16 {d2},[r6],r3 @load and increment the pi2_src_tmp1 iv iteration
+ vmull.s16 q6,d3,d7[1] @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+ vshl.s32 q2,q2,q14 @vshlq_s32(i4_tmp1_t1, tmp_shift_t)
+
+ vld1.s16 {d3},[r8],r4 @load and increment the pi2_src_tmp1 iv iteration
+ vadd.s32 q5,q5,q6 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+
+ vqmovun.s32 d4,q2 @vqmovun_s32(sto_res_tmp1)
+ vmull.s16 q8,d1,d7[1] @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
+
+ vadd.s32 q5,q5,q15 @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+ vmov.s32 d5,d4 @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+ vadd.s32 q7,q7,q8 @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+
+ vshl.s32 q5,q5,q14 @vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
+ vmull.s16 q9,d2,d7[0] @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
+ vqmovn.u16 d4,q2 @vqmovn_u16(sto_res_tmp3)
+ vadd.s32 q7,q7,q15 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+
+ vqmovun.s32 d10,q5 @vqmovun_s32(sto_res_tmp1) ii iteration
+ vmull.s16 q10,d3,d7[1] @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
+
+ vshl.s32 q7,q7,q14 @vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
+ vmov.s32 d11,d10 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+ vadd.s32 q9,q9,q10 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ vqmovun.s32 d14,q7 @vqmovun_s32(sto_res_tmp1) iii iteration
+
+ vadd.s32 q9,q9,q15 @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
+ vst1.s32 {d4[0]},[r2]! @store pu1_dst i iteration
+
+ vqmovn.u16 d10,q5 @vqmovn_u16(sto_res_tmp3) ii iteration
+ vshl.s32 q9,q9,q14 @vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
+ vst1.s32 {d10[0]},[r10],r5 @store pu1_dst ii iteration
+
+
+ vmov.s32 d15,d14 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+ vqmovn.u16 d14,q7 @vqmovn_u16(sto_res_tmp3) iii iteration
+ vqmovun.s32 d18,q9 @vqmovun_s32(sto_res_tmp1) iv iteration
+ vmov.s32 d19,d18 @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+ vst1.s32 {d14[0]},[r10],r5 @store pu1_dst iii iteration
+ vqmovn.u16 d18,q9 @vqmovn_u16(sto_res_tmp3) iv iteration
+ subs r7,r7,#4 @decrement wd by 4 and check for 0
+ vst1.s32 {d18[0]},[r10],r5 @store pu1_dst iv iteration
+
+ bgt core_loop @if greater than 0 repeat the core loop again
+
+end_core_loop:
+ rsb r11,r9,r3,lsl #2 @4*src_strd1 - wd (byte offset, 16-bit elements)
+ subs r14,r14,#4 @decrement the ht by 4
+ rsb r12,r9,r4,lsl #2 @4*src_strd2 - wd
+ add r0,r0,r11 @pi2_src1 += 4*src_strd1 - wd (16-bit pointer: byte offset is doubled)
+ asr r7,r9,#1 @restore wd
+ add r1,r1,r12 @pi2_src2 += 4*src_strd2 - wd
+ rsb r10,r7,r5,lsl #2 @4*dst_strd - wd
+ add r2,r2,r10 @pu1_dst += 4*dst_strd - wd
+ bgt core_loop @if ht is greater than 0 repeat from core_loop
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
+
+
diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s
new file mode 100644
index 0000000..b560c15
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_bi_default.s
@@ -0,0 +1,494 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_weighted_pred_bi_default.s
+@*
+@* @brief
+@* contains function definitions for weighted prediction used in inter
+@* prediction
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_weighted_pred_bi_default()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* does default bi-weighted prediction on the arrays pointed by pi2_src1 and
+@* pi2_src2 and stores the result at the location pointed by pu1_dst.
+@* assumptions: the function is optimized assuming width and height are
+@* multiples of 2.
+@*
+@* @par description:
+@* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+@* >> shift, where shift = 15 - bit depth
+@*
+@* @param[in] pi2_src1
+@* pointer to source 1
+@*
+@* @param[in] pi2_src2
+@* pointer to source 2
+@*
+@* @param[out] pu1_dst
+@* pointer to destination
+@*
+@* @param[in] src_strd1
+@* source stride 1
+@*
+@* @param[in] src_strd2
+@* source stride 2
+@*
+@* @param[in] dst_strd
+@* destination stride
+@*
+@* @param[in] lvl_shift1
+@* added before shift and offset
+@*
+@* @param[in] lvl_shift2
+@* added before shift and offset
+@*
+@* @param[in] ht
+@* height of the source
+@*
+@* @param[in] wd
+@* width of the source
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
+@ word16 *pi2_src2,
+@ uword8 *pu1_dst,
+@ word32 src_strd1,
+@ word32 src_strd2,
+@ word32 dst_strd,
+@ word32 lvl_shift1,
+@ word32 lvl_shift2,
+@ word32 ht,
+@ word32 wd)
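+
+@ for reference, a scalar sketch of the computation implemented below
+@ (illustrative only; clip_u8 denotes clipping to [0, 255]; for 8-bit
+@ content shift = 7, so the rounding term 1 << (shift - 1) is 64,
+@ matching the vmov.i16 q0,#0x40 below):
+@
+@ for(row = 0; row < ht; row++)
+@ for(col = 0; col < wd; col++)
+@ pu1_dst[col] = clip_u8((pi2_src1[col] + lvl_shift1 +
+@ pi2_src2[col] + lvl_shift2 +
+@ (1 << (shift - 1))) >> shift);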
+
+@**************variables vs registers*****************************************
+@ r0 => *pi2_src1
+@ r1 => *pi2_src2
+@ r2 => *pu1_dst
+@ r3 => src_strd1
+@ r4 => src_strd2
+@ r5 => dst_strd
+@ r6 => lvl_shift1
+@ r7 => lvl_shift2
+@ r8 => ht
+@ r9 => wd
+.text
+.align 4
+
+
+
+
+.globl ihevc_weighted_pred_bi_default_a9q
+
+.type ihevc_weighted_pred_bi_default_a9q, %function
+
+ihevc_weighted_pred_bi_default_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+ ldr r4,[sp,#40] @load src_strd2
+ lsl r3,r3,#1
+ ldr r5,[sp,#44] @load dst_strd
+ ldr r6,[sp,#48] @load lvl_shift1
+ lsl r4,r4,#1
+ ldr r7,[sp,#52] @load lvl_shift2
+ ldr r8,[sp,#56] @load ht
+ ldr r9,[sp,#60] @load wd
+ vdup.16 q2,r6 @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
+ vdup.16 q3,r7 @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
+ vmov.i16 q0,#0x40 @tmp_lvl_shift = 1 << (shift - 1)
+ vadd.i16 q2,q3
+ vadd.s16 q0,q0,q2
+@ vmvn.i32 q1,#0x6 @vmovq_n_s32(tmp_shift)
+ lsl r6,r9,#1
+ rsb r7,r6,r3,lsl #2 @4*src_strd1 - wd
+ rsb r10,r6,r4,lsl #2 @4*src_strd2 - wd
+ @asr r6,#1
+ @rsb r6,r6,r5,lsl #2 @4*dst_strd - wd
+
+ cmp r8,#0 @check ht == 0
+ beq end_loops @if equal, then end the function
+
+chroma_decision:
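+ @ wd|ht dispatches the small interleaved chroma blocks to dedicated
+ @ paths: wd|ht == 10 is the 4x2 chroma case (wd = 8, ht = 2) and
+ @ wd|ht == 6 the 2x2 chroma case (wd = 4, ht = 2); all other sizes
+ @ fall through to the luma paths below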
+ orr r14,r8,r9
+ cmp r14,#10
+ beq outer_loop_chroma_8x2
+
+ cmp r14,#6
+ beq outer_loop_chroma_4x2
+
+
+luma_decision:
+ cmp r9,#24
+ beq outer_loop_8
+
+ cmp r9,#16
+ bge outer_loop_16
+
+ cmp r9,#12
+ beq outer_loop_4
+
+ cmp r9,#8
+ bge outer_loop_8
+
+
+
+
+
+
+outer_loop_4:
+ cmp r9,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+
+core_loop_4:
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ vld1.s16 {d6},[r0]! @load and increment the pi2_src1
+ add r14,r2,r5 @pu1_dst_tmp = pu1_dst + dst_strd
+ vld1.s16 {d7},[r1]! @load and increment the pi2_src2
+ vld1.s16 {d8},[r11],r3 @load and increment the pi2_src1 ii iteration
+ vqadd.s16 d18,d6,d7
+ vqadd.s16 d18,d18,d0 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ vld1.s16 {d9},[r12],r4 @load and increment the pi2_src2 ii iteration
+ vqadd.s16 d20,d8,d9 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ vqadd.s16 d19,d20,d0 @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ vqshrun.s16 d20,q9,#7
+ vld1.s16 {d22},[r11],r3 @load and increment the pi2_src1 iii iteration
+ vld1.s16 {d23},[r12],r4 @load and increment the pi2_src2 iii iteration
+ vqadd.s16 d30,d22,d23
+ vqadd.s16 d30,d30,d0 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+ vld1.s16 {d24},[r11],r3 @load and increment the pi2_src1 iv iteration
+ vld1.s16 {d25},[r12],r4 @load and increment the pi2_src2 iv iteration
+ vqadd.s16 d18,d24,d25 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ vqadd.s16 d31,d18,d0
+ vst1.32 {d20[0]},[r2]! @store pu1_dst i iteration
+ vst1.32 {d20[1]},[r14],r5 @store pu1_dst ii iteration
+ vqshrun.s16 d30,q15,#7
+ vst1.32 {d30[0]},[r14],r5 @store pu1_dst iii iteration
+ subs r9,r9,#4 @decrement wd by 4 and check for 0
+ vst1.32 {d30[1]},[r14],r5 @store pu1_dst iv iteration
+ bgt core_loop_4 @if greater than 0 repeat the core loop again
+
+end_core_loop_4:
+
+ subs r8,r8,#4 @decrement the ht by 4
+
+ add r0,r0,r7 @pi2_src1 += 4*src_strd1 - wd (16-bit pointer: byte offset is doubled)
+ asr r9,r6,#1 @restore wd
+ add r1,r1,r10 @pi2_src2 += 4*src_strd2 - wd
+ rsb r14,r9,r5,lsl #2 @4*dst_strd - wd
+ add r2,r2,r14 @pu1_dst += 4*dst_strd - wd
+ bgt core_loop_4 @if ht is greater than 0 repeat from core_loop_4
+
+ b end_loops
+
+
+@ chroma-only path: a 2x2 chroma block (4x2 interleaved samples)
+outer_loop_chroma_4x2:
+ cmp r9,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+ rsb r7,r6,r3,lsl #1 @2*src_strd1 - wd
+ rsb r10,r6,r4,lsl #1 @2*src_strd2 - wd
+core_loop_chroma_4x2:
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ vld1.s16 {d6},[r0]! @load and increment the pi2_src1
+ add r14,r2,r5 @pu1_dst_tmp = pu1_dst + dst_strd
+ vld1.s16 {d7},[r1]! @load and increment the pi2_src2
+ vld1.s16 {d8},[r11],r3 @load and increment the pi2_src1 ii iteration
+ vqadd.s16 d18,d6,d7
+ vqadd.s16 d18,d18,d0 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ vld1.s16 {d9},[r12],r4 @load and increment the pi2_src2 ii iteration
+ vqadd.s16 d20,d8,d9 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ vqadd.s16 d19,d20,d0 @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ vqshrun.s16 d20,q9,#7
+ vst1.32 {d20[0]},[r2]! @store pu1_dst i iteration
+ vst1.32 {d20[1]},[r14],r5 @store pu1_dst ii iteration
+
+ subs r9,r9,#4 @decrement wd by 4 and check for 0
+
+ bgt core_loop_chroma_4x2 @if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_4x2:
+
+ subs r8,r8,#2 @decrement the ht by 2
+
+ add r0,r0,r7 @pi2_src1 += 2*src_strd1 - wd (16-bit pointer: byte offset is doubled)
+ asr r9,r6,#1 @restore wd
+ add r1,r1,r10 @pi2_src2 += 2*src_strd2 - wd
+ rsb r14,r9,r5,lsl #1 @2*dst_strd - wd
+ add r2,r2,r14 @pu1_dst += 2*dst_strd - wd
+ bgt core_loop_chroma_4x2 @if ht is greater than 0 repeat from core_loop_chroma_4x2
+
+ b end_loops
+
+
+
+outer_loop_8:
+ cmp r9,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+core_loop_8:
+
+ vld1.s16 {q12},[r0]! @load and increment the pi2_src1
+ add r14,r2,r5 @pu1_dst_tmp = pu1_dst + dst_strd
+ vld1.s16 {q13},[r1]! @load and increment the pi2_src2
+ vqadd.s16 q12,q12,q13
+ vld1.s16 {q14},[r11],r3 @load and increment the pi2_src1 ii iteration
+ vqadd.s16 q12,q12,q0 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ vld1.s16 {q15},[r12],r4 @load and increment the pi2_src2 ii iteration
+ vld1.s16 {q8},[r11],r3 @load and increment the pi2_src1 iii iteration
+ vqadd.s16 q11,q14,q15 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ vld1.s16 {q9},[r12],r4 @load and increment the pi2_src2 iii iteration
+ vqadd.s16 q11,q11,q0 @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ vqshrun.s16 d20,q12,#7
+ vld1.s16 {q6},[r11],r3 @load and increment the pi2_src1 iv iteration
+ vqadd.s16 q15,q8,q9
+ vqshrun.s16 d21,q11,#7
+ vld1.s16 {q7},[r12],r4 @load and increment the pi2_src2 iv iteration
+ vqadd.s16 q15,q15,q0 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+ vst1.32 {d20},[r2]! @store pu1_dst i iteration
+ vqadd.s16 q4,q6,q7 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ vst1.32 {d21},[r14],r5 @store pu1_dst ii iteration
+ vqadd.s16 q4,q4,q0
+ vqshrun.s16 d30,q15,#7
+ vqshrun.s16 d31,q4,#7
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ vst1.32 {d30},[r14],r5 @store pu1_dst iii iteration
+ subs r9,r9,#8 @decrement wd by 8 and check for 0
+ vst1.32 {d31},[r14],r5 @store pu1_dst iv iteration
+ bgt core_loop_8 @if greater than 0 repeat the core loop again
+
+end_core_loop_8:
+
+ subs r8,r8,#4 @decrement the ht by 4
+
+ add r0,r0,r7 @pi2_src1 += 4*src_strd1 - wd (16-bit pointer: byte offset is doubled)
+ asr r9,r6,#1 @restore wd
+ add r1,r1,r10 @pi2_src2 += 4*src_strd2 - wd
+ rsb r14,r9,r5,lsl #2 @4*dst_strd - wd
+ add r2,r2,r14 @pu1_dst += 4*dst_strd - wd
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2
+
+ bgt core_loop_8
+ b end_loops
+
+
+
+@ chroma-only path: a 4x2 chroma block (8x2 interleaved samples)
+outer_loop_chroma_8x2:
+ cmp r9,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ rsb r7,r6,r3,lsl #1 @2*src_strd1 - wd
+ rsb r10,r6,r4,lsl #1 @2*src_strd2 - wd
+core_loop_chroma_8x2:
+
+ vld1.s16 {q12},[r0]! @load and increment the pi2_src1
+ add r14,r2,r5 @pu1_dst_tmp = pu1_dst + dst_strd
+ vld1.s16 {q13},[r1]! @load and increment the pi2_src2
+ vqadd.s16 q12,q12,q13
+ vld1.s16 {q14},[r11],r3 @load and increment the pi2_src1 ii iteration
+ vqadd.s16 q12,q12,q0 @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ vld1.s16 {q15},[r12],r4 @load and increment the pi2_src2 ii iteration
+ vld1.s16 {q8},[r11],r3 @load and increment the pi2_src1 iii iteration
+ vqadd.s16 q11,q14,q15 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ vqadd.s16 q11,q11,q0 @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ vqshrun.s16 d20,q12,#7
+ vqshrun.s16 d21,q11,#7
+ vst1.32 {d20},[r2]! @store pu1_dst i iteration
+ vst1.32 {d21},[r14],r5 @store pu1_dst ii iteration
+
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2
+
+ subs r9,r9,#8 @decrement wd by 8 and check for 0
+
+ bgt core_loop_chroma_8x2 @if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_8x2:
+
+ subs r8,r8,#2 @decrement the ht by 2
+
+ add r0,r0,r7 @pi2_src1 += 2*src_strd1 - wd (16-bit pointer: byte offset is doubled)
+ asr r9,r6,#1 @restore wd
+ add r1,r1,r10 @pi2_src2 += 2*src_strd2 - wd
+ rsb r14,r9,r5,lsl #1 @2*dst_strd - wd
+ add r2,r2,r14 @pu1_dst += 2*dst_strd - wd
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2
+
+ bgt core_loop_chroma_8x2
+
+ b end_loops
+
+
+
+
+outer_loop_16:
+ cmp r9,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+ add r11,r0,r3 @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2x because pi2_src1 is a 16-bit pointer)
+ add r12,r1,r4 @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ rsb r7,r6,r3,lsl #1 @2*src_strd1 - wd
+ mov r14,#16
+ sub r10,r14,r5
+ sub r11,r3,r14
+ sub r12,r14,r3
+
+ rsb r14,r9,r5,lsl #1 @2*dst_strd - wd
+
+
+
+prolog_16:
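+ @ the 16-wide path is software pipelined: this prolog issues the
+ @ loads and first additions for the initial row pair, core_loop_16
+ @ overlaps the rounding/stores of iteration n with the loads of
+ @ iteration n+1, and epilog_16 drains the last in-flight iteration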
+
+
+ vld1.s16 {q1},[r0]! @load and increment the pi2_src1
+ vld1.s16 {q2},[r1]! @load and increment the pi2_src2
+ vld1.s16 {q5},[r0],r11 @load and increment the pi2_src1
+ vld1.s16 {q6},[r1],r11 @load and increment the pi2_src2
+ vld1.s16 {q3},[r0]! @load and increment the pi2_src1 ii iteration
+ subs r9,r9,#16
+ vld1.s16 {q4},[r1]! @load and increment the pi2_src2 ii iteration
+ subeq r8,r8,#2
+ vqadd.s16 q11,q1,q2
+ vld1.s16 {q7},[r0],r12 @load and increment the pi2_src1 ii iteration
+ vqadd.s16 q14,q5,q6
+ vld1.s16 {q8},[r1],r12 @load and increment the pi2_src2 ii iteration
+ addeq r0,r0,r7
+ addeq r1,r1,r7
+ vqadd.s16 q12,q3,q4
+ vld1.s16 {q1},[r0]!
+ vqadd.s16 q13,q7,q8
+@ if the input is chroma with 8x2 block size
+ cmp r8,#0
+ beq epilog_16
+
+ vld1.s16 {q2},[r1]! @load and increment the pi2_src2
+ vqadd.s16 q11,q11,q0
+ vld1.s16 {q5},[r0],r11 @load and increment the pi2_src1
+ vqadd.s16 q14,q14,q0
+ vld1.s16 {q6},[r1],r11 @load and increment the pi2_src2
+ vqadd.s16 q12,q12,q0
+ vld1.s16 {q3},[r0]! @load and increment the pi2_src1 ii iteration
+ vqadd.s16 q15,q13,q0
+ vqshrun.s16 d20,q11,#7
+ vld1.s16 {q4},[r1]! @load and increment the pi2_src2 ii iteration
+ vqshrun.s16 d21,q14,#7
+ vld1.s16 {q7},[r0],r12 @load and increment the pi2_src1 ii iteration
+ vqshrun.s16 d26,q12,#7
+ vld1.s16 {q8},[r1],r12 @load and increment the pi2_src2 ii iteration
+ vqshrun.s16 d27,q15,#7
+
+
+
+core_loop_16:
+
+ cmp r9,#0
+ vqadd.s16 q11,q1,q2
+ asreq r9,r6,#1
+ vst1.32 {q10},[r2],r5
+ vqadd.s16 q14,q5,q6
+ vst1.32 {q13},[r2],r10
+ addeq r2,r2,r14
+ vqadd.s16 q12,q3,q4
+ subs r9,r9,#16
+ addeq r0,r0,r7
+ vqadd.s16 q13,q7,q8
+
+ addeq r1,r1,r7
+ subeqs r8,r8,#2 @decrement the ht by 2
+ beq epilog_16
+
+
+ vqadd.s16 q11,q11,q0
+ vld1.s16 {q1},[r0]! @load and increment the pi2_src1
+ vqadd.s16 q14,q14,q0
+ vld1.s16 {q2},[r1]! @load and increment the pi2_src2
+ vqadd.s16 q12,q12,q0
+ vld1.s16 {q5},[r0],r11 @load and increment the pi2_src1
+ vqadd.s16 q15,q13,q0
+ vld1.s16 {q6},[r1],r11 @load and increment the pi2_src2
+ vqshrun.s16 d20,q11,#7
+ vld1.s16 {q3},[r0]! @load and increment the pi2_src1 ii iteration
+ vqshrun.s16 d21,q14,#7
+ vld1.s16 {q4},[r1]! @load and increment the pi2_src2 ii iteration
+ vqshrun.s16 d26,q12,#7
+ vld1.s16 {q7},[r0],r12 @load and increment the pi2_src1 ii iteration
+ vqshrun.s16 d27,q15,#7
+ vld1.s16 {q8},[r1],r12 @load and increment the pi2_src2 ii iteration
+
+
+ b core_loop_16
+
+
+epilog_16:
+
+ vqadd.s16 q11,q11,q0
+ vqadd.s16 q14,q14,q0
+ vqadd.s16 q12,q12,q0
+ vqadd.s16 q15,q13,q0
+ vqshrun.s16 d20,q11,#7
+ vqshrun.s16 d21,q14,#7
+ vqshrun.s16 d26,q12,#7
+ vqshrun.s16 d27,q15,#7
+ vst1.32 {q10},[r2],r5
+ vst1.32 {q13},[r2]
+
+
+
+end_core_loop_16:
+
+
+
+
+
+
+
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
+
+
diff --git a/common/arm/ihevc_weighted_pred_neon_intr.c b/common/arm/ihevc_weighted_pred_neon_intr.c
new file mode 100644
index 0000000..72b5d4f
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_neon_intr.c
@@ -0,0 +1,979 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_weighted_pred_neon_intr.c
+*
+* @brief
+* Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+* Parthiban V
+*
+* @par List of Functions:
+* - ihevc_weighted_pred_uni()
+* - ihevc_weighted_pred_bi()
+* - ihevc_weighted_pred_bi_default()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_inter_pred.h"
+#include "arm_neon.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does uni-weighted prediction on the array pointed by pi2_src and stores
+* the result at the location pointed by pu1_dst. Assumptions: the function
+* is optimized assuming width and height are multiples of 2.
+*
+* @par Description:
+* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
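+
+/* For reference, a scalar sketch of what the NEON code below computes
+ (illustrative only; CLIP_U8 denotes clipping to [0, 255], which the
+ NEON code gets from its saturating narrows):
+
+ for(row = 0; row < ht; row++, pi2_src += src_strd, pu1_dst += dst_strd)
+ for(col = 0; col < wd; col++)
+ pu1_dst[col] = CLIP_U8((((pi2_src[col] + lvl_shift) * wgt0
+ + (1 << (shift - 1))) >> shift) + off0);
+*/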
+
+void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ int16x4_t pi2_src_val1;
+ int16x4_t pi2_src_val2;
+ int32x4_t i4_tmp1_t;
+ int32x4_t i4_tmp2_t;
+ int32x4_t sto_res_tmp1;
+ uint16x4_t sto_res_tmp2;
+ uint16x8_t sto_res_tmp3;
+ uint8x8_t sto_res;
+ int32x4_t tmp_lvl_shift_t;
+ WORD32 tmp_shift = 0 - shift;
+ int32x4_t tmp_shift_t;
+ WORD16 *pi2_src_tmp;
+ UWORD8 *pu1_dst_tmp;
+
+ WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
+ tmp_lvl_shift += (1 << (shift - 1));
+ tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+ tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+ /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time. */
+ /* The height loop is unrolled, so 2 rows are processed per iteration */
+ /* and the stores likewise handle two rows. */
+ /* vcombine_u16 is used because after narrowing we get a 16x4 value, */
+ /* which cannot be saturated and narrowed again directly. */
+
+ for(row = ht; row > 0; row -= 2)
+ {
+ for(col = wd; col > 0; col -= 4)
+ {
+ pi2_src_tmp = pi2_src + src_strd;
+
+ pu1_dst_tmp = pu1_dst + dst_strd;
+
+ pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
+ pi2_src += 4;
+
+ pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
+ i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);
+
+ i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
+ i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
+ i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
+ sto_res = vqmovn_u16(sto_res_tmp3);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst += 4;
+
+ sto_res = vqmovn_u16(sto_res_tmp3);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+ }
+ pi2_src += 2 * src_strd - wd;
+ pu1_dst += 2 * dst_strd - wd;
+ }
+}
+//WEIGHTED_PRED_UNI
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma uni-weighted prediction on the array pointed by pi2_src and stores
+* the result at the location pointed by pu1_dst. Assumptions: the function
+* is optimized assuming width and height are multiples of 2.
+*
+* @par Description:
+* dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ int16x4_t pi2_src_val1;
+ int16x4_t pi2_src_val2;
+ int32x4_t i4_tmp1_t;
+ int32x4_t i4_tmp2_t;
+ int32x4_t sto_res_tmp1;
+ uint16x4_t sto_res_tmp2;
+ uint16x8_t sto_res_tmp3;
+ uint8x8_t sto_res;
+ int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
+ int32x4x2_t tmp_lvl_shift_t;
+ WORD32 tmp_shift = 0 - shift;
+ int32x4_t tmp_shift_t;
+ int16x4_t tmp_wgt0_u, tmp_wgt0_v;
+ int16x4x2_t wgt0;
+ WORD16 *pi2_src_tmp;
+ UWORD8 *pu1_dst_tmp;
+
+ WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
+ tmp_lvl_shift += (1 << (shift - 1));
+ tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
+
+ tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
+ tmp_lvl_shift += (1 << (shift - 1));
+ tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
+
+ tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
+
+ tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+ tmp_wgt0_u = vdup_n_s16(wgt0_cb);
+ tmp_wgt0_v = vdup_n_s16(wgt0_cr);
+ wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
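+ /* The chroma rows interleave Cb and Cr, so the per-plane weights and
+ rounding constants are zipped into {cb, cr, cb, cr} lane order; one
+ 4-wide multiply then covers two Cb and two Cr samples at once. */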
+
+ /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time. */
+ /* The height loop is unrolled, so 2 rows are processed per iteration */
+ /* and the stores likewise handle two rows. */
+ /* vcombine_u16 is used because after narrowing we get a 16x4 value, */
+ /* which cannot be saturated and narrowed again directly. */
+
+ for(row = ht; row > 0; row -= 2)
+ {
+ for(col = 2 * wd; col > 0; col -= 4)
+ {
+ pi2_src_tmp = pi2_src + src_strd;
+
+ pu1_dst_tmp = pu1_dst + dst_strd;
+
+ pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
+ pi2_src += 4;
+
+ pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
+ i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);
+
+ i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
+ i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
+ i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
+ sto_res = vqmovn_u16(sto_res_tmp3);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst += 4;
+
+ sto_res = vqmovn_u16(sto_res_tmp3);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+ }
+ pi2_src += 2 * src_strd - 2 * wd;
+ pu1_dst += 2 * dst_strd - 2 * wd;
+ }
+}
+//WEIGHTED_PRED_CHROMA_UNI
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does bi-weighted prediction on the arrays pointed by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed by pu1_dst.
+* Assumptions: the function is optimized assuming width and height are
+* multiples of 2.
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied to source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 wgt1,
+ WORD32 off1,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ int16x4_t pi2_src1_val1;
+ int16x4_t pi2_src1_val2;
+ int16x4_t pi2_src2_val1;
+ int16x4_t pi2_src2_val2;
+ int32x4_t i4_tmp1_t1;
+ int32x4_t i4_tmp1_t2;
+ int32x4_t i4_tmp2_t1;
+ int32x4_t i4_tmp2_t2;
+ int32x4_t sto_res_tmp1;
+ uint16x4_t sto_res_tmp2;
+ uint16x8_t sto_res_tmp3;
+ uint8x8_t sto_res;
+ int32x4_t tmp_lvl_shift_t;
+ WORD32 tmp_shift = 0 - shift;
+ int32x4_t tmp_shift_t;
+ WORD16 *pi2_src_tmp1;
+ WORD16 *pi2_src_tmp2;
+ UWORD8 *pu1_dst_tmp;
+
+ WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
+ tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
+ tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+ tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+ /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time. */
+ /* The height loop is unrolled, so 2 rows are processed per iteration */
+ /* and the stores likewise handle two rows. */
+ /* vcombine_u16 is used because after narrowing we get a 16x4 value, */
+ /* which cannot be saturated and narrowed again directly. */
+
+ for(row = ht; row > 0; row -= 2)
+ {
+ for(col = wd; col > 0; col -= 4)
+ {
+ pi2_src_tmp1 = pi2_src1 + src_strd1;
+ pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+ pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+ pi2_src1 += 4;
+ pu1_dst_tmp = pu1_dst + dst_strd;
+
+ pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+ pi2_src2 += 4;
+ i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);
+
+ pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+ i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);
+
+ pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+ i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
+
+ i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
+ sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+ sto_res = vqmovn_u16(sto_res_tmp3);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst += 4;
+
+ sto_res = vqmovn_u16(sto_res_tmp3);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+ }
+ pi2_src1 += 2 * src_strd1 - wd;
+ pi2_src2 += 2 * src_strd2 - wd;
+ pu1_dst += 2 * dst_strd - wd;
+ }
+}
+//WEIGHTED_PRED_BI
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed by pu1_dst.
+* Assumptions: the function is optimized assuming width and height are
+* multiples of 2.
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied to source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 wgt1_cb,
+ WORD32 wgt1_cr,
+ WORD32 off1_cb,
+ WORD32 off1_cr,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ int16x4_t pi2_src1_val1;
+ int16x4_t pi2_src1_val2;
+ int16x4_t pi2_src2_val1;
+ int16x4_t pi2_src2_val2;
+ int32x4_t i4_tmp1_t1;
+ int32x4_t i4_tmp1_t2;
+ int32x4_t i4_tmp2_t1;
+ int32x4_t i4_tmp2_t2;
+ int32x4_t sto_res_tmp1;
+ uint16x4_t sto_res_tmp2;
+ uint16x8_t sto_res_tmp3;
+ uint8x8_t sto_res;
+ int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
+ int32x4x2_t tmp_lvl_shift_t;
+ WORD32 tmp_shift = 0 - shift;
+ int32x4_t tmp_shift_t;
+ int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
+ int16x4x2_t wgt0, wgt1;
+ WORD16 *pi2_src_tmp1;
+ WORD16 *pi2_src_tmp2;
+ UWORD8 *pu1_dst_tmp;
+
+ WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
+ tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
+ tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);
+
+ tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
+ tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
+ tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);
+
+ tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);
+
+ tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+ tmp_wgt0_u = vdup_n_s16(wgt0_cb);
+ tmp_wgt0_v = vdup_n_s16(wgt0_cr);
+ wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
+ tmp_wgt1_u = vdup_n_s16(wgt1_cb);
+ tmp_wgt1_v = vdup_n_s16(wgt1_cr);
+ wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);
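+ /* As in the chroma uni path above, zipping yields interleaved
+ {cb, cr, cb, cr} weight vectors so one multiply covers both planes. */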
+
+ /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time. */
+ /* The height loop is unrolled, so 2 rows are processed per iteration */
+ /* and the stores likewise handle two rows. */
+ /* vcombine_u16 is used because after narrowing we get a 16x4 value, */
+ /* which cannot be saturated and narrowed again directly. */
+
+ for(row = ht; row > 0; row -= 2)
+ {
+ for(col = 2 * wd; col > 0; col -= 4)
+ {
+ pi2_src_tmp1 = pi2_src1 + src_strd1;
+ pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+ pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+ pi2_src1 += 4;
+ pu1_dst_tmp = pu1_dst + dst_strd;
+
+ pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+ pi2_src2 += 4;
+ i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);
+
+ pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+ i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);
+
+ pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+ i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);
+
+ i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
+ sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+ sto_res = vqmovn_u16(sto_res_tmp3);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst += 4;
+
+ sto_res = vqmovn_u16(sto_res_tmp3);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+ }
+ pi2_src1 += 2 * src_strd1 - 2 * wd;
+ pi2_src2 += 2 * src_strd2 - 2 * wd;
+ pu1_dst += 2 * dst_strd - 2 * wd;
+ }
+}
+//WEIGHTED_PRED_CHROMA_BI
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed by pu1_dst.
+* Assumptions: the function is optimized assuming width and height are
+* multiples of 2.
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift, where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ int16x4_t pi2_src1_val1;
+ int16x4_t pi2_src1_val2;
+ int16x4_t pi2_src2_val1;
+ int16x4_t pi2_src2_val2;
+ int32x4_t i4_tmp1_t1;
+ int32x4_t i4_tmp1_t2;
+ int32x4_t i4_tmp2_t1;
+ int32x4_t i4_tmp2_t2;
+ int32x4_t sto_res_tmp1;
+ uint16x4_t sto_res_tmp2;
+ uint16x8_t sto_res_tmp3;
+ uint8x8_t sto_res;
+ int32x4_t tmp_lvl_shift_t;
+ int32x4_t tmp_shift_t;
+ WORD16 *pi2_src_tmp1;
+ WORD16 *pi2_src_tmp2;
+ UWORD8 *pu1_dst_tmp;
+ WORD32 shift;
+ WORD32 tmp_shift;
+ WORD32 tmp_lvl_shift;
+ int16x4_t lvl_shift1_t;
+ int16x4_t lvl_shift2_t;
+
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ tmp_shift = 0 - shift;
+ tmp_lvl_shift = 1 << (shift - 1);
+ tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+ tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+ lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
+ lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
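+ /* With 8-bit content SHIFT_14_MINUS_BIT_DEPTH is 6, so shift = 7 and
+ the rounding term tmp_lvl_shift is 64, matching the assembly version. */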
+
+ /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time. */
+ /* The height loop is unrolled, so 2 rows are processed per iteration */
+ /* and the stores likewise handle two rows. */
+ /* vcombine_u16 is used because after narrowing we get a 16x4 value, */
+ /* which cannot be saturated and narrowed again directly. */
+
+ for(row = ht; row > 0; row -= 2)
+ {
+ for(col = wd; col > 0; col -= 4)
+ {
+ pi2_src_tmp1 = pi2_src1 + src_strd1;
+ pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+ pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+ pi2_src1 += 4;
+ pu1_dst_tmp = pu1_dst + dst_strd;
+
+ pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+ pi2_src2 += 4;
+ i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
+
+ pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+ i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
+
+ pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+ i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
+
+ i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
+ sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+ sto_res = vqmovn_u16(sto_res_tmp3);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst += 4;
+
+ sto_res = vqmovn_u16(sto_res_tmp3);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+ }
+ pi2_src1 += 2 * src_strd1 - wd;
+ pi2_src2 += 2 * src_strd2 - wd;
+ pu1_dst += 2 * dst_strd - wd;
+ }
+}
+//WEIGHTED_PRED_BI_DEFAULT
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma default bi-weighted prediction on the arrays pointed by pi2_src1
+* and pi2_src2, storing the result at the location pointed by pu1_dst.
+* Assumptions: the function is optimized assuming width and height are
+* multiples of 2.
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift, where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ int16x4_t pi2_src1_val1;
+ int16x4_t pi2_src1_val2;
+ int16x4_t pi2_src2_val1;
+ int16x4_t pi2_src2_val2;
+ int32x4_t i4_tmp1_t1;
+ int32x4_t i4_tmp1_t2;
+ int32x4_t i4_tmp2_t1;
+ int32x4_t i4_tmp2_t2;
+ int32x4_t sto_res_tmp1;
+ uint16x4_t sto_res_tmp2;
+ uint16x8_t sto_res_tmp3;
+ uint8x8_t sto_res;
+ int32x4_t tmp_lvl_shift_t;
+ int32x4_t tmp_shift_t;
+ WORD16 *pi2_src_tmp1;
+ WORD16 *pi2_src_tmp2;
+ UWORD8 *pu1_dst_tmp;
+ WORD32 shift;
+ WORD32 tmp_shift;
+ WORD32 tmp_lvl_shift;
+ int16x4_t lvl_shift1_t;
+ int16x4_t lvl_shift2_t;
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ tmp_shift = 0 - shift;
+ tmp_lvl_shift = 1 << (shift - 1);
+ tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
+ tmp_shift_t = vmovq_n_s32(tmp_shift);
+
+ lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
+ lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
+
+ /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time. */
+ /* The height loop is unrolled, so 2 rows are processed per iteration */
+ /* and the stores likewise handle two rows. */
+ /* vcombine_u16 is used because after narrowing we get a 16x4 value, */
+ /* which cannot be saturated and narrowed again directly. */
+
+ for(row = ht; row > 0; row -= 2)
+ {
+ for(col = 2 * wd; col > 0; col -= 4)
+ {
+ pi2_src_tmp1 = pi2_src1 + src_strd1;
+ pi2_src_tmp2 = pi2_src2 + src_strd2;
+
+ pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
+ pi2_src1 += 4;
+ pu1_dst_tmp = pu1_dst + dst_strd;
+
+ pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
+ pi2_src2 += 4;
+ i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);
+
+ pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
+ i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);
+
+ pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
+
+ i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
+ i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
+
+ i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
+ sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+
+ i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
+ sto_res = vqmovn_u16(sto_res_tmp3);
+
+ sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
+ sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
+
+ vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
+ pu1_dst += 4;
+
+ sto_res = vqmovn_u16(sto_res_tmp3);
+ vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
+ }
+ pi2_src1 += 2 * src_strd1 - 2 * wd;
+ pi2_src2 += 2 * src_strd2 - 2 * wd;
+ pu1_dst += 2 * dst_strd - 2 * wd;
+ }
+}
+//WEIGHTED_PRED_CHROMA_BI_DEFAULT
diff --git a/common/arm/ihevc_weighted_pred_uni.s b/common/arm/ihevc_weighted_pred_uni.s
new file mode 100644
index 0000000..e9b69c1
--- /dev/null
+++ b/common/arm/ihevc_weighted_pred_uni.s
@@ -0,0 +1,219 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@*******************************************************************************
+@* @file
+@* ihevc_weighted_pred_uni.s
+@*
+@* @brief
+@* contains function definitions for weighted prediction used in inter
+@* prediction
+@*
+@* @author
+@* parthiban v
+@*
+@* @par list of functions:
+@* - ihevc_weighted_pred_uni()
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@/**
+@*******************************************************************************
+@*
+@* @brief
+@* does uni-weighted prediction on the array pointed by pi2_src and
+@* stores the result at the location pointed by pu1_dst. assumptions:
+@* the function is optimized assuming width and height are multiples of 2.
+@*
+@* @par description:
+@* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+@* offset
+@*
+@* @param[in] pi2_src
+@* pointer to the source
+@*
+@* @param[out] pu1_dst
+@* pointer to the destination
+@*
+@* @param[in] src_strd
+@* source stride
+@*
+@* @param[in] dst_strd
+@* destination stride
+@*
+@* @param[in] wgt0
+@* weight to be multiplied to the source
+@*
+@* @param[in] off0
+@* offset to be added after rounding and shifting
+@*
+@* @param[in] shift
+@* (14 - bit depth) + log2_weight_denominator
+@*
+@* @param[in] lvl_shift
+@* added before shift and offset
+@*
+@* @param[in] ht
+@* height of the source
+@*
+@* @param[in] wd
+@* width of the source
+@*
+@* @returns
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************
+@*/
+
+@void ihevc_weighted_pred_uni(word16 *pi2_src,
+@ uword8 *pu1_dst,
+@ word32 src_strd,
+@ word32 dst_strd,
+@ word32 wgt0,
+@ word32 off0,
+@ word32 shift,
+@ word32 lvl_shift,
+@ word32 ht,
+@ word32 wd)
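+
+@ for reference, a scalar sketch of the computation implemented below
+@ (illustrative only; clip_u8 denotes clipping to [0, 255], which the
+@ neon code gets from its saturating narrows):
+@
+@ for(row = 0; row < ht; row++)
+@ for(col = 0; col < wd; col++)
+@ pu1_dst[col] = clip_u8((((pi2_src[col] + lvl_shift) * wgt0 +
+@ (1 << (shift - 1))) >> shift) + off0);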
+
+@**************variables vs registers*****************************************
+@ r0 => *pi2_src
+@ r1 => *pu1_dst
+@ r2 => src_strd
+@ r3 => dst_strd
+@ r4 => wgt0
+@ r5 => off0
+@ r6 => shift
+@ r7 => lvl_shift
+@ r8 => ht
+@ r9 => wd
+
+.text
+.align 4
+
+
+
+
+.globl ihevc_weighted_pred_uni_a9q
+
+.type ihevc_weighted_pred_uni_a9q, %function
+
+ihevc_weighted_pred_uni_a9q:
+
+ stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
+
+ ldr r4,[sp,#40] @load wgt0
+ ldr r7,[sp,#52] @load lvl_shift
+ mov r11,#1
+ ldr r5,[sp,#44] @load off0
+ mul r10,r7,r4 @lvl_shift * wgt0
+ ldr r6,[sp,#48] @load shift
+ ldr r8,[sp,#56] @load ht
+ add r10,r10,r5,lsl r6 @lvl_shift * wgt0 + (off0 << shift)
+ ldr r9,[sp,#60] @load wd
+ sub r12,r6,#1
+ vmov.s16 d0[0],r4 @moved for scalar multiplication
+ lsl r2,r2,#1
+ vdup.u32 q14,r6 @vmovq_n_s32(tmp_shift)
+ add r10,r10,r11,lsl r12 @tmp_lvl_shift += (1 << (shift - 1))
+ vdup.s32 q15,r10 @vmovq_n_s32(tmp_lvl_shift)
+ vneg.s32 q14,q14
+ lsl r4,r9,#1
+
+ cmp r8,#0 @check ht == 0
+ beq end_loops @if equal, then end the function
+
+outer_loop:
+ cmp r9,#0 @check wd == 0
+ beq end_loops @if equal, then end the function
+
+core_loop:
+ add r5,r0,r2 @pi2_src_tmp = pi2_src + 2*src_strd (2x because pi2_src is a 16-bit pointer)
+ add r6,r1,r3 @pu1_dst_tmp = pu1_dst + dst_strd
+ vld1.s16 {d1},[r0]! @load and increment the pi2_src
+ vld1.s16 {d2},[r5],r2 @load and increment the pi2_src_tmp ii iteration
+ vmull.s16 q2,d1,d0[0] @vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
+
+ vadd.i32 q2,q2,q15 @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
+ vld1.s16 {d8},[r5],r2 @load and increment the pi2_src iii iteration
+
+ vmull.s16 q3,d2,d0[0] @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
+ vld1.s16 {d9},[r5],r2 @load and increment the pi2_src_tmp iv iteration
+
+ vshl.s32 q2,q2,q14 @vshlq_s32(i4_tmp1_t, tmp_shift_t)
+ vadd.i32 q3,q3,q15 @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
+
+ vmull.s16 q5,d8,d0[0] @vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+ vqmovun.s32 d4,q2 @vqmovun_s32(sto_res_tmp1)
+
+ vadd.i32 q5,q5,q15 @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+ vmov.s32 d5,d4 @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+
+ vshl.s32 q3,q3,q14 @vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
+
+ vmull.s16 q6,d9,d0[0] @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+ vqmovn.u16 d4,q2 @vqmovn_u16(sto_res_tmp3)
+
+ vshl.s32 q5,q5,q14 @vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
+ vqmovun.s32 d6,q3 @vqmovun_s32(sto_res_tmp1) ii iteration
+
+ vadd.i32 q6,q6,q15 @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+ vmov.s32 d7,d6 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+ vqmovun.s32 d10,q5 @vqmovun_s32(sto_res_tmp1) iii iteration
+
+ vshl.s32 q6,q6,q14 @vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
+ vst1.32 {d4[0]},[r1]! @store pu1_dst i iteration
+ vmov.s32 d11,d10 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+
+ vqmovn.u16 d6,q3 @vqmovn_u16(sto_res_tmp3) ii iteration
+ vst1.32 {d6[0]},[r6],r3 @store pu1_dst ii iteration
+
+ vqmovn.u16 d10,q5 @vqmovn_u16(sto_res_tmp3) iii iteration
+ vqmovun.s32 d12,q6 @vqmovun_s32(sto_res_tmp1) iv iteration
+
+ vmov.s32 d13,d12 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
+    vst1.32 {d10[0]},[r6],r3 @store pu1_dst iii iteration
+ vqmovn.u16 d12,q6 @vqmovn_u16(sto_res_tmp3) iv iteration
+
+ subs r9,r9,#4 @decrement wd by 4 and check for 0
+ vst1.32 {d12[0]},[r6],r3 @store pu1_dst iv iteration
+ bgt core_loop @if greater than 0 repeat the core loop again
+
+end_core_loop:
+    rsb r11,r4,r2,lsl #2 @(4*src_strd - wd) in bytes: r2 = 2*src_strd, r4 = 2*wd
+    subs r8,r8,#4 @decrement ht by 4
+    add r0,r0,r11 @pi2_src += 4*src_strd - wd (byte offsets are doubled since pi2_src is a 16-bit pointer)
+    asr r9,r4,#1
+    rsb r12,r9,r3,lsl #2 @4*dst_strd - wd
+    add r1,r1,r12 @pu1_dst += 4*dst_strd - wd
+    bgt core_loop @if ht is greater than 0, repeat from core_loop
+
+end_loops:
+ ldmfd sp!,{r4-r12,r15} @reload the registers from sp
+
+
diff --git a/common/arm64/ihevc_deblk_chroma_horz.s b/common/arm64/ihevc_deblk_chroma_horz.s
new file mode 100644
index 0000000..7097142
--- /dev/null
+++ b/common/arm64/ihevc_deblk_chroma_horz.s
@@ -0,0 +1,173 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///*******************************************************************************
+//* @file
+//* ihevc_deblk_chroma_horz.s
+//*
+//* @brief
+//* contains function definitions for deblocking of chroma horizontal
+//* edges. functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************/
+//void ihevc_deblk_chroma_horz(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 quant_param_p,
+// WORD32 quant_param_q,
+// WORD32 qp_offset_u,
+// WORD32 qp_offset_v,
+// WORD32 tc_offset_div2,
+// WORD32 filter_flag_p,
+// WORD32 filter_flag_q)
+//
+
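+// a rough sketch of the per-column filter implemented below, assuming the
+// standard hevc chroma deblock (p1,p0 are the rows above the horizontal
+// edge, q0,q1 the rows below, tc the threshold from gai4_ihevc_tc_table):
+//
+// delta = clip3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3);
+// if(filter_flag_p) p0 = clip_u8(p0 + delta);
+// if(filter_flag_q) q0 = clip_u8(q0 - delta);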
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_horz_av8
+
+.type ihevc_deblk_chroma_horz_av8, %function
+
+ihevc_deblk_chroma_horz_av8:
+ sxtw x4,w4
+ sxtw x5,w5
+ sxtw x6,w6
+ ldr w9, [sp]
+ sxtw x9,w9
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ mov x10, x4
+ mov x8, x7
+ mov x7, x5
+ mov x4, x6
+
+ sub x12,x0,x1
+ ld1 {v0.8b},[x0]
+ sub x5,x12,x1
+ add x6,x0,x1
+ add x1,x2,x3
+ uxtl v0.8h, v0.8b
+ ld1 {v2.8b},[x12]
+ add x2,x1,#1
+ ld1 {v4.8b},[x5]
+ ld1 {v16.8b},[x6]
+ adds x1,x10,x2,asr #1
+ uxtl v2.8h, v2.8b
+ adrp x3, :got:gai4_ihevc_qp_table
+ ldr x3, [x3, #:got_lo12:gai4_ihevc_qp_table]
+ bmi l1.3312
+ cmp x1,#0x39
+ bgt lbl78
+ ldr w1, [x3,x1,lsl #2]
+lbl78:
+ sub x20,x1,#6
+ csel x1, x20, x1,gt
+l1.3312:
+ adds x2,x7,x2,asr #1
+ uxtl v4.8h, v4.8b
+ bmi l1.3332
+ cmp x2,#0x39
+ bgt lbl85
+ ldr w2, [x3,x2,lsl #2]
+lbl85:
+ sub x20,x2,#6
+ csel x2, x20, x2,gt
+l1.3332:
+ add x1,x1,x4,lsl #1
+ sub v6.8h, v0.8h , v2.8h
+ add x3,x1,#2
+ cmp x3,#0x35
+ mov x20,#0x35
+ csel x1, x20, x1,gt
+ shl v6.8h, v6.8h,#2
+ uxtl v16.8h, v16.8b
+ bgt l1.3368
+ adds x3,x1,#2
+ add x20,x1,#2
+ csel x1, x20, x1,pl
+ mov x20,#0
+ csel x1, x20, x1,mi
+l1.3368:
+ adrp x3, :got:gai4_ihevc_tc_table
+ ldr x3, [x3, #:got_lo12:gai4_ihevc_tc_table]
+ add v4.8h, v6.8h , v4.8h
+ add x2,x2,x4,lsl #1
+ sub v6.8h, v4.8h , v16.8h
+ add x4,x2,#2
+ ldr w1, [x3,x1,lsl #2]
+ cmp x4,#0x35
+ mov x20,#0x35
+ csel x2, x20, x2,gt
+ bgt l1.3412
+ adds x4,x2,#2
+ add x20,x2,#2
+ csel x2, x20, x2,pl
+ mov x20,#0
+ csel x2, x20, x2,mi
+l1.3412:
+
+
+ ldr w2, [x3,x2,lsl #2]
+ cmp x8,#0
+ dup v31.8h,w2
+ dup v30.8h,w1
+ sub x20,x1,#0
+ neg x1, x20
+ srshr v6.8h, v6.8h,#3
+ dup v28.8h,w1
+ sub x20,x2,#0
+ neg x1, x20
+ zip1 v4.8h, v30.8h, v31.8h
+ dup v29.8h,w1
+
+ zip1 v18.8h, v28.8h, v29.8h
+
+ smin v16.8h, v6.8h , v4.8h
+ smax v4.8h, v18.8h , v16.8h
+ add v2.8h, v2.8h , v4.8h
+ sub v0.8h, v0.8h , v4.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v0.8b, v0.8h
+ beq l1.3528
+ st1 {v2.8b},[x12]
+l1.3528:
+ cmp x9,#0
+ beq l1.3540
+ st1 {v0.8b},[x0]
+l1.3540:
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_deblk_chroma_vert.s b/common/arm64/ihevc_deblk_chroma_vert.s
new file mode 100644
index 0000000..dcb1f25
--- /dev/null
+++ b/common/arm64/ihevc_deblk_chroma_vert.s
@@ -0,0 +1,211 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* @file
+//* ihevc_deblk_chroma_vert.s
+//*
+//* @brief
+//* contains function definitions for deblocking of chroma vertical
+//* edges. functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************/
+//void ihevc_deblk_chroma_vert(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// WORD32 quant_param_p,
+// WORD32 quant_param_q,
+// WORD32 qp_offset_u,
+// WORD32 qp_offset_v,
+// WORD32 tc_offset_div2,
+// WORD32 filter_flag_p,
+// WORD32 filter_flag_q)
+
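+// operates on a vertical edge: 4 rows of 8 interleaved chroma pixels are
+// loaded around the edge and transposed with trn1/trn2 so that p1,p0,q0,q1
+// land in separate vectors; the same chroma filter as the horizontal case,
+// delta = clip3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3),
+// is then applied to all rows at once and the results transposed back
+// before the conditional stores (outline only, inferred from the code below)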
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.extern gai4_ihevc_qp_table
+.extern gai4_ihevc_tc_table
+.globl ihevc_deblk_chroma_vert_av8
+
+.type ihevc_deblk_chroma_vert_av8, %function
+
+ihevc_deblk_chroma_vert_av8:
+ sxtw x4,w4
+ sxtw x5,w5
+ sxtw x6,w6
+ mov x15,x5
+ mov x5,x6
+ mov x6,x15
+ mov x12, x7
+ mov x7, x4
+ ldr w4, [sp]
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ sub x8,x0,#4
+ add x2,x2,x3
+ ld1 {v5.8b},[x8],x1
+ add x2,x2,#1
+ ld1 {v17.8b},[x8],x1
+ ld1 {v16.8b},[x8],x1
+ ld1 {v4.8b},[x8]
+
+ trn1 v29.8b, v5.8b, v17.8b
+ trn2 v17.8b, v5.8b, v17.8b
+ mov v5.d[0], v29.d[0]
+ adds x3,x7,x2,asr #1
+ trn1 v29.8b, v16.8b, v4.8b
+ trn2 v4.8b, v16.8b, v4.8b
+ mov v16.d[0], v29.d[0]
+ adrp x7, :got:gai4_ihevc_qp_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_qp_table]
+
+
+ bmi l1.2944
+ cmp x3,#0x39
+ bgt lbl78
+ ldr w3, [x7,x3,lsl #2]
+ sxtw x3,w3
+lbl78:
+ sub x20,x3,#6
+ csel x3, x20, x3,gt
+l1.2944:
+ trn1 v29.4h, v5.4h, v16.4h
+ trn2 v16.4h, v5.4h, v16.4h
+ mov v5.d[0], v29.d[0]
+ adds x2,x6,x2,asr #1
+ trn1 v29.4h, v17.4h, v4.4h
+ trn2 v4.4h, v17.4h, v4.4h
+ mov v17.d[0], v29.d[0]
+ bmi l1.2964
+ cmp x2,#0x39
+ bgt lbl86
+ ldr w2, [x7,x2,lsl #2]
+ sxtw x2,w2
+lbl86:
+ sub x20,x2,#6
+ csel x2, x20, x2,gt
+l1.2964:
+ trn1 v29.2s, v5.2s, v17.2s
+ trn2 v17.2s, v5.2s, v17.2s
+ mov v5.d[0], v29.d[0]
+ add x3,x3,x5,lsl #1
+ trn1 v29.2s, v16.2s, v4.2s
+ trn2 v4.2s, v16.2s, v4.2s
+ mov v16.d[0], v29.d[0]
+ add x6,x3,#2
+ uxtl v18.8h, v17.8b
+ cmp x6,#0x35
+ mov x20,#0x35
+ csel x3, x20, x3,gt
+ bgt l1.2996
+ adds x6,x3,#2
+ add x20,x3,#2
+ csel x3, x20, x3,pl
+ mov x20,#0
+ csel x3, x20, x3,mi
+l1.2996:
+ usubl v0.8h, v17.8b, v16.8b
+ adrp x6, :got:gai4_ihevc_tc_table
+ ldr x6, [x6, #:got_lo12:gai4_ihevc_tc_table]
+ shl v0.8h, v0.8h,#2
+ add x2,x2,x5,lsl #1
+ add x5,x2,#2
+ uaddw v0.8h, v0.8h , v5.8b
+ cmp x5,#0x35
+ ldr w3, [x6,x3,lsl #2]
+ sxtw x3,w3
+ usubw v4.8h, v0.8h , v4.8b
+ mov x20,#0x35
+ csel x2, x20, x2,gt
+ bgt l1.3036
+ adds x5,x2,#2
+ add x20,x2,#2
+ csel x2, x20, x2,pl
+ mov x20,#0
+ csel x2, x20, x2,mi
+l1.3036:
+
+
+ srshr v6.8h, v4.8h,#3
+ dup v2.4h,w3
+ ldr w2, [x6,x2,lsl #2]
+ sxtw x2,w2
+ sub x20,x3,#0
+ neg x3, x20
+ cmp x12,#0
+ dup v3.4h,w2
+ sub x20,x2,#0
+ neg x2, x20
+ dup v30.4h,w3
+ dup v31.4h,w2
+
+ mov v30.d[1],v31.d[0]
+ mov v2.d[1],v3.d[0]
+
+ smin v4.8h, v6.8h , v2.8h
+ smax v2.8h, v30.8h , v4.8h
+
+ uxtl v6.8h, v16.8b
+
+ add v0.8h, v6.8h , v2.8h
+ sub v2.8h, v18.8h , v2.8h
+ sqxtun v0.8b, v0.8h
+ sub x2,x0,#2
+ sqxtun v1.8b, v2.8h
+ trn1 v29.2s, v0.2s, v1.2s
+ trn2 v1.2s, v0.2s, v1.2s
+ mov v0.d[0], v29.d[0]
+ trn1 v29.8b, v0.8b, v1.8b
+ trn2 v1.8b, v0.8b, v1.8b
+ mov v0.d[0], v29.d[0]
+ beq l1.3204
+
+ st1 {v0.h}[0],[x2],x1
+ st1 {v1.h}[0],[x2],x1
+ st1 {v0.h}[1],[x2],x1
+ st1 {v1.h}[1],[x2]
+l1.3204:
+ cmp x4,#0
+ beq l1.3228
+ st1 {v0.h}[2],[x0],x1
+ st1 {v1.h}[2],[x0],x1
+ st1 {v0.h}[3],[x0],x1
+ st1 {v1.h}[3],[x0]
+l1.3228:
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
new file mode 100644
index 0000000..a5c314d
--- /dev/null
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -0,0 +1,586 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///*******************************************************************************
+//* @file
+//* ihevc_deblk_luma_horz.s
+//*
+//* @brief
+//* contains function definitions for deblocking of luma horizontal
+//* edges. functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************/
+
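+// outline of the filter decisions implemented below (a sketch of the
+// standard hevc luma deblock, with illustrative names):
+//
+// dp0 = abs(p2 - 2*p1 + p0)    dq0 = abs(q2 - 2*q1 + q0)    // column 0
+// dp3, dq3                                                  // same, column 3
+// d = (dp0 + dq0) + (dp3 + dq3)
+// if(d < beta)
+// {
+//     de  = 2 if the strong-filter checks below hold, else 1
+//     dep = (dp0 + dp3) < ((beta + (beta >> 1)) >> 3)
+//     deq = (dq0 + dq3) < ((beta + (beta >> 1)) >> 3)
+// }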
+.text
+.align 4
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+.globl ihevc_deblk_luma_horz_av8
+
+.type ihevc_deblk_luma_horz_av8, %function
+
+ihevc_deblk_luma_horz_av8:
+ // stmfd sp!, {x3-x12,x14}
+ sxtw x5,w5
+ sxtw x6,w6
+ stp d8,d9,[sp,#-16]!
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+
+ mov x21,x7
+ ldr w22,[sp,#96]
+
+ add x3,x3,x4
+ add x3,x3,#1
+ asr x3,x3,#1
+ add x7,x3,x5,lsl #1
+ add x3,x3,x6,lsl #1
+ cmp x7,#0x33
+ mov x20,#0x33
+ csel x7, x20, x7,gt
+ bgt l1.1532
+ cmp x7,#0x0
+ mov x20,#0x0
+ csel x7, x20, x7,lt // x7 has the beta_index value
+l1.1532:
+ // bic x2,x2,#1
+ asr x2,x2,#1
+
+ add x3,x3,x2,lsl #1
+ cmp x3,#0x35
+ mov x20,#0x35
+ csel x3, x20, x3,gt
+ bgt l1.1564
+ cmp x3,#0x0
+ mov x20,#0x0
+ csel x3, x20, x3,lt // x3 has the tc_index value
+
+ // qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
+ // beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
+ // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
+
+l1.1564:
+ adrp x2, :got:gai4_ihevc_beta_table
+ ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
+
+ adrp x4, :got:gai4_ihevc_tc_table
+ ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
+
+ ldr w5, [x2,x7,lsl #2] // beta
+ ldr w6, [x4,x3,lsl #2] // tc
+
+
+
+ cmp x6,#0
+ beq l1.2404
+ movi v0.4h, #0x2
+ lsl x7,x6,#1
+ add x14,x1,x1,lsl #1
+ neg x19,x14
+ ldr w8, [x0,x19] // -3 value
+ dup v1.8b,w7
+ lsl x19,x1,#1
+ neg x19,x19
+ ldr w10, [x0,x19] //-2 value
+ dup v23.2s,w8 // -3 value
+ neg x19,x1
+ ldr w11, [x0,x19] //-1 value
+ dup v24.2s,w10 // -2 value
+ and x8,x8,#0xff
+ ldr w12, [x0,#0] // 0 value
+ dup v25.2s,w11 // -1 value
+ and x10,x10,#0xff
+ ldr w9, [x0,x1] // 1 value
+ dup v26.2s,w12 // 0 value
+ and x11,x11,#0xff
+ lsl x19,x1,#1
+ ldr w2, [x0,x19] // 2 value
+ dup v27.2s,w9 // 1value
+ and x12,x12,#0xff
+ dup v28.2s,w2 // 2 value
+ and x9,x9,#0xff
+ and x2,x2,#0xff
+
+ add x12,x12,x2
+ subs x9,x12,x9,lsl #1 // dq0 value is stored in x9
+ csneg x9,x9,x9,pl
+ //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
+
+ add x8,x8,x11
+ subs x8,x8,x10,lsl #1
+ csneg x8,x8,x8,pl // dp0 value is stored in x8
+ // dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
+
+
+
+ add x3,x1,x1,lsl #1
+ add x14,x0,#3
+
+
+ neg x19,x3
+    ldrb w2,[x14,x19] // -3 value
+ lsl x19,x1,#1
+ neg x19,x19
+ ldrb w10,[x14,x19] // -2 value
+ neg x19,x1
+ ldrb w11,[x14,x19] // -1 value
+ ldrb w12,[x14,#0] // 0 value
+ ldrb w3,[x14,x1] // 1 value
+ lsl x19,x1,#1
+ ldrb w4,[x14,x19] // 2 value
+
+
+ add x12,x12,x4
+    subs x12,x12,x3,lsl #1 // dq3 value is stored in x12
+ csneg x12,x12,x12,pl
+ // dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
+
+
+ add x2,x2,x11
+ subs x11,x2,x10,lsl #1
+    csneg x11,x11,x11,pl // dp3 value is stored in x11
+ // dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )@
+
+
+
+ add x3,x8,x9 // x3 has the d0 value
+ add x4,x11,x12 // x4 has the d3 value
+
+
+ // d0 = dp0 + dq0@
+ // d3 = dp3 + dq3@
+
+    add x14,x8,x11 // x14 has the value dp
+ add x12,x12,x9 // x12 has the value dq
+ // dp = dp0 + dp3@
+ // dq = dq0 + dq3@
+
+    add x11, x3, x4 // x11 has the value d
+
+ // d = d0 + d3@
+
+
+ cmp x11,x5
+ bge l1.2404
+
+ // if(d < beta)
+
+
+    // registers which cannot be altered : x3,x4,x5,x6,x12,x14,x0,x1,x11
+
+ // registers for use: x2,x7,x8,x9,x10,
+
+ asr x10,x5,#2
+ uqadd v30.8b, v26.8b , v1.8b
+ cmp x10,x3,lsl #1
+ uqsub v31.8b, v26.8b , v1.8b
+ ble l1.1840
+ add x10,x1,x1,lsl #1
+ uaddl v6.8h, v25.8b , v26.8b
+ neg x19,x1
+ ldr w2, [x0,x19,lsl #2] // has the -4 value
+ neg x19, x1
+ ldrb w7,[x0,x19] // has the -1 value
+ dup v22.2s,w2 // -4 value
+ uaddw v8.8h, v6.8h , v27.8b
+    ldrb w3,[x0,#0] // has the 0 value
+ uqadd v16.8b, v27.8b , v1.8b
+ and x2,x2,#0xff
+ mul v12.8h, v8.8h, v0.4h[0]
+ ldr w8, [x0,x10] // has the 3 value
+ uaddl v10.8h, v24.8b , v28.8b
+ subs x2,x2,x7
+ uqsub v17.8b, v27.8b , v1.8b
+ dup v29.2s,w8 // 3 value
+ and x8,x8,#0xff
+ add v12.8h, v12.8h , v10.8h
+ csneg x2,x2,x2,pl
+ rshrn v20.8b, v12.8h,#3
+ subs x8,x8,x3
+ csneg x8,x8,x8,pl
+ umin v18.8b, v20.8b , v30.8b
+ add x8,x8,x2
+
+ cmp x8,x5,asr #3
+ bge l1.1840
+ uaddw v14.8h, v8.8h , v28.8b
+ subs x7,x3,x7
+ umax v4.8b, v18.8b , v31.8b
+ csneg x7,x7,x7,pl
+ uqadd v30.8b, v28.8b , v1.8b
+ mov x10,#5
+ rshrn v21.8b, v14.8h,#2
+ mul x10, x10, x6
+ uqsub v31.8b, v28.8b , v1.8b
+ add x10, x10,#1
+ cmp x7,x10,asr #1
+ umin v18.8b, v21.8b , v16.8b
+ bge l1.1840
+
+
+ // if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+ // && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
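+    // when these checks pass for both edge columns, de = 2 and the strong
+    // filter values being built in the vector lanes are used, e.g.
+    // q0' = clip3(q0 - 2*tc, q0 + 2*tc,
+    //             (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3)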
+
+ umax v5.8b, v18.8b , v17.8b
+ asr x10,x5,#2
+ uaddl v16.8h, v29.8b , v28.8b
+ cmp x10,x4,lsl #1
+ ble l1.1840
+
+ add x10,x1,x1,lsl #1
+ mul v16.8h, v16.8h, v0.4h[0]
+ add x4,x0,#3
+
+
+ lsl x19,x1,#2
+ neg x19,x19
+ ldrb w2,[x4,x19]
+ add v16.8h, v16.8h , v14.8h
+ neg x19,x1
+ ldrb w7,[x4,x19]
+ rshrn v19.8b, v16.8h,#3
+ ldrb w3,[x4,#0]
+ ldrb w8,[x4,x10]
+ // ubfx x7,x2,#24,#8 @ has the -1 value
+ // and x2,#0xff @ has the -4 value
+ // ubfx x8,x3,#24,#8 @ has the 3 value
+ // and x3,#0xff @ x4 has the 0 value
+
+
+
+ subs x8,x8,x3
+ umin v18.8b, v19.8b , v30.8b
+ csneg x8,x8,x8,pl
+ uaddl v6.8h, v25.8b , v24.8b
+ subs x2,x2,x7
+ umax v3.8b, v18.8b , v31.8b
+ csneg x2,x2,x2,pl
+ uaddw v8.8h, v6.8h , v26.8b
+ add x8,x8,x2
+ uqadd v30.8b, v25.8b , v1.8b
+ cmp x8,x5,asr #3
+ uqsub v31.8b, v25.8b , v1.8b
+ bge l1.1840
+ mul v12.8h, v8.8h, v0.4h[0]
+ subs x7,x3,x7
+ uqadd v16.8b, v24.8b , v1.8b
+ csneg x7,x7,x7,pl
+ uaddl v10.8h, v23.8b , v27.8b
+ mov x10,#5
+ uqsub v17.8b, v24.8b , v1.8b
+ mul x10, x10, x6
+ add v12.8h, v12.8h , v10.8h
+ add x10, x10,#1
+ rshrn v20.8b, v12.8h,#3
+ cmp x7,x10,asr #1
+ uaddw v14.8h, v8.8h , v23.8b
+ bge l1.1840
+ umin v18.8b, v20.8b , v30.8b
+ mov x2,#2
+ uqadd v30.8b, v23.8b , v1.8b
+ mov w4,w21
+ umax v2.8b, v18.8b , v31.8b
+ mov w5,w22
+ rshrn v21.8b, v14.8h,#2
+ b end_dep_deq_decision_horz
+ // x2 has the value of de
+    // x6 has the value of tc
+ // x5 has the value of beta
+ // x14 has the value of dp
+ // x12 has the value of dq
+ // x0 has the value of source address
+ // x1 has the src stride
+
+l1.1840:
+ mov x2,#1
+
+ mov x11,x5
+ mov w4,w21
+ mov w5,w22
+
+ cmp x6,#1
+ mov x20,#0
+ csel x9, x20, x9,eq
+ mov x20,#0
+ csel x10, x20, x10,eq
+ beq end_dep_deq_decision_horz
+
+ and x7,x4,x5
+ cmp x7,#1
+ beq both_flags_set_horz
+ cmp x4,#0
+ beq set_flag_dep_zero_horz
+
+
+ add x8,x11,x11,asr #1
+ mov x10,#0
+ asr x8,x8,#3
+ cmp x8,x14
+ mov x20,#1
+ csel x9, x20, x9,gt
+ mov x20,#0
+ csel x9, x20, x9,le
+ b end_dep_deq_decision_horz
+set_flag_dep_zero_horz:
+
+ add x8,x11,x11,asr #1
+ mov x9,#0
+ asr x8,x8,#3
+ cmp x8,x12
+ mov x20,#1
+ csel x10, x20, x10,gt
+ mov x20,#0
+ csel x10, x20, x10,le
+ b end_dep_deq_decision_horz
+
+both_flags_set_horz:
+ add x8,x11,x11,asr #1
+ asr x8,x8,#3
+ cmp x8,x14
+ mov x20,#1
+ csel x9, x20, x9,gt
+ mov x20,#0
+ csel x9, x20, x9,le
+ cmp x8,x12
+ mov x20,#1
+ csel x10, x20, x10,gt
+ mov x20,#0
+ csel x10, x20, x10,le
+end_dep_deq_decision_horz:
+
+ //x0=source address
+ //x1=stride
+ // x2 =de
+ // x4=flag p
+ //x5= flag q
+ //x6 =tc
+ // x9 =dep
+ // x10=deq
+
+
+
+ // add x14,x1,x1,lsl #1
+ // lsl x7,x6,#1
+ // vdup.8 d1,x7
+ // vmov.i16 d0,#0x2
+ umin v18.8b, v21.8b , v16.8b
+ cmp x2,#1
+ uqsub v31.8b, v23.8b , v1.8b
+ beq l1.2408
+ uaddl v8.8h, v23.8b , v22.8b
+ cmp x5,#1
+
+ bne strong_filtering_p
+
+strong_filtering_q:
+ mov x12,x0
+ st1 {v4.s}[0],[x12],x1
+ st1 {v5.s}[0],[x12],x1
+ st1 {v3.s}[0],[x12]
+ cmp x4,#1
+ bne l1.2404
+strong_filtering_p:
+ umax v5.8b, v18.8b , v17.8b
+ mov x12,x0
+ mul v8.8h, v8.8h, v0.4h[0]
+ sub x20,x1,#0
+ neg x11, x20
+ add v16.8h, v8.8h , v14.8h
+ add x12,x12,x11
+ rshrn v19.8b, v16.8h,#3
+ st1 {v2.s}[0],[x12],x11
+ umin v18.8b, v19.8b , v30.8b
+ st1 {v5.s}[0],[x12],x11
+ umax v3.8b, v18.8b , v31.8b
+ st1 {v3.s}[0],[x12]
+
+l1.2404:
+ // ldmfd sp!, {x3-x12,pc}
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ret
+
+ // x4=flag p
+ //x5= flag q
+ //x6 =tc
+ // x9 =dep
+ // x10=deq
+
+
+ // d22 -4 value
+
+ //d23 @ -3 value
+
+ // vdup.32 d24,x11 @ -2 value
+
+ // vdup.32 d25, x11 @-1 value
+
+ // vdup.32 d26,x11 @ 0 value
+
+ // vdup.32 d27,x11 @ 1value
+
+ // vdup.32 d28,x11 @ 2 value
+
+ // vdup.32 d29,x11 @ 3 value
+
+l1.2408:
+
+ movi v0.4h, #0x9
+
+ usubl v10.8h, v26.8b , v25.8b
+
+ mul v10.8h, v10.8h, v0.4h[0]
+
+ movi v0.4h, #0x3
+
+ usubl v12.8h, v27.8b , v24.8b
+ mul v12.8h, v12.8h, v0.4h[0]
+
+
+ dup v30.8b,w6 // duplicating the +tc value
+
+ sub x20,x6,#0
+ neg x12, x20
+ dup v31.8b,w12 // duplicating the -tc value
+
+
+
+ sub v10.8h, v10.8h , v12.8h
+
+
+
+ srshr v10.8h, v10.8h,#4
+ // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
+
+ abs v8.8h, v10.8h
+ xtn v9.8b, v8.8h
+ // storing the absolute values of delta in d9
+
+ sqxtn v10.8b, v10.8h
+ // storing the clipped values of delta in d16
+
+
+ smin v11.8b, v10.8b , v30.8b
+ smax v8.8b, v31.8b , v11.8b // d8 has the value delta = clip3(delta, -tc, tc)//
+
+
+ uxtl v6.8h, v25.8b
+
+ saddw v4.8h, v6.8h , v8.8b
+
+ sqxtun v12.8b, v4.8h
+ uxtl v6.8h, v26.8b
+ ssubw v4.8h, v6.8h , v8.8b
+ sqxtun v13.8b, v4.8h
+
+
+ mov x11,#0xa
+ mul x12, x11, x6
+ dup v2.8b,w12 // d2 has the 10*tc value
+ mov v18.8b, v24.8b
+ dup v0.8b,w6
+ sshr v0.8b,v0.8b,#1
+ neg v1.8b, v0.8b
+
+ cmp x4,#1
+ bne l1.2724
+ cmp x9,#1
+ bne l1.2700
+
+ // d12 and d13 have the value temp_p0 and temp_q0
+ uaddl v14.8h, v23.8b , v25.8b
+ rshrn v14.8b, v14.8h,#1
+ usubl v14.8h, v14.8b , v24.8b
+ saddw v14.8h, v14.8h , v8.8b
+ sqshrn v14.8b, v14.8h,#1
+ smin v15.8b, v14.8b , v0.8b
+ smax v14.8b, v1.8b , v15.8b
+
+ // d14 has the delta p value
+ uxtl v16.8h, v24.8b
+ saddw v16.8h, v16.8h , v14.8b
+ sqxtun v14.8b, v16.8h
+
+ // d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
+ cmhs v18.8b,v9.8b,v2.8b
+ bsl v18.8b,v24.8b,v14.8b
+
+l1.2700:
+ mov x12,x0
+ sub x20,x1,#0
+ neg x11, x20
+ add x12,x12,x11
+ cmhs v19.8b,v9.8b,v2.8b
+ bsl v19.8b,v25.8b,v12.8b
+ st1 {v19.s}[0],[x12],x11
+ st1 {v18.s}[0],[x12]
+l1.2724:
+ cmp x5,#1
+ bne l1.2404
+ cmp x10,#1
+ mov v18.8b, v27.8b
+ bne l1.2852
+
+ uaddl v14.8h, v26.8b , v28.8b
+ rshrn v14.8b, v14.8h,#1
+ usubl v14.8h, v14.8b , v27.8b
+ ssubw v14.8h, v14.8h , v8.8b
+ sqshrn v14.8b, v14.8h,#1
+ smin v15.8b, v14.8b , v0.8b
+ smax v14.8b, v1.8b , v15.8b
+// d14 has the delta p value
+ uxtl v16.8h, v27.8b
+ saddw v16.8h, v16.8h , v14.8b
+ sqxtun v14.8b, v16.8h
+ cmhs v18.8b,v9.8b,v2.8b
+ bsl v18.8b,v27.8b,v14.8b
+l1.2852:
+ mov x12,x0
+ cmhs v19.8b,v9.8b,v2.8b
+ bsl v19.8b,v26.8b,v13.8b
+ st1 {v19.s}[0],[x12],x1
+ st1 {v18.s}[0],[x12]
+ // ldmfd sp!, {x3-x12,x15}
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ret
+
+
diff --git a/common/arm64/ihevc_deblk_luma_vert.s b/common/arm64/ihevc_deblk_luma_vert.s
new file mode 100644
index 0000000..bc3cc6c
--- /dev/null
+++ b/common/arm64/ihevc_deblk_luma_vert.s
@@ -0,0 +1,635 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevc_deblk_luma_vert.s
+//*
+//* //brief
+//* contains function definitions for deblocking of luma vertical
+//* edges. functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* //author
+//* anand s
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+
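+// vertical-edge variant of the luma deblock below; the normal (weak)
+// filter applied when de == 1 follows the standard form (sketch only,
+// with illustrative names):
+//
+// delta = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
+// if(abs(delta) < 10 * tc)
+// {
+//     delta = clip3(-tc, tc, delta)
+//     p0' = clip_u8(p0 + delta)    q0' = clip_u8(q0 - delta)
+//     if(dep) p1' = clip_u8(p1 + clip3(-(tc >> 1), tc >> 1,
+//                           (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1))
+//     if(deq) q1' = clip_u8(q1 + clip3(-(tc >> 1), tc >> 1,
+//                           (((q2 + q0 + 1) >> 1) - q1 - delta) >> 1))
+// }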
+.text
+.align 4
+
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+
+.globl ihevc_deblk_luma_vert_av8
+
+.type ihevc_deblk_luma_vert_av8, %function
+
+ihevc_deblk_luma_vert_av8:
+
+ sxtw x5,w5
+ sxtw x6,w6
+ stp d8,d9,[sp,#-16]!
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ mov x21,x7
+ ldr w22,[sp,#96]
+ add x3,x3,x4
+ add x3,x3,#1
+ asr x3,x3,#1
+ add x7,x3,x5,lsl #1
+ add x3,x3,x6,lsl #1
+ cmp x7,#0x33
+ mov x20,#0x33
+ csel x7, x20, x7,gt
+ bgt l1.56
+ cmp x7,#0x0
+ mov x20,#0x0
+ csel x7, x20, x7,lt // x7 has the beta_index value
+l1.56:
+
+// bic x2,x2,#1
+ asr x2,x2,#1
+
+ add x3,x3,x2,lsl #1
+ cmp x3,#0x35
+ mov x20,#0x35
+ csel x3, x20, x3,gt
+ bgt l1.88
+ cmp x3,#0x0
+ mov x20,#0x0
+ csel x3, x20, x3,lt // x3 has the tc_index value
+
+// qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
+// beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
+// tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//
+
+l1.88:
+ adrp x2, :got:gai4_ihevc_beta_table
+ ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
+
+ movi v18.8b, #0x2
+ adrp x4, :got:gai4_ihevc_tc_table
+ ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
+
+ ldr w5,[x2,x7,lsl #2] // beta
+ movi v16.8h, #0x2
+ ldr w6,[x4,x3,lsl #2] // tc
+ lsl x8,x6,#1
+ cmp x6,#0
+ dup v19.8b,w8
+ sub x7,x0,#4
+ movi v23.8b, #0x3
+ beq l1.964
+
+
+ sub x19,x0,#3
+ ld1 {v15.8b},[x7],x1
+ ldrb w8,[x19] // -3 value
+ ld1 {v1.8b},[x7],x1
+ ldrb w10,[x19,#1] //-2 value
+ ld1 {v29.8b},[x7],x1
+ ldrb w11,[x19,#2] //-1 value
+ ld1 {v0.8b},[x7]
+ ldrb w12,[x0,#0] // 0 value
+ ldrb w9,[x0,#1] // 1 value
+ trn1 v24.8b,v15.8b,v1.8b
+ trn2 v1.8b,v15.8b,v1.8b
+ ldrb w2,[x0,#2] // 2 value
+ trn1 v2.8b,v29.8b,v0.8b
+ trn2 v0.8b,v29.8b,v0.8b
+ add x12,x12,x2
+ subs x9,x12,x9,lsl #1 // dq0 value is stored in x9
+ csneg x9,x9,x9,pl
+//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
+ mov v29.8b,v24.8b
+ trn1 v24.4h,v29.4h,v2.4h
+ trn2 v2.4h,v29.4h,v2.4h
+ add x8,x8,x11
+ mov v15.8b,v1.8b
+ trn1 v1.4h,v15.4h,v0.4h
+ trn2 v0.4h,v15.4h,v0.4h
+ subs x8,x8,x10,lsl #1
+ csneg x8,x8,x8,pl
+// dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//
+
+
+
+ add x14,x1,x1,lsl #1
+ add x14,x0,x14
+
+ sub x19,x14,#3
+ dup v4.2s, v24.2s[1]
+    ldrb w2,[x19] // -3 value
+ dup v7.2s, v2.2s[1]
+ ldrb w10,[x19,#1] // -2 value
+ dup v3.2s, v2.2s[0]
+ ldrb w11,[x19,#2] // -1 value
+ dup v5.2s, v1.2s[1]
+ ldrb w12,[x14,#0] // 0 value
+ dup v6.2s, v1.2s[0]
+ ldrb w3,[x14,#1] // 1 value
+ dup v2.2s, v0.2s[0]
+ ldrb w4,[x14,#2] // 2 value
+
+
+ add x12,x12,x4
+    subs x12,x12,x3,lsl #1 // dq3 value is stored in x12
+ csneg x12,x12,x12,pl
+// dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//
+
+
+ add x2,x2,x11
+ subs x11,x2,x10,lsl #1
+    csneg x11,x11,x11,pl // dp3 value is stored in x11
+// dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )//
+
+
+
+ add x3,x8,x9 // x3 has the d0 value
+ add x4,x11,x12 // x4 has the d3 value
+
+
+// d0 = dp0 + dq0//
+// d3 = dp3 + dq3//
+
+    add x14,x8,x11 // x14 has the value dp
+ add x12,x12,x9 // x12 has the value dq
+// dp = dp0 + dp3//
+// dq = dq0 + dq3//
+
+    add x11, x3, x4 // x11 has the value d
+
+// d = d0 + d3//
+
+
+ cmp x11,x5
+ dup v22.2s, v0.2s[1]
+ bge l1.964
+
+// if(d < beta)
+
+
+    // registers which cannot be altered : x3,x4,x5,x6,x12,x14,x0,x1,x11
+
+ // registers for use: x2,x7,x8,x9,x10,
+ uqsub v30.8b,v7.8b,v19.8b
+ asr x10,x5,#2
+ uqadd v31.8b,v7.8b,v19.8b
+ cmp x10,x3,lsl #1
+ uaddl v0.8h,v5.8b,v4.8b
+ ble l1.336
+
+ sub x19,x0,4
+ ldrb w2,[x19]
+ uaddw v0.8h, v0.8h , v2.8b
+ ldrb w7,[x19,#3]
+ umull v20.8h, v7.8b, v23.8b
+ ldrb w3,[x0,#0]
+ umlal v20.8h, v22.8b, v18.8b
+ ldrb w8,[x0,#3]
+// ubfx x7,x2,#24,#8 // has the -1 value
+// and x2,#0xff // has the -4 value
+// ubfx x8,x3,#24,#8 // has the 3 value
+// and x3,#0xff // x4 has the 0 value
+
+ add v20.8h, v20.8h , v0.8h
+ subs x8,x8,x3
+ rshrn v22.8b,v20.8h,#3
+ csneg x8,x8,x8,pl
+ subs x2,x2,x7
+ umin v21.8b, v22.8b , v31.8b
+ csneg x2,x2,x2,pl
+ umax v22.8b, v21.8b , v30.8b
+ add x8,x8,x2
+ uaddl v20.8h,v7.8b,v3.8b
+ cmp x8,x5,asr #3
+ mla v20.8h, v0.8h, v16.8h
+ bge l1.336
+ uaddw v0.8h, v0.8h , v7.8b
+ subs x7,x3,x7
+ rshrn v20.8b,v20.8h,#3
+ csneg x7,x7,x7,pl
+ rshrn v0.8b,v0.8h,#2
+ mov x10,#5
+ uqadd v30.8b,v5.8b,v19.8b
+ mul x10, x10, x6
+ uqsub v31.8b,v5.8b,v19.8b
+ add x10, x10,#1
+ cmp x7,x10,asr #1
+ bge l1.336
+
+
+// if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+// && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
+
+
+ asr x10,x5,#2
+ uqsub v25.8b,v4.8b,v19.8b
+ cmp x10,x4,lsl #1
+ uqadd v21.8b,v4.8b,v19.8b
+ ble l1.336
+ umin v26.8b, v20.8b , v21.8b
+ add x4,x1,x1,lsl #1
+ add x4,x4,x0
+ umax v20.8b, v26.8b , v25.8b
+ sub x19,x4,#4
+ ldrb w2,[x19]
+ umin v19.8b, v0.8b , v30.8b
+ ldrb w7,[x19,#3]
+ umax v21.8b, v19.8b , v31.8b
+ ldrb w3,[x4,#0]
+ lsl x10,x6,#1
+ ldrb w8,[x4,#3]
+// ubfx x7,x2,#24,#8 // has the -1 value
+// and x2,#0xff // has the -4 value
+// ubfx x8,x3,#24,#8 // has the 3 value
+// and x3,#0xff // x4 has the 0 value
+ uaddl v0.8h,v2.8b,v3.8b
+ dup v19.8b,w10
+ subs x8,x8,x3
+ uaddw v0.8h, v0.8h , v4.8b
+ csneg x8,x8,x8,pl
+ uqadd v30.8b,v2.8b,v19.8b
+ subs x2,x2,x7
+ uqsub v31.8b,v2.8b,v19.8b
+ csneg x2,x2,x2,pl
+ uaddl v26.8h,v5.8b,v6.8b
+ add x8,x8,x2
+ mla v26.8h, v0.8h, v16.8h
+ cmp x8,x5,asr #3
+ bge l1.336
+ rshrn v26.8b,v26.8h,#3
+ subs x7,x3,x7
+ uqadd v27.8b,v3.8b,v19.8b
+ csneg x7,x7,x7,pl
+ uqsub v28.8b,v3.8b,v19.8b
+ mov x10,#5
+ umin v16.8b, v26.8b , v30.8b
+ mul x10, x10, x6
+ add x10, x10,#1
+ cmp x7,x10,asr #1
+ umax v26.8b, v16.8b , v31.8b
+ bge l1.336
+ uqadd v30.8b,v6.8b,v19.8b
+
+ mov x2,#2
+ mov x4,x21
+ uqsub v31.8b,v6.8b,v19.8b
+ mov x5,x22
+ b end_dep_deq_decision
+// x2 has the value of de
+// x6 has the value of tc
+// x5 has the value of beta
+// x14 has the value of dp
+// x12 has the value of dq
+// x0 has the value of source address
+// x1 has the src stride
+
+l1.336:
+ mov x2,#1
+l1.424:
+ mov x11,x5
+ mov x4,x21
+ mov x5,x22
+
+ cmp x6,#1
+ mov x20,#0
+ csel x9, x20, x9,eq
+ mov x20,#0
+ csel x10, x20, x10,eq
+ beq end_dep_deq_decision
+
+ and x7,x4,x5
+
+ cmp x7,#1
+ beq both_flags_set
+ cmp x4,#0
+ beq set_flag_dep_zero
+
+
+ add x8,x11,x11,asr #1
+ mov x10,#0
+ asr x8,x8,#3
+ cmp x8,x14
+ mov x20,#1
+ csel x9, x20, x9,gt
+ mov x20,#0
+ csel x9, x20, x9,le
+ b end_dep_deq_decision
+set_flag_dep_zero:
+
+ add x8,x11,x11,asr #1
+ mov x9,#0
+ asr x8,x8,#3
+ cmp x8,x12
+ mov x20,#1
+ csel x10, x20, x10,gt
+ mov x20,#0
+ csel x10, x20, x10,le
+ b end_dep_deq_decision
+
+both_flags_set:
+ add x8,x11,x11,asr #1
+ asr x8,x8,#3
+ cmp x8,x14
+ mov x20,#1
+ csel x9, x20, x9,gt
+ mov x20,#0
+ csel x9, x20, x9,le
+ cmp x8,x12
+ mov x20,#1
+ csel x10, x20, x10,gt
+ mov x20,#0
+ csel x10, x20, x10,le
+end_dep_deq_decision:
+
+//x0=source address
+//x1=stride
+// x2 =de
+// x4=flag p
+//x5= flag q
+//x6 =tc
+// x9 =dep
+// x10=deq
+// b l1.964
+
+
+ cmp x2,#2
+// x4 has the value of de
+ bne l1.968
+
+ cmp x5,#0
+ beq l1.780
+// x5 has the flag of q
+
+ add x3,x0,#2
+ st1 {v22.b}[0],[x3],x1
+
+ st1 {v22.b}[1],[x3],x1
+
+ st1 {v22.b}[2],[x3],x1
+
+ st1 {v22.b}[3],[x3]
+ add x3,x0,x1
+ mov v29.8b,v20.8b
+ trn1 v20.8b,v29.8b,v21.8b
+ trn2 v21.8b,v29.8b,v21.8b
+
+ st1 {v20.h}[0],[x0]
+ st1 {v21.h}[0],[x3],x1
+ st1 {v20.h}[1],[x3],x1
+ st1 {v21.h}[1],[x3]
+
+
+l1.780:
+ cmp x4,#0
+ beq l1.964
+ // x4 has the flag p
+
+
+ dup v7.2s, v24.2s[0]
+ sub x3,x0,#1
+ uaddw v16.8h, v0.8h , v6.8b
+ add x7,x3,x1
+ rshrn v2.8b,v16.8h,#2
+ st1 {v26.b}[0],[x3]
+ sub x0,x0,#3
+ umin v16.8b, v2.8b , v27.8b
+ st1 {v26.b}[1],[x7],x1
+ umull v2.8h, v6.8b, v23.8b
+ umlal v2.8h, v7.8b, v18.8b
+ st1 {v26.b}[2],[x7],x1
+ umax v5.8b, v16.8b , v28.8b
+ st1 {v26.b}[3],[x7]
+ add v0.8h, v2.8h , v0.8h
+ rshrn v0.8b,v0.8h,#3
+
+
+ umin v1.8b, v0.8b , v30.8b
+ umax v0.8b, v1.8b , v31.8b
+
+ mov v29.8b,v0.8b
+ trn1 v0.8b,v29.8b,v5.8b
+ trn2 v5.8b,v29.8b,v5.8b
+ st1 {v0.h}[0],[x0],x1
+ st1 {v5.h}[0],[x0],x1
+ st1 {v0.h}[1],[x0],x1
+ st1 {v5.h}[1],[x0]
+l1.964:
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ret
+
+l1.968:
+
+
+ movi v0.8h, #0x9
+ neg x11, x6
+ cmp x4,#0
+ // checks for the flag p
+ movi v16.8h, #0x3
+ movi v24.8b, #0x1
+
+
+ dup v30.8b,w11
+ and x11,x6,#0xff
+ dup v31.8b,w11
+
+ usubl v18.8h,v4.8b,v2.8b
+ mul v18.8h, v18.8h, v0.8h
+ usubl v0.8h,v5.8b,v3.8b
+
+
+
+ mul v16.8h, v0.8h, v16.8h
+ sub v16.8h, v18.8h , v16.8h
+ srshr v16.8h,v16.8h,#4
+// delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//
+
+ abs v0.8h, v16.8h
+ xtn v0.8b, v0.8h
+ // storing the absolute values of delta in d0
+
+ sqxtn v16.8b,v16.8h
+ // storing the clipped values of delta in d16
+
+ movi v1.8b, #0xa
+ dup v21.8b,w11
+ mul v1.8b, v1.8b, v21.8b
+ // d1 stores the value (10 * tc)
+
+//if(abs(delta) < 10 * tc)
+
+ smin v18.8b, v16.8b , v31.8b
+ smax v20.8b, v18.8b , v30.8b
+
+// delta = clip3(delta, -tc, tc)//
+ sxtl v16.8h, v20.8b
+ uxtl v18.8h, v2.8b
+ add v18.8h, v18.8h , v16.8h
+
+ sqxtun v22.8b, v18.8h
+ uxtl v18.8h, v4.8b
+ sub v16.8h, v18.8h , v16.8h
+ sqxtun v23.8b, v16.8h
+// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
+// tmp_q0 = clip_u8(pu1_src[0] - delta)//
+ beq l1.1272
+
+
+
+ cmp x9,#1
+ bne l1.1212
+// checks for the flag dep
+
+ asr x3,x6,#1
+
+
+ uaddl v16.8h,v6.8b,v2.8b
+ uaddw v16.8h, v16.8h , v24.8b
+ dup v18.8b,w3
+ sub x20,x3,#0
+ neg x3, x20
+ dup v19.8b,w3
+ ushr v16.8h,v16.8h,#1
+ xtn v16.8b, v16.8h
+
+ usubl v16.8h,v16.8b,v3.8b
+ saddw v16.8h, v16.8h , v20.8b
+ sshr v16.8h,v16.8h,#1
+ sqxtn v16.8b,v16.8h
+
+ smin v17.8b, v16.8b , v18.8b
+ smax v16.8b, v19.8b , v17.8b
+
+
+
+
+ uxtl v18.8h, v3.8b
+ sxtl v16.8h, v16.8b
+ add v16.8h, v18.8h , v16.8h
+
+ sqxtun v16.8b, v16.8h
+ mov v30.8b,v3.8b
+ cmhs v3.8b,v0.8b,v1.8b
+
+
+ bsl v3.8b,v30.8b,v16.8b
+l1.1212:
+ dup v16.8b,w11
+ sub x12,x0,#3
+ sub x3,x0,#1
+// smul v16.8b, v16.8b, v1.8b
+ mov v29.8b,v6.8b
+ trn1 v6.8b,v29.8b,v3.8b
+ trn2 v3.8b,v29.8b,v3.8b
+ st1 {v6.h}[0],[x12],x1
+ cmhs v16.8b,v0.8b,v1.8b
+ st1 {v3.h}[0],[x12],x1
+ bsl v16.8b,v2.8b,v22.8b
+ st1 {v16.b}[0],[x3],x1
+ st1 {v16.b}[1],[x3],x1
+ st1 {v6.h}[1],[x12],x1
+ st1 {v16.b}[2],[x3],x1
+ st1 {v3.h}[1],[x12]
+ st1 {v16.b}[3],[x3]
+l1.1272:
+ cmp x5,#0
+ beq l1.964
+ // checks for the flag q
+ cmp x10,#1
+ bne l1.1412
+ // checks for the flag deq
+ mov v2.8b,v7.8b
+ asr x3,x6,#1
+
+ dup v6.8b,w3
+ sub x20,x3,#0
+ neg x3, x20
+ dup v16.8b,w3
+ uaddl v2.8h,v2.8b,v4.8b
+ uaddw v2.8h, v2.8h , v24.8b
+ ushr v2.8h,v2.8h,#1
+ xtn v2.8b, v2.8h
+
+ usubl v2.8h,v2.8b,v5.8b
+ ssubw v2.8h, v2.8h , v20.8b
+ sshr v2.8h,v2.8h,#1
+ sqxtn v3.8b,v2.8h
+
+ smin v2.8b, v3.8b , v6.8b
+ smax v3.8b, v16.8b , v2.8b
+ // dup v6.8b,w2
+ // smul v6.8b, v6.8b, v1.8b
+
+
+
+ uxtl v16.8h, v5.8b
+ sxtl v2.8h, v3.8b
+ add v2.8h, v16.8h , v2.8h
+ sqxtun v3.8b, v2.8h
+ mov v30.8b,v5.8b
+ cmhs v5.8b,v0.8b,v1.8b
+
+
+ bsl v5.8b,v30.8b,v3.8b
+l1.1412:
+ // dup v2.8b,w2
+ add x3,x0,#2
+ add x11,x3,x1
+ // smul v1.8b, v2.8b, v1.8b
+ st1 {v7.b}[0],[x3]
+ st1 {v7.b}[1],[x11],x1
+ st1 {v7.b}[2],[x11],x1
+ cmhs v0.8b,v0.8b,v1.8b
+ st1 {v7.b}[3],[x11]
+ bsl v0.8b,v4.8b,v23.8b
+ mov v29.8b,v0.8b
+ trn1 v0.8b,v29.8b,v5.8b
+ trn2 v5.8b,v29.8b,v5.8b
+ st1 {v0.h}[0],[x0],x1
+ st1 {v5.h}[0],[x0],x1
+ st1 {v0.h}[1],[x0],x1
+ st1 {v5.h}[1],[x0]
+
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ret
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy.s b/common/arm64/ihevc_inter_pred_chroma_copy.s
new file mode 100644
index 0000000..7ac6855
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_copy.s
@@ -0,0 +1,256 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_inter_pred_chroma_copy.s
+//*
+//* @brief
+//* Contains function definitions for inter prediction interpolation.
+//* Functions are coded using NEON intrinsics and can be compiled using ARM
+//* RVCT
+//*
+//* @author
+//* Yogeswaran RS
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* Chroma interprediction filter for copy
+//*
+//* @par Description:
+//* Copies the array of width 'wd' and height 'ht' from the location pointed
+//* by 'src' to the location pointed by 'dst'
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* WORD8 pointer to the filter coefficients
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
+// UWORD8 *pu1_dst,
+// WORD32 src_strd,
+// WORD32 dst_strd,
+// WORD8 *pi1_coeff,
+// WORD32 ht,
+// WORD32 wd)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => *pu1_dst
+//x2 => src_strd
+//x3 => dst_strd
+//x4 => *pi1_coeff
+//x5 => ht
+//x6 => wd
+
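+//a c sketch of the behaviour (illustrative; chroma is interleaved uv, so
+//each row is 2*wd bytes wide):
+//
+// for(row = 0; row < ht; row++)
+// {
+//     memcpy(pu1_dst, pu1_src, 2 * wd);
+//     pu1_src += src_strd;
+//     pu1_dst += dst_strd;
+// }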
+.text
+.align 4
+
+.globl ihevc_inter_pred_chroma_copy_av8
+
+.type ihevc_inter_pred_chroma_copy_av8, %function
+
+ihevc_inter_pred_chroma_copy_av8:
+
+ LSL x12,x6,#1 //wd << 1
+ CMP x5,#0 //checks ht == 0
+ BLE END_LOOPS
+    AND x8,x5,#3 //ht % 4
+    SUB x5,x5,x8 //ht rounded down to a multiple of 4
+    TST x12,#15 //checks (wd << 1) for a multiple of 16
+    BEQ CORE_LOOP_WD_16
+    TST x12,#7 //checks (wd << 1) for a multiple of 8
+ BEQ CORE_LOOP_WD_8
+ SUB x11,x12,#4
+ CMP x5,#0
+ BEQ OUTER_LOOP_WD_4_HT_2
+
+OUTER_LOOP_WD_4:
+ SUBS x4,x12,#0 //checks wd == 0
+ BLE END_INNER_LOOP_WD_4
+
+INNER_LOOP_WD_4:
+ LD1 {v0.s}[0],[x0] //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ ADD x7,x0,x2 //pu1_src_tmp += src_strd
+ ADD x6,x1,x3 //pu1_dst_tmp += dst_strd
+ ST1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ LD1 {v0.s}[0],[x7],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ ADD x0,x0,#4 //pu1_src += 4
+ ST1 {v0.s}[0],[x6],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ LD1 {v0.s}[0],[x7],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ SUBS x4,x4,#4 //(wd -4)
+ ST1 {v0.s}[0],[x6],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ LD1 {v0.s}[0],[x7],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ ADD x1,x1,#4 //pu1_dst += 4
+ ST1 {v0.s}[0],[x6],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ BGT INNER_LOOP_WD_4
+
+END_INNER_LOOP_WD_4:
+ SUBS x5,x5,#4 //ht - 4
+ SUB x0,x7,x11 //pu1_src = pu1_src_tmp
+ SUB x1,x6,x11 //pu1_dst = pu1_dst_tmp
+ BGT OUTER_LOOP_WD_4
+ CMP x8,#0
+ BGT OUTER_LOOP_WD_4_HT_2
+
+END_LOOPS:
+ RET
+
+OUTER_LOOP_WD_4_HT_2:
+ SUBS x4,x12,#0 //checks wd == 0
+ BLE END_LOOPS
+
+INNER_LOOP_WD_4_HT_2:
+ LD1 {v0.s}[0],[x0] //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ ADD x7,x0,x2 //pu1_src_tmp += src_strd
+ ADD x6,x1,x3 //pu1_dst_tmp += dst_strd
+ ST1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ LD1 {v0.s}[0],[x7],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ ADD x0,x0,#4 //pu1_src += 4
+ ST1 {v0.s}[0],[x6],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ SUBS x4,x4,#4 //(wd -4)
+ ADD x1,x1,#4 //pu1_dst += 4
+ BGT INNER_LOOP_WD_4_HT_2
+ B END_LOOPS
+
+CORE_LOOP_WD_8:
+ SUB x11,x12,#8
+ CMP x5,#0
+ BEQ OUTER_LOOP_WD_8_HT_2
+
+OUTER_LOOP_WD_8:
+ SUBS x4,x12,#0 //checks wd
+ BLE END_INNER_LOOP_WD_8
+
+
+INNER_LOOP_WD_8:
+ ADD x7,x0,x2 //pu1_src_tmp += src_strd
+ LD1 {v0.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ADD x6,x1,x3 //pu1_dst_tmp += dst_strd
+ ST1 {v0.8b},[x1],#8 //vst1_u8(pu1_dst_tmp, tmp_src)
+ LD1 {v1.8b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v1.8b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ SUBS x4,x4,#8 //wd - 8(Loop condition)
+ LD1 {v2.8b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v2.8b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ LD1 {v3.8b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v3.8b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ BGT INNER_LOOP_WD_8
+
+END_INNER_LOOP_WD_8:
+ SUBS x5,x5,#4 //ht -= 4
+ SUB x0,x7,x11 //pu1_src = pu1_src_tmp
+ SUB x1,x6,x11 //pu1_dst = pu1_dst_tmp
+ BGT OUTER_LOOP_WD_8
+ CMP x8,#0
+ BGT OUTER_LOOP_WD_8_HT_2
+ B END_LOOPS
+
+OUTER_LOOP_WD_8_HT_2:
+ SUBS x4,x12,#0 //checks wd
+ BLE END_LOOPS
+
+INNER_LOOP_WD_8_HT_2:
+ ADD x7,x0,x2 //pu1_src_tmp += src_strd
+ LD1 {v0.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ADD x6,x1,x3 //pu1_dst_tmp += dst_strd
+ ST1 {v0.8b},[x1],#8 //vst1_u8(pu1_dst_tmp, tmp_src)
+ LD1 {v1.8b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v1.8b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ B END_LOOPS
+
+CORE_LOOP_WD_16:
+ SUB x11,x12,#16
+ CMP x5,#0
+ BEQ OUTER_LOOP_WD_16_HT_2
+
+OUTER_LOOP_WD_16:
+ SUBS x4,x12,#0 //checks wd
+ BLE END_INNER_LOOP_WD_16
+
+INNER_LOOP_WD_16:
+ ADD x7,x0,x2 //pu1_src_tmp += src_strd
+ LD1 {v0.16b},[x0],#16 //vld1_u8(pu1_src_tmp)
+ ADD x6,x1,x3 //pu1_dst_tmp += dst_strd
+ ST1 {v0.16b},[x1],#16 //vst1_u8(pu1_dst_tmp, tmp_src)
+ LD1 {v1.16b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v1.16b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ SUBS x4,x4,#16 //wd - 16(Loop condition)
+ LD1 {v2.16b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v2.16b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ LD1 {v3.16b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v3.16b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ BGT INNER_LOOP_WD_16
+
+END_INNER_LOOP_WD_16:
+ SUBS x5,x5,#4 //ht -= 4
+ SUB x0,x7,x11 //pu1_src = pu1_src_tmp
+ SUB x1,x6,x11 //pu1_dst = pu1_dst_tmp
+ BGT OUTER_LOOP_WD_16
+ CMP x8,#0
+ BGT OUTER_LOOP_WD_16_HT_2
+ B END_LOOPS
+
+OUTER_LOOP_WD_16_HT_2:
+ SUBS x4,x12,#0 //checks wd
+ BLE END_LOOPS
+
+INNER_LOOP_WD_16_HT_2:
+ ADD x7,x0,x2 //pu1_src_tmp += src_strd
+ LD1 {v0.16b},[x0],#16 //vld1_u8(pu1_src_tmp)
+ ADD x6,x1,x3 //pu1_dst_tmp += dst_strd
+ ST1 {v0.16b},[x1],#16 //vst1_u8(pu1_dst_tmp, tmp_src)
+ LD1 {v1.16b},[x7],x2 //vld1_u8(pu1_src_tmp)
+ ST1 {v1.16b},[x6],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+
+ RET
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
new file mode 100644
index 0000000..e479651
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
@@ -0,0 +1,348 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_copy_w16out.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//* yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* chroma interprediction filter for copy
+//*
+//* //par description:
+//* copies the array of width 'wd' and height 'ht' from the location pointed
+//* by 'src' to the location pointed by 'dst'
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 => src_strd
+//x3 => dst_strd
+//x4 => *pi1_coeff
+//x5 => ht
+//x6 => wd
+
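+// c sketch of the behaviour (illustrative): each 8-bit sample is widened
+// and scaled by 64 (<< 6) so the 16-bit output matches the intermediate
+// precision of the other inter-prediction filters:
+//
+// for(row = 0; row < ht; row++)
+// {
+//     for(col = 0; col < 2 * wd; col++)
+//         pi2_dst[col] = ((word16)pu1_src[col]) << 6;
+//     pu1_src += src_strd;
+//     pi2_dst += dst_strd;
+// }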
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_copy_w16out_av8
+
+.type ihevc_inter_pred_chroma_copy_w16out_av8, %function
+
+ihevc_inter_pred_chroma_copy_w16out_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+
+ mov x12,x17 //loads wd
+ lsl x12,x12,#1 //2*wd
+ mov x7,x16 //loads ht
+    cmp x7,#0 //checks ht == 0
+    ble end_loops //exit if ht <= 0
+    and x8,x7,#3 //ht % 4
+    sub x9,x7,x8 //ht rounded down to a multiple of 4
+ and x11,x7,#6
+ cmp x11,#6
+ beq loop_ht_6
+    tst x12,#7 //checks (wd << 1) for a multiple of 8
+ beq core_loop_wd_8
+
+loop_ht_6:
+ sub x11,x12,#4
+ lsl x6, x3,#1
+ adds x6, x6,#0
+ cmp x9,#0
+ beq outer_loop_wd_4_ht_2
+
+outer_loop_wd_4:
+ subs x4,x12,#0 //wd conditional subtract
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ ld1 {v0.8b},[x0] //vld1_u8(pu1_src_tmp)
+ add x5,x0,x2 //pu1_src +src_strd
+ uxtl v0.8h, v0.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ add x10,x1,x6
+ subs x4,x4,#4 //wd - 4
+ shl v0.2d, v0.2d,#6 //vshlq_n_s64(temp, 6)
+ ld1 {v22.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ add x0,x0,#4 //pu1_src += 4
+ st1 {v0.1d},[x1] //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add x1,x1,#8
+ uxtl v22.8h, v22.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ ld1 {v24.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ shl v22.2d, v22.2d,#6 //vshlq_n_s64(temp, 6)
+ uxtl v24.8h, v24.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ st1 {v22.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ shl v24.2d, v24.2d,#6 //vshlq_n_s64(temp, 6)
+ ld1 {v26.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v24.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ uxtl v26.8h, v26.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ shl v26.2d, v26.2d,#6 //vshlq_n_s64(temp, 6)
+ st1 {v26.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs x9,x9,#4 //ht - 4
+ sub x0,x5,x11
+ sub x1,x10,x11,lsl #1
+ bgt outer_loop_wd_4
+ cmp x8,#0
+ bgt outer_loop_wd_4_ht_2
+
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+outer_loop_wd_4_ht_2:
+ subs x4,x12,#0 //wd conditional subtract
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4_ht_2:
+ ld1 {v0.8b},[x0] //vld1_u8(pu1_src_tmp)
+ add x5,x0,x2 //pu1_src +src_strd
+ uxtl v0.8h, v0.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ add x10,x1,x6
+ subs x4,x4,#4 //wd - 4
+ shl v0.2d, v0.2d,#6 //vshlq_n_s64(temp, 6)
+ ld1 {v22.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ add x0,x0,#4 //pu1_src += 4
+ st1 {v0.1d},[x1] //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add x1,x1,#8
+ uxtl v22.8h, v22.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ ld1 {v24.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ shl v22.2d, v22.2d,#6 //vshlq_n_s64(temp, 6)
+ uxtl v24.8h, v24.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ st1 {v22.1d},[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4_ht_2
+ b end_loops
+
+
+core_loop_wd_8:
+ //sub x11,x12,#8
+ lsl x5, x3,#1
+ adds x5, x5,#0
+ sub x20,x12,x3, lsl #2 // x11 = (dst_strd * 4) - width
+ neg x11, x20
+ sub x20,x12,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ lsr x4, x12, #3 // divide by 8
+ mov x7,x9
+ mul x7, x7, x4
+ sub x4,x12,#0 //wd conditional check
+ sub x7,x7,#4 //subtract one for epilog
+ cmp x9,#0
+ beq core_loop_wd_8_ht_2
+
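+// the wd >= 8 path below is software pipelined: the prolog primes four rows
+// of loads, the main loop widens/shifts/stores iteration n while loading
+// iteration n+1, and the epilog drains the last four rows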
+prolog:
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+ add x10,x1,x5
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ subs x4,x4,#8 //wd decrements by 8
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
+ add x20,x0,x8
+ csel x0, x20, x0,le
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+ add x20,x1,x11,lsl #1
+ csel x1, x20, x1,le
+ sub x20,x12,#0 //wd conditional check
+ csel x4, x20, x4,le
+
+ subs x7,x7,#4 //ht - 4
+
+ blt epilog_end //jumps to epilog_end
+ beq epilog //jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ subs x4,x4,#8 //wd decrements by 8
+ add x20,x0,x8
+ csel x0, x20, x0,le
+
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ add x10,x1,x5
+
+ shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+
+ add x20,x1,x11,lsl #1
+ csel x1, x20, x1,le
+ sub x20,x12,#0 //wd conditional check
+ csel x4, x20, x4,le
+
+ subs x7,x7,#4 //ht - 4
+ bgt outer_loop_wd_8
+
+epilog:
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ //add x6,x0,x2 //pu1_src_tmp += src_strd
+
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
+ add x10,x1,x5
+ shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ b end_loops
+
+core_loop_wd_8_ht_2:
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+ add x10,x1,x5
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ subs x12,x12,#8 //wd decrements by 8
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ bgt core_loop_wd_8_ht_2
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz.s b/common/arm64/ihevc_inter_pred_chroma_horz.s
new file mode 100644
index 0000000..cf4f0f9
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_horz.s
@@ -0,0 +1,771 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_horz.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//* yogeswaran rs / akshaya mukund
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* chroma interprediction filter for horizontal input
+//*
+//* //par description:
+//* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//* to the elements pointed by 'pu1_src' and writes to the location pointed
+//* by 'pu1_dst' the output is downshifted by 6 and clipped to 8 bits
+//* assumptions : the function is optimized considering the fact width is
+//* multiple of 2,4 or 8. if width is 2, then height should be multiple of 2.
+//* width 4,8 is optimized further
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 => src_strd
+//x3 => dst_strd
+
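+//a c sketch of the filtering step described above (illustrative; the 4-tap
+//filter steps by 2 samples because chroma is interleaved uv):
+//
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < 2 * wd; col++)
+//     {
+//         i2_tmp = 0;
+//         for(i = 0; i < 4; i++)
+//             i2_tmp += pi1_coeff[i] * pu1_src[col + 2 * (i - 1)];
+//         pu1_dst[col] = clip_u8((i2_tmp + 32) >> 6);
+//     }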
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_horz_av8
+
+.type ihevc_inter_pred_chroma_horz_av8, %function
+
+ihevc_inter_pred_chroma_horz_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+
+ mov x4,x15 //loads pi1_coeff
+ mov x7,x16 //loads ht
+ mov x10,x17 //loads wd
+
+ ld1 {v0.8b},[x4] //coeff = vld1_s8(pi1_coeff)
+ subs x14,x7,#0 //checks for ht == 0
+ abs v2.8b, v0.8b //vabs_s8(coeff)
+ mov x11,#2
+ ble end_loops
+
+ dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub x12,x0,#2 //pu1_src - 2
+ dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
+ dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+ tst x10,#3 //checks wd for multiples of 4
+ lsl x5, x10, #1
+
+ dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+
+ bne outer_loop_4
+ cmp x10,#12
+ beq skip_16
+
+ cmp x10,#8
+ bge outer_loop_16
+skip_16:
+ tst x7,#3
+
+ sub x9,x0,#2
+ beq outer_loop_ht_4 //jumps to the path where ht is a multiple of 4
+
+ b outer_loop_8
+
+
+outer_loop_16:
+ mov x10,x5 //2wd
+ mul x14, x14 , x10
+
+ sub x20,x3,#16
+ neg x6, x20
+
+ add x4,x12,x2
+ mov x9,#10
+ and x0, x12, #31
+ sub x20,x5,x3,lsl #1
+ neg x8, x20
+ add x20,x12, x2 , lsl #1
+ prfm PLDL1KEEP,[x20]
+
+
+
+ add x19,x12,#8
+ ld1 { v0.2s},[x12],x11 //vector load pu1_src
+ ld1 { v1.2s},[x19],x11 //vector load pu1_src
+ add x20,x4, x2 , lsl #1
+ prfm PLDL1KEEP,[x20]
+
+ ld1 { v2.2s},[x12],x11 //vector load pu1_src
+ ld1 { v3.2s},[x19],x11 //vector load pu1_src
+
+ ld1 { v4.2s},[x12],x11 //vector load pu1_src
+ ld1 { v5.2s},[x19],x11 //vector load pu1_src
+
+ ld1 { v6.2s},[x12],x9 //vector load pu1_src
+ ld1 { v7.2s},[x19],x9 //vector load pu1_src
+
+
+ add x19,x4,#8
+ umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v9.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v10.2s},[x4],x11 //vector load pu1_src
+ ld1 { v11.2s},[x19],x11 //vector load pu1_src
+
+ umlal v30.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 { v12.2s},[x4],x11 //vector load pu1_src
+ ld1 { v13.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v30.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 { v14.2s},[x4],x9 //vector load pu1_src
+ ld1 { v15.2s},[x19],x9 //vector load pu1_src
+
+ umull v28.8h, v3.8b, v25.8b
+
+ umlsl v28.8h, v1.8b, v24.8b
+
+
+ umlal v28.8h, v5.8b, v26.8b
+
+ umlsl v28.8h, v7.8b, v27.8b
+
+
+ cmp x14,#32
+ beq epilog_end
+ sub x14, x14,#64
+
+inner_loop_16:
+
+
+
+
+// bgt l_2
+
+// add x20,x12, x2 , lsl #1
+ prfm PLDL1KEEP,[x20]
+// add x20,x4, x2 , lsl #1
+ prfm PLDL1KEEP,[x20]
+
+
+
+ subs x10,x10,#16
+
+ umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+ add x20,x12,x8
+ csel x12, x20, x12,eq
+ add x20,x12,x2
+ csel x4, x20, x4,eq
+ umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+
+
+ add x20,x12, x2 , lsl #2
+ prfm PLDL1KEEP,[x20]
+ sqrshrun v30.8b, v30.8h,#6
+
+ add x19,x12,#8
+ ld1 { v0.2s},[x12],x11 //vector load pu1_src
+ ld1 { v1.2s},[x19],x11 //vector load pu1_src
+
+ sqrshrun v31.8b, v28.8h,#6
+
+
+
+ ld1 { v2.2s},[x12],x11 //vector load pu1_src
+ ld1 { v3.2s},[x19],x11 //vector load pu1_src
+ umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+
+
+
+ ld1 { v4.2s},[x12],x11 //vector load pu1_src
+ ld1 { v5.2s},[x19],x11 //vector load pu1_src
+ umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ ld1 { v6.2s},[x12],x9 //vector load pu1_src
+ ld1 { v7.2s},[x19],x9 //vector load pu1_src
+ umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ add x20,x4, x2 , lsl #2
+ prfm PLDL1KEEP,[x20]
+ umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ //mov v30.s[1],v31.s[0]
+ add x13,x1,#8
+ st1 { v30.4h}, [x1],x3
+ st1 { v31.4h}, [x13],x3
+ umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ add x19,x4,#8
+ ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ ld1 { v10.2s},[x4],x11 //vector load pu1_src
+ ld1 { v11.2s},[x19],x11 //vector load pu1_src
+ umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 { v12.2s},[x4],x11 //vector load pu1_src
+ ld1 { v13.2s},[x19],x11 //vector load pu1_src
+ umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v14.2s},[x4],x9 //vector load pu1_src
+ ld1 { v15.2s},[x19],x11 //vector load pu1_src
+ umlal v30.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ cmp x10,#0
+ sqrshrun v22.8b, v22.8h,#6
+ sqrshrun v23.8b, v20.8h,#6
+
+
+
+ umlsl v30.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ csel x10, x5, x10,eq //2wd
+ umull v28.8h, v3.8b, v25.8b
+
+
+ //add x13,x1,#8
+ //mov v22.s[1],v23.s[0]
+ st1 { v22.4h},[x1],x6 //store the result pu1_dst
+ st1 { v23.4h},[x13],x6 //store the result pu1_dst
+ umlsl v28.8h, v1.8b, v24.8b
+
+
+ add x20,x1,x8
+ csel x1, x20, x1,eq
+ umlal v28.8h, v5.8b, v26.8b
+
+ subs x14,x14,#32 //decrement the ht loop
+ umlsl v28.8h, v7.8b, v27.8b
+
+// mov x0, x7
+
+ bgt inner_loop_16
+
+
+
+ add x14,x14,#64
+ cmp x14,#32
+ beq epilog_end
+
+epilog:
+ sqrshrun v30.8b, v30.8h,#6
+ sqrshrun v31.8b, v28.8h,#6
+
+
+
+ add x13,x1,#8
+ //mov v30.s[1],v31.s[0]
+ st1 { v30.4h}, [x1],x3
+ st1 { v31.4h}, [x13],x3
+
+ umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+
+
+ umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ subs x10,x10,#16 //decrement the wd loop
+ umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ add x20,x12,x8
+ csel x12, x20, x12,eq
+ umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ csel x10, x5, x10,eq //2wd
+
+
+ add x20,x12,x2
+ csel x4, x20, x4,eq
+ umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ add x19,x12,#8
+ ld1 { v0.2s},[x12],x11 //vector load pu1_src
+ ld1 { v1.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ ld1 { v2.2s},[x12],x11 //vector load pu1_src
+ ld1 { v3.2s},[x19],x11 //vector load pu1_src
+ umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 { v4.2s},[x12],x11 //vector load pu1_src
+ ld1 { v5.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 { v6.2s},[x12],x9 //vector load pu1_src
+ ld1 { v7.2s},[x19],x9 //vector load pu1_src
+ umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+ add x19,x4,#8
+ ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ ld1 { v10.2s},[x4],x11 //vector load pu1_src
+ ld1 { v11.2s},[x19],x11 //vector load pu1_src
+ umlal v30.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ umlsl v30.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 { v12.2s},[x4],x11 //vector load pu1_src
+ ld1 { v13.2s},[x19],x11 //vector load pu1_src
+ umull v28.8h, v3.8b, v25.8b
+ ld1 { v14.2s},[x4],x9 //vector load pu1_src
+ ld1 { v15.2s},[x19],x9 //vector load pu1_src
+ umlsl v28.8h, v1.8b, v24.8b
+ sqrshrun v22.8b, v22.8h,#6
+ sqrshrun v23.8b, v20.8h,#6
+
+ //mov v22.s[1],v23.s[0]
+ st1 { v22.4h},[x1],x6 //store the result pu1_dst
+ st1 { v23.4h},[x13],x6 //store the result pu1_dst
+ umlal v28.8h, v5.8b, v26.8b
+
+ umlsl v28.8h, v7.8b, v27.8b
+ add x20,x1,x8
+ csel x1, x20, x1,eq
+
+
+
+epilog_end:
+ sqrshrun v30.8b, v30.8h,#6
+ sqrshrun v31.8b, v28.8h,#6
+
+
+ umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ sqrshrun v22.8b, v22.8h,#6
+ sqrshrun v23.8b, v20.8h,#6
+
+ add x13,x1,#8
+
+ //mov v30.s[1],v31.s[0]
+ st1 { v30.4h}, [x1],x3
+ st1 { v31.4h}, [x13],x3
+
+ //mov v22.s[1],v23.s[0]
+ st1 { v22.4h},[x1] //store the result pu1_dst
+ st1 { v23.4h},[x13] //store the result pu1_dst
+
+
+
+ b end_loops
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+outer_loop_8:
+
+
+ add x6,x1,x3 //pu1_dst + dst_strd
+ mov x7,x5
+ add x4,x12,x2 //pu1_src + src_strd
+
+
+inner_loop_8:
+ //ld1 {v0.2s, v1.2s},[x12],x11 //vector load pu1_src
+ ld1 {v0.2s},[x12],x11 //vector load pu1_src
+ ld1 {v1.2s},[x12],x11 //vector load pu1_src
+ ld1 {v2.2s},[x12],x11 //vector load pu1_src
+ ld1 {v3.2s},[x12],x11 //vector load pu1_src
+
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+ umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 {v4.2s},[x4],x11 //vector load pu1_src
+ ld1 {v5.2s},[x4],x11 //vector load pu1_src
+ ld1 {v6.2s},[x4],x11 //vector load pu1_src
+ ld1 {v7.2s},[x4],x11 //vector load pu1_src
+ //ld1 {v12.2s, v13.2s},[x4],x11 //vector load pu1_src + src_strd
+ //vext.u8 d14,d12,d13,#2 //vector extract of src[0_2]
+ umull v10.8h, v5.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v10.8h, v4.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ //vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+ sqrshrun v8.8b, v8.8h,#6 //right shift and saturating narrow result 1
+ umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v8.8b},[x1],#8 //store the result pu1_dst
+
+ sqrshrun v10.8b, v10.8h,#6 //right shift and saturating narrow result 2
+ subs x7,x7,#8 //decrement the wd loop
+ st1 {v10.8b},[x6],#8 //store the result pu1_dst
+ bgt inner_loop_8
+
+ sub x12,x12,x5
+ subs x14,x14,#2 //decrement the ht loop
+ sub x1,x1,x5
+ add x12,x12,x2,lsl #1
+ add x1,x1,x3,lsl #1
+ bgt outer_loop_8
+ b end_loops
+
+//handles the case where ht is a multiple of 4
+outer_loop_ht_4:
+
+ mov x7,x5
+
+prologue_ht_4:
+
+inner_loop_ht_4:
+
+ mov x12,x9
+ mov x4,x1
+
+ sub x8, x2, #6
+
+ ld1 {v0.2s},[x12],x11 //(1)vector load pu1_src
+ ld1 {v1.2s},[x12],x11 //(1)vector load pu1_src
+ ld1 {v2.2s},[x12],x11 //(1)vector load pu1_src
+ //ld1 {v3.2s},[x12],x2 //(1)vector load pu1_src
+ ld1 {v3.2s},[x12],x8 //(1)vector load pu1_src
+
+ //sub x12, x12, #6 //(1)
+
+ ld1 {v4.2s},[x12],x11 //(2)vector load pu1_src
+ ld1 {v5.2s},[x12],x11 //(2)vector load pu1_src
+ ld1 {v6.2s},[x12],x11 //(2)vector load pu1_src
+ //ld1 {v7.2s},[x12],x2 //(2)vector load pu1_src
+ ld1 {v7.2s},[x12],x8 //(2)vector load pu1_src
+
+ //sub x12, x12, #6 //(2)
+
+ ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src
+ umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src
+ umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src
+ umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ //ld1 {v17.2s},[x12],x2 //(3)vector load pu1_src
+ ld1 {v17.2s},[x12],x8 //(3)vector load pu1_src
+ umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ //sub x12, x12, #6 //(3)
+ umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v18.2s},[x12],x11 //(4)vector load pu1_src
+ umlsl v10.8h, v4.8b, v24.8b //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v19.2s},[x12],x11 //(4)vector load pu1_src
+ umlal v10.8h, v6.8b, v26.8b //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v20.2s},[x12],x11 //(4)vector load pu1_src
+ umlsl v10.8h, v7.8b, v27.8b //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 {v21.2s},[x12],x2 //(4)vector load pu1_src
+ sqrshrun v8.8b, v8.8h,#6 //(1)right shift and saturating narrow result 1
+
+ add x9,x9,#8 //(core loop)
+
+ subs x7,x7,#8 //(prologue)decrement the wd loop
+ beq epilogue
+
+core_loop:
+ mov x12,x9
+
+ ld1 {v0.2s},[x12],x11 //(1_1)vector load pu1_src
+ umull v12.8h, v15.8b, v25.8b //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v1.2s},[x12],x11 //(1_1)vector load pu1_src
+ umlsl v12.8h, v14.8b, v24.8b //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v2.2s},[x12],x11 //(1_1)vector load pu1_src
+ umlal v12.8h, v16.8b, v26.8b //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ //ld1 {v3.2s},[x12],x2 //(1_1)vector load pu1_src
+ ld1 {v3.2s},[x12],x8 //(1_1)vector load pu1_src
+ umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ //sub x12, x12, #6 //(1_1)
+
+ st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst
+ sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2
+
+ ld1 {v4.2s},[x12],x11 //(2_1)vector load pu1_src
+ umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v5.2s},[x12],x11 //(2_1)vector load pu1_src
+ umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v6.2s},[x12],x11 //(2_1)vector load pu1_src
+ umlal v22.8h, v20.8b, v26.8b //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ //ld1 {v7.2s},[x12],x2 //(2_1)vector load pu1_src
+ ld1 {v7.2s},[x12],x8 //(2_1)vector load pu1_src
+ umlsl v22.8h, v21.8b, v27.8b //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ //sub x12, x12, #6 //(2_1)
+
+ st1 {v10.8b},[x4],x3 //(2)store the result pu1_dst
+ sqrshrun v12.8b, v12.8h,#6 //(3)right shift and saturating narrow result 1
+
+ ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src
+ umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src
+ umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src
+ umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ //ld1 {v17.2s},[x12],x2 //(3_1)vector load pu1_src
+ ld1 {v17.2s},[x12],x8 //(3_1)vector load pu1_src
+ umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ //sub x12, x12, #6 //(3_1)
+
+ st1 {v12.8b},[x4],x3 //(3)store the result pu1_dst
+ sqrshrun v22.8b, v22.8h,#6 //(4)right shift and saturating narrow result 2
+
+ add x9,x9,#8 //(core loop)
+
+ umull v10.8h, v5.8b, v25.8b //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v18.2s},[x12],x11 //(4_1)vector load pu1_src
+
+ ld1 {v19.2s},[x12],x11 //(4_1)vector load pu1_src
+ umlsl v10.8h, v4.8b, v24.8b //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v20.2s},[x12],x11 //(4_1)vector load pu1_src
+ umlal v10.8h, v6.8b, v26.8b //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v21.2s},[x12],x2 //(4_1)vector load pu1_src
+ umlsl v10.8h, v7.8b, v27.8b //(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ add x1,x1,#8 //(core loop)
+
+ subs x7,x7,#8 //(core loop)
+
+ st1 {v22.8b},[x4], x3 //(4)store the result pu1_dst
+ sqrshrun v8.8b, v8.8h,#6 //(1_1)right shift and saturating narrow result 1
+
+ mov x4, x1 //(core loop)
+
+ bgt core_loop //loopback
+
+epilogue:
+ umull v12.8h, v15.8b, v25.8b //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ umlsl v12.8h, v14.8b, v24.8b //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ umlal v12.8h, v16.8b, v26.8b //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v8.8b},[x4],x3 //(1)store the result pu1_dst
+ sqrshrun v10.8b, v10.8h,#6 //(2)right shift and saturating narrow result 2
+
+ umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ umlal v22.8h, v20.8b, v26.8b //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ umlsl v22.8h, v21.8b, v27.8b //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v10.8b},[x4],x3 //(2)store the result pu1_dst
+ sqrshrun v12.8b, v12.8h,#6 //(3)right shift and saturating narrow result 1
+
+ st1 {v12.8b},[x4],x3 //(3)store the result pu1_dst
+
+ add x1,x1,#8 //(core loop)
+
+ sqrshrun v22.8b, v22.8h,#6 //(4)right shift and saturating narrow result 2
+
+
+ st1 {v22.8b},[x4], x3 //(4)store the result pu1_dst
+
+ sub x9,x9,x5
+ subs x14,x14,#4 //decrement the ht loop
+ sub x1,x1,x5
+ add x9,x9,x2,lsl #2
+ add x1,x1,x3,lsl #2
+ bgt outer_loop_ht_4
+ b end_loops
+
+outer_loop_4:
+ add x6,x1,x3 //pu1_dst + dst_strd
+ mov x7,x5
+ add x4,x12,x2 //pu1_src + src_strd
+
+inner_loop_4:
+ //ld1 {v0.2s, v1.2s},[x12] //vector load pu1_src
+
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11 //vector load pu1_src
+ ld1 {v22.2s},[x12],x11 //vector load pu1_src
+ ld1 {v23.2s},[x12] //vector load pu1_src
+
+ sub x12,x12,#2 //rewind the input pointer by 2
+ ld1 {v16.2s},[x4],x11 //vector load pu1_src
+ ld1 {v17.2s},[x4],x11 //vector load pu1_src
+ ld1 {v18.2s},[x4],x11 //vector load pu1_src
+ ld1 {v19.2s},[x4] //vector load pu1_src
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ //ld1 {v12.2s, v13.2s},[x4] //vector load pu1_src + src_strd
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+
+ sub x4,x4,#2 //rewind the input pointer by 2
+ //vext.u8 d14,d12,d13,#2 //vector extract of src[0_2]
+ //vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+
+ zip1 v0.2s, v20.2s, v16.2s
+ zip2 v4.2s, v20.2s, v16.2s //vector zip the i iteration and ii iteration in a single register
+ zip1 v1.2s, v21.2s, v17.2s
+ zip2 v5.2s, v21.2s, v17.2s
+ zip1 v2.2s, v22.2s, v18.2s
+ zip2 v6.2s, v22.2s, v18.2s
+ zip1 v3.2s, v23.2s, v19.2s
+ zip2 v7.2s, v23.2s, v19.2s
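+ //the zip1/zip2 pairs above interleave rows i and ii (32 bits at a time)
+ //so that a single 8-lane multiply-accumulate filters both rows at once.
+ //illustratively, in c (assumed byte layout):
+ //  for (int i = 0; i < 4; i++) {
+ //      lo[i] = row0[i];       lo[4 + i] = row1[i];       /* zip1 .2s */
+ //      hi[i] = row0[4 + i];   hi[4 + i] = row1[4 + i];   /* zip2 .2s */
+ //  }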
+
+ umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration at the same time
+ umlsl v8.8h, v0.8b, v24.8b
+ umlal v8.8h, v2.8b, v26.8b
+ umlsl v8.8h, v3.8b, v27.8b
+
+ sqrshrun v8.8b, v8.8h,#6 //narrow right shift and saturating the result
+ st1 {v8.s}[0],[x1],#4 //store the i iteration result which is in the lower half of the register
+ subs x7,x7,#4 //decrement the wd by 4
+
+ st1 {v8.s}[1],[x6],#4 //store the ii iteration result which is in the upper half of the register
+
+ bgt inner_loop_4
+
+ sub x12,x12,x5
+ subs x14,x14,#2 //decrement the ht by 2
+ sub x1,x1,x5
+ add x12,x12,x2,lsl #1
+ add x1,x1,x3,lsl #1
+ bgt outer_loop_4
+
+end_loops:
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
new file mode 100644
index 0000000..a35fdaa
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
@@ -0,0 +1,798 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_horz_w16out.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//* yogeswaran rs / akshaya mukund
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* chroma interprediction filter to store horizontal 16-bit output
+//*
+//* //par description:
+//* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//* to the elements pointed by 'pu1_src' and writes to the location pointed
+//* by 'pi2_dst'. no downshifting or clipping is done, and the output is
+//* used as an input for vertical filtering or weighted prediction
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pi2_dst
+//* word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 => src_strd
+//x3 => dst_strd
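+
+//a minimal scalar c sketch of the 16-bit variant (illustrative only;
+//same 4-tap filter as the 8-bit routine, but the raw sum is stored
+//without rounding or clipping for a later vertical pass):
+//
+// #include <stdint.h>
+// void chroma_horz_w16out_ref(const uint8_t *pu1_src, int16_t *pi2_dst,
+//                             int src_strd, int dst_strd,
+//                             const int8_t *pi1_coeff, int ht, int wd)
+// {
+//     for (int row = 0; row < ht; row++)
+//         for (int col = 0; col < 2 * wd; col++)
+//         {
+//             int sum = 0;
+//             for (int k = 0; k < 4; k++)
+//                 sum += pi1_coeff[k] *
+//                        pu1_src[row * src_strd + col + (k - 1) * 2];
+//             pi2_dst[row * dst_strd + col] = (int16_t)sum;  /* no >>6 */
+//         }
+// }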
+
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_horz_w16out_av8
+
+
+.type ihevc_inter_pred_chroma_horz_w16out_av8, %function
+
+ihevc_inter_pred_chroma_horz_w16out_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x4,x15 //loads pi1_coeff
+ mov x6,x16 //loads ht
+ mov x10,x17 //loads wd
+
+ ld1 {v0.8b},[x4] //coeff = vld1_s8(pi1_coeff)
+ subs x14,x6,#0 //checks for ht == 0
+ abs v2.8b, v0.8b //vabs_s8(coeff)
+
+//******* added
+ mov x11, #2
+//******* added ends
+
+ ble end_loops
+
+ dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub x12,x0,#2 //pu1_src - 2
+ dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
+ dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+
+ tst x10,#3 //checks wd for multiples of 4
+ lsl x5, x10, #1 //2wd
+
+ dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+
+ and x7,x14,#1 //added //calculating ht_residue ht_residue = (ht & 1)
+ sub x14,x14,x7 //added //decrement height by ht_residue(residue value is calculated outside)
+
+ bne outer_loop_4 // this branching happens when the width is 2 or 6
+
+ cmp x10,#12
+ beq skip_16
+
+ cmp x10,#8
+ bge outer_loop_16
+
+skip_16:
+ tst x6,#3
+
+//******* removal
+ //mov x11,#8
+//******* removal ends
+
+ sub x9,x0,#2
+ beq outer_loop_ht_4 //this branching happens when the height is a multiple of 4
+
+
+
+// cmp x10,#12
+// beq outer_loop_8
+// cmp x10,#16
+// bge outer_loop_16
+ b outer_loop_8
+
+
+
+outer_loop_16:
+ add x4,x12,x2
+
+
+ and x0, x12, #31
+ add x20,x12, x2 , lsl #1
+ prfm PLDL1KEEP,[x20]
+
+
+
+
+
+
+ add x19,x12,#8
+ ld1 { v0.2s},[x12],x11 //vector load pu1_src
+ ld1 { v1.2s},[x19],x11 //vector load pu1_src
+ mov x10,x5 //2wd
+ mul x14, x14 , x10
+ ld1 { v2.2s},[x12],x11 //vector load pu1_src
+ ld1 { v3.2s},[x19],x11 //vector load pu1_src
+ add x20,x4, x2 , lsl #1
+ prfm PLDL1KEEP,[x20]
+ mov x9,#10
+ ld1 { v4.2s},[x12],x11 //vector load pu1_src
+ ld1 { v5.2s},[x19],x11 //vector load pu1_src
+ sub x20,x3,#8
+ neg x6, x20
+ sub x8,x3,#8
+ ld1 { v6.2s},[x12],x9 //vector load pu1_src
+ ld1 { v7.2s},[x19],x9 //vector load pu1_src
+
+
+ add x19,x4,#8
+ umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v9.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v10.2s},[x4],x11 //vector load pu1_src
+ ld1 { v11.2s},[x19],x11 //vector load pu1_src
+
+ umlal v30.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 { v12.2s},[x4],x11 //vector load pu1_src
+ ld1 { v13.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v30.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 { v14.4s},[x4],x9 //vector load pu1_src
+ ld1 { v15.2s},[x19],x9 //vector load pu1_src
+
+ umull v28.8h, v3.8b, v25.8b
+ lsl x6,x6,#1
+ sub x20,x5,x3,lsl #1
+ neg x3, x20
+ umlsl v28.8h, v1.8b, v24.8b
+ lsl x8,x8,#1
+ sub x20,x5,x2,lsl #1
+ neg x7, x20
+ umlal v28.8h, v5.8b, v26.8b
+
+ umlsl v28.8h, v7.8b, v27.8b
+ cmp x14,#32
+ beq epilog_end
+ sub x14, x14,#64
+
+inner_loop_16:
+
+ // and x7, x12, #31 //decrement the wd loop
+ // cmp x7, x0
+ add x20,x12, x2 , lsl #2
+ prfm PLDL1KEEP,[x20]
+ add x20,x4, x2 , lsl #2
+ prfm PLDL1KEEP,[x20]
+
+
+ subs x10,x10,#16
+
+ umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+
+
+// add x20,x12,x2,lsl #1
+ //csel x12, x20, x12,eq
+// sub x20,x12,x5
+ //csel x12, x20, x12,eq
+ add x20,x12,x7
+ csel x12, x20, x12,eq
+ add x20,x12,x2
+ csel x4, x20, x4,eq
+
+
+ st1 { v30.8h}, [x1],#16
+ umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+
+
+
+ add x19,x12,#8
+ ld1 { v0.2s},[x12],x11 //vector load pu1_src
+ ld1 { v1.2s},[x19],x11 //vector load pu1_src
+ umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+
+
+
+ ld1 { v2.2s},[x12],x11 //vector load pu1_src
+ ld1 { v3.2s},[x19],x11 //vector load pu1_src
+ umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ ld1 { v4.2s},[x12],x11 //vector load pu1_src
+ ld1 { v5.2s},[x19],x11 //vector load pu1_src
+ umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ st1 { v28.8h}, [x1],x8
+ umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v6.2s},[x12],x9 //vector load pu1_src
+ ld1 { v7.2s},[x19],x9 //vector load pu1_src
+ umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ add x19,x4,#8
+ ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ ld1 { v10.2s},[x4],x11 //vector load pu1_src
+ ld1 { v11.2s},[x19],x11 //vector load pu1_src
+ umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 { v12.2s},[x4],x11 //vector load pu1_src
+ ld1 { v13.2s},[x19],x11 //vector load pu1_src
+ umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v14.2s},[x4],x9 //vector load pu1_src
+ ld1 { v15.2s},[x19],x9 //vector load pu1_src
+ umlal v30.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ st1 { v22.8h},[x1],#16 //store the result pu1_dst
+ umlsl v30.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ csel x10, x5, x10,eq //2wd
+ umull v28.8h, v3.8b, v25.8b
+
+
+
+ umlsl v28.8h, v1.8b, v24.8b
+ st1 { v20.8h},[x1],x6 //store the result pu1_dst
+
+
+ add x20,x1,x3,lsl #1
+ csel x1, x20, x1,eq
+ umlal v28.8h, v5.8b, v26.8b
+
+ subs x14,x14,#32 //decrement the ht loop
+ umlsl v28.8h, v7.8b, v27.8b
+
+
+
+// mov x0, x7
+ bgt inner_loop_16
+
+
+
+ add x14,x14,#64
+ cmp x14,#32
+ beq epilog_end
+
+epilog:
+
+ st1 { v30.8h}, [x1],#16
+ umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ st1 { v28.8h}, [x1],x8
+
+
+
+ umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ subs x10,x10,#16 //decrement the wd loop
+ umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+// add x20,x12,x2,lsl #1
+ //csel x12, x20, x12,eq
+ add x20,x12,x7
+ csel x12, x20, x12,eq
+ umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ // sub x20,x12,x5
+ //csel x12, x20, x12,eq
+ csel x10, x5, x10,eq //2wd
+ add x20,x12,x2
+ csel x4, x20, x4,eq
+ umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ add x19,x12,#8
+ ld1 { v0.2s},[x12],x11 //vector load pu1_src
+ ld1 { v1.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v2.2s},[x12],x11 //vector load pu1_src
+ ld1 { v3.2s},[x19],x11 //vector load pu1_src
+
+ umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 { v4.2s},[x12],x11 //vector load pu1_src
+ ld1 { v5.2s},[x19],x11 //vector load pu1_src
+
+ umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 { v6.2s},[x12],x9 //vector load pu1_src
+ ld1 { v7.2s},[x19],x9 //vector load pu1_src
+ umull v30.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ add x19,x4,#8
+ ld1 { v8.2s},[x4],x11 //vector load pu1_src
+ ld1 { v9.2s},[x19],x11 //vector load pu1_src
+ umlsl v30.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 { v10.2s},[x4],x11 //vector load pu1_src
+ ld1 { v11.2s},[x19],x11 //vector load pu1_src
+ umlal v30.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ umlsl v30.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 { v12.2s},[x4],x11 //vector load pu1_src
+ ld1 { v13.2s},[x19],x11 //vector load pu1_src
+ umull v28.8h, v3.8b, v25.8b
+
+ ld1 { v14.2s},[x4],x9 //vector load pu1_src
+ ld1 { v15.2s},[x19],x9 //vector load pu1_src
+
+ umlsl v28.8h, v1.8b, v24.8b
+ st1 { v22.8h},[x1],#16 //store the result pu1_dst
+ umlal v28.8h, v5.8b, v26.8b
+ st1 { v20.8h},[x1],x6 //store the result pu1_dst
+ umlsl v28.8h, v7.8b, v27.8b
+ add x20,x1,x3,lsl #1
+ csel x1, x20, x1,eq
+
+
+epilog_end:
+
+ umull v22.8h, v10.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v22.8h, v8.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v22.8h, v12.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v22.8h, v14.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ umull v20.8h, v11.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v20.8h, v9.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v20.8h, v13.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v20.8h, v15.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+ st1 { v30.8h}, [x1],#16
+ st1 { v28.8h}, [x1],x8
+ st1 { v22.8h},[x1],#16 //store the result pu1_dst
+ st1 { v20.8h},[x1],x6 //store the result pu1_dst
+
+
+ mov x6,x16 //loads ht
+
+ and x7,x6,#1
+
+ cmp x7,#0
+ mov x10,x5
+ add x20,x12,x2,lsl #1
+ csel x12, x20, x12,ne
+ sub x20,x12,x5
+ csel x12, x20, x12,ne
+ add x20,x1,x3,lsl #1
+ csel x1, x20, x1,ne
+
+
+ bgt loop_residue_4
+
+ b end_loops
+
+
+
+
+outer_loop_8:
+
+ add x6,x1,x3,lsl #1 //pu1_dst + dst_strd
+ mov x10,x5 //2wd
+ add x4,x12,x2 //pu1_src + src_strd
+
+inner_loop_8:
+ //ld1 {v0.2s, v1.2s},[x12],x11 //vector load pu1_src
+ ld1 {v0.2s},[x12],x11 //vector load pu1_src
+ ld1 {v1.2s},[x12],x11 //vector load pu1_src
+ ld1 {v2.2s},[x12],x11 //vector load pu1_src
+ ld1 {v3.2s},[x12],x11 //vector load pu1_src
+
+
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ umull v8.8h, v1.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+ umlal v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v8.8h, v3.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ //ld1 {v12.2s, v13.2s},[x4],x11 //vector load pu1_src + src_strd
+ ld1 {v4.2s},[x4],x11 //vector load pu1_src
+ ld1 {v5.2s},[x4],x11 //vector load pu1_src
+ ld1 {v6.2s},[x4],x11 //vector load pu1_src
+ ld1 {v7.2s},[x4],x11 //vector load pu1_src
+ //vext.u8 d14,d12,d13,#2 //vector extract of src[0_2]
+ umull v10.8h, v5.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v10.8h, v4.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ //vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+ umlal v10.8h, v6.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlsl v10.8h, v7.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v8.8h}, [x1],#16
+
+ subs x10,x10,#8 //decrement the wd loop
+ st1 {v10.8h},[x6],#16 //store the result pu1_dst
+ bgt inner_loop_8
+
+ sub x12,x12,x5
+ subs x14,x14,#2 //decrement the ht loop
+ sub x1,x1,x5,lsl #1
+ add x12,x12,x2,lsl #1
+ add x1,x1,x3,lsl #2
+ bgt outer_loop_8
+
+ cmp x7,#0
+ mov x10,x5
+ bgt loop_residue_4
+
+ b end_loops
+
+
+
+//handles the case where ht is a multiple of 4
+outer_loop_ht_4:
+
+ mov x10,x5
+
+prologue_ht_4:
+ lsl x8, x3, #1
+
+inner_loop_ht_4:
+
+ mov x12,x9
+ mov x4,x1
+
+ sub x0, x2, #6 // not sure if x0 needs to be preserved
+
+ ld1 {v0.2s},[x12],x11 //(1)vector load pu1_src
+ ld1 {v1.2s},[x12],x11 //(1)vector load pu1_src
+ ld1 {v2.2s},[x12],x11 //(1)vector load pu1_src
+ ld1 {v3.2s},[x12],x0 //(1)vector load pu1_src
+
+ ld1 {v4.2s},[x12],x11 //(2)vector load pu1_src
+ ld1 {v5.2s},[x12],x11 //(2)vector load pu1_src
+ ld1 {v6.2s},[x12],x11 //(2)vector load pu1_src
+ ld1 {v7.2s},[x12],x0 //(2)vector load pu1_src
+
+ ld1 {v14.2s},[x12],x11 //(3)vector load pu1_src
+ umull v8.8h, v1.8b, v25.8b //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v15.2s},[x12],x11 //(3)vector load pu1_src
+ umlsl v8.8h, v0.8b, v24.8b //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v16.2s},[x12],x11 //(3)vector load pu1_src
+ umlal v8.8h, v2.8b, v26.8b //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v17.2s},[x12],x0 //(3)vector load pu1_src
+ umlsl v8.8h, v3.8b, v27.8b //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 {v18.2s},[x12],x11 //(4)vector load pu1_src
+ umull v10.8h, v5.8b, v25.8b //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v19.2s},[x12],x11 //(4)vector load pu1_src
+ umlsl v10.8h, v4.8b, v24.8b //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v20.2s},[x12],x11 //(4)vector load pu1_src
+ umlal v10.8h, v6.8b, v26.8b //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v21.2s},[x12],x2 //(4)vector load pu1_src
+ umlsl v10.8h, v7.8b, v27.8b //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ add x9,x9,#8 //(core loop)
+
+ subs x10,x10,#8 //(prologue)decrement the wd loop
+ beq epilogue
+
+core_loop:
+ st1 {v8.8h},[x4],x8 //(1)store the result pu1_dst
+ mov x12,x9
+
+ ld1 {v0.2s},[x12],x11 //(1_1)vector load pu1_src
+ umull v12.8h, v15.8b, v25.8b //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v1.2s},[x12],x11 //(1_1)vector load pu1_src
+ umlsl v12.8h, v14.8b, v24.8b //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v2.2s},[x12],x11 //(1_1)vector load pu1_src
+ umlal v12.8h, v16.8b, v26.8b //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v3.2s},[x12],x0 //(1_1)vector load pu1_src
+ umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v10.8h},[x4],x8 //(2)store the result pu1_dst
+ add x9,x9,#8 //(core loop)
+
+ ld1 {v4.2s},[x12],x11 //(2_1)vector load pu1_src
+ umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v5.2s},[x12],x11 //(2_1)vector load pu1_src
+ umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v6.2s},[x12],x11 //(2_1)vector load pu1_src
+ umlal v22.8h, v20.8b, v26.8b //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v7.2s},[x12],x0 //(2_1)vector load pu1_src
+ umlsl v22.8h, v21.8b, v27.8b //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v12.8h},[x4],x8 //(3)store the result pu1_dst
+ add x1,x1,#16 //(core loop)
+
+ ld1 {v14.2s},[x12],x11 //(3_1)vector load pu1_src
+ umull v8.8h, v1.8b, v25.8b //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ ld1 {v15.2s},[x12],x11 //(3_1)vector load pu1_src
+ umlsl v8.8h, v0.8b, v24.8b //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v16.2s},[x12],x11 //(3_1)vector load pu1_src
+ umlal v8.8h, v2.8b, v26.8b //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ ld1 {v17.2s},[x12],x0 //(3_1)vector load pu1_src
+ umlsl v8.8h, v3.8b, v27.8b //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v22.8h}, [x4], x8 //(4)store the result pu1_dst
+ subs x10,x10,#8 //(core loop)
+
+ umull v10.8h, v5.8b, v25.8b //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v18.2s},[x12],x11 //(4_1)vector load pu1_src
+
+ ld1 {v19.2s},[x12],x11 //(4_1)vector load pu1_src
+ umlsl v10.8h, v4.8b, v24.8b //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ ld1 {v20.2s},[x12],x11 //(4_1)vector load pu1_src
+ umlal v10.8h, v6.8b, v26.8b //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ mov x4, x1 //(core loop)
+
+ ld1 {v21.2s},[x12],x0 //(4_1)vector load pu1_src
+ umlsl v10.8h, v7.8b, v27.8b //(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+
+ bgt core_loop //loopback
+
+epilogue:
+ umull v12.8h, v15.8b, v25.8b //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ umlsl v12.8h, v14.8b, v24.8b //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ umlal v12.8h, v16.8b, v26.8b //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ umlsl v12.8h, v17.8b, v27.8b //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v8.8h},[x4], x8 //(1)store the result pu1_dst
+
+ umull v22.8h, v19.8b, v25.8b //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ umlsl v22.8h, v18.8b, v24.8b //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ umlal v22.8h, v20.8b, v26.8b //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ umlsl v22.8h, v21.8b, v27.8b //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ st1 {v10.8h},[x4], x8 //(2)store the result pu1_dst
+
+ st1 {v12.8h},[x4], x8 //(3)store the result pu1_dst
+
+ add x1,x1,#16 //(core loop)
+
+ st1 {v22.8h},[x4], x8 //(4)store the result pu1_dst
+
+ sub x9,x9,x5
+ subs x14,x14,#4 //decrement the ht loop
+ sub x1,x1,x5,lsl #1
+ add x9,x9,x2,lsl #2
+ add x1,x1,x3,lsl #3
+ bgt outer_loop_ht_4
+
+ cmp x7,#0
+ mov x10,x5
+ csel x12, x9, x12,gt
+ csel x4, x1, x4,gt
+ bgt loop_residue_4
+
+ b end_loops
+
+outer_loop_4:
+ add x6,x1,x3,lsl #1 //pu1_dst + dst_strd
+ mov x10,x5
+ add x4,x12,x2 //pu1_src + src_strd
+
+inner_loop_4:
+ //ld1 {v0.2s, v1.2s},[x12] //vector load pu1_src
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11 //vector load pu1_src
+ ld1 {v22.2s},[x12],x11 //vector load pu1_src
+ ld1 {v23.2s},[x12] //vector load pu1_src
+
+//**** removal
+ //add x12,x12,#4 //increment the input pointer
+//**** removal ends
+//**** addn
+ sub x12,x12,#2 //rewind the input pointer by 2
+//**** addn ends
+ ld1 {v16.2s},[x4],x11 //vector load pu1_src
+ ld1 {v17.2s},[x4],x11 //vector load pu1_src
+ ld1 {v18.2s},[x4],x11 //vector load pu1_src
+ ld1 {v19.2s},[x4] //vector load pu1_src
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ //ld1 {v12.2s, v13.2s},[x4] //vector load pu1_src + src_strd
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+
+ //add x4,x4,#4 //increment the input pointer
+ sub x4,x4,#2 //rewind the input pointer by 2
+ //vext.u8 d14,d12,d13,#2 //vector extract of src[0_2]
+ //vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+
+//**** removal
+ //zip1 v0.2s, v0.2s, v12.2s
+ //zip2 v12.2s, v0.2s, v12.2s //vector zip the i iteration and ii interation in single register
+ //zip1 v2.2s, v2.2s, v14.2s
+ //zip2 v14.2s, v2.2s, v14.2s
+ //zip1 v4.2s, v4.2s, v16.2s
+ //zip2 v16.2s, v4.2s, v16.2s
+ //zip1 v6.2s, v6.2s, v18.2s
+ //zip2 v18.2s, v6.2s, v18.2s
+//**** removal ends
+//**** addn
+ zip1 v0.2s, v20.2s, v16.2s
+ zip2 v4.2s, v20.2s, v16.2s //vector zip the i iteration and ii iteration in a single register
+ zip1 v1.2s, v21.2s, v17.2s
+ zip2 v5.2s, v21.2s, v17.2s
+ zip1 v2.2s, v22.2s, v18.2s
+ zip2 v6.2s, v22.2s, v18.2s
+ zip1 v3.2s, v23.2s, v19.2s
+ zip2 v7.2s, v23.2s, v19.2s
+//**** addn ends
+
+ umull v8.8h, v1.8b, v25.8b //arithmetic operations for ii iteration at the same time
+ umlsl v8.8h, v0.8b, v24.8b
+ umlal v8.8h, v2.8b, v26.8b
+ umlsl v8.8h, v3.8b, v27.8b
+
+ st1 {v8.d}[0],[x1],#8 //store the i iteration result which is in the lower half of the register
+ subs x10,x10,#4 //decrement the wd by 4
+
+ st1 {v8.d}[1],[x6],#8 //store the ii iteration result which is in the upper half of the register
+
+ bgt inner_loop_4
+
+ sub x12,x12,x5
+ subs x14,x14,#2 //decrement the ht by 2
+ sub x1,x1,x5,lsl #1
+ add x12,x12,x2,lsl #1
+ add x1,x1,x3,lsl #2
+ bgt outer_loop_4
+
+ cmp x7,#0
+ mov x10,x5
+ beq end_loops
+
+loop_residue_4:
+
+ mov x10,x5 //2wd
+
+loop_residue:
+
+ //ld1 {v0.2s, v1.2s},[x12] //vector load pu1_src
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11 //vector load pu1_src
+ ld1 {v22.2s},[x12],x11 //vector load pu1_src
+ ld1 {v23.2s},[x12] //vector load pu1_src
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ //umull v8.8h, v2.8b, v25.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ //umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ //add x12,x12,#4 //pu1_src + 4
+ sub x12, x12, #2
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+ //umlal v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ //umlsl v8.8h, v6.8b, v27.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ umull v8.8h, v21.8b, v25.8b
+ umlsl v8.8h, v20.8b, v24.8b
+ umlal v8.8h, v22.8b, v26.8b
+ umlsl v8.8h, v23.8b, v27.8b
+
+ st1 {v8.1d},[x1] //store the result pu1_dst
+ subs x10,x10,#4 //decrement the wd loop
+ add x1,x1,#8 //pi2_dst + 8
+
+ bgt loop_residue //loop again
+
+ //inner loop ends
+ //add x8,x3,lsl #1 //2*dst_strd
+ //sub x8,x8,x5,lsl #1 //2*dst_strd - 2wd
+ //sub x9,x2,x5 //src_strd - 2wd
+ //subs x7,x7,#1 //decrement the ht loop
+ //add x12,x12,x9 //pu1_src + src_strd
+ //add x1,x1,x8 //pu1_dst + 2*dst_strd
+ //bgt outer_loop_residue_4 //loop again
+ //b end_loops //jumps to end
+
+end_loops:
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert.s b/common/arm64/ihevc_inter_pred_chroma_vert.s
new file mode 100644
index 0000000..2de789f
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert.s
@@ -0,0 +1,405 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_vert.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//* yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* chroma interprediction filter for vertical input
+//*
+//* //par description:
+//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//* the elements pointed by 'pu1_src' and writes to the location pointed by
+//* 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+//* assumptions : the function is optimized considering the fact that width
+//* is a multiple of 2, 4 or 8, and height is a multiple of 2.
+//* widths of 4 and 8 are optimized further
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pu1_dst
+//x2 => src_strd
+//x3 => dst_strd
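+
+//a minimal scalar c sketch of the vertical filter (illustrative only;
+//taps step by src_strd, starting one row above the current sample):
+//
+// #include <stdint.h>
+// void chroma_vert_ref(const uint8_t *pu1_src, uint8_t *pu1_dst,
+//                      int src_strd, int dst_strd,
+//                      const int8_t *pi1_coeff, int ht, int wd)
+// {
+//     for (int row = 0; row < ht; row++)
+//         for (int col = 0; col < 2 * wd; col++)
+//         {
+//             int sum = 0;
+//             for (int k = 0; k < 4; k++)
+//                 sum += pi1_coeff[k] *
+//                        pu1_src[(row + k - 1) * src_strd + col];
+//             int val = (sum + 32) >> 6;          /* round and downshift */
+//             pu1_dst[row * dst_strd + col] =
+//                 (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
+//         }
+// }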
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_av8
+
+.type ihevc_inter_pred_chroma_vert_av8, %function
+
+ihevc_inter_pred_chroma_vert_av8:
+
+ // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x4,x16 //loads ht
+ mov x12,x15 //loads pi1_coeff
+ cmp x4,#0 //checks ht == 0
+ mov x6,x17 //loads wd
+ sub x0,x0,x2 //pu1_src - src_strd
+ ld1 {v0.8b},[x12] //loads pi1_coeff
+
+ ble end_loops //jumps to end
+
+ tst x6,#3 //checks (wd & 3)
+ abs v3.8b, v0.8b //vabs_s8(coeff)
+ lsl x10,x6,#1 //2*wd
+ dup v0.8b, v3.8b[0] //coeffabs_0
+ dup v1.8b, v3.8b[1] //coeffabs_1
+ dup v2.8b, v3.8b[2] //coeffabs_2
+ dup v3.8b, v3.8b[3] //coeffabs_3
+
+ bgt outer_loop_wd_2 //jumps to loop handling wd ==2
+
+ tst x4,#7 //checks ht for mul of 8
+ beq core_loop_ht_8 //when height is multiple of 8
+
+ lsl x7,x3,#1 //2*dst_strd
+ sub x9,x7,x10 //2*dst_strd - 2wd
+ lsl x12,x2,#1 //2*src_strd
+ sub x8,x12,x10 //2*src_strd - 2wd
+ mov x5,x10 //2wd
+
+inner_loop_ht_2: //called when wd is a multiple of 4 and ht is 2 or 4
+
+ add x6,x0,x2 //pu1_src +src_strd
+ ld1 {v9.8b},[x6],x2 //loads pu1_src
+ subs x5,x5,#8 //2wd - 8
+ ld1 {v5.8b},[x0],#8 //loads src
+ umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ ld1 {v4.8b},[x6],x2 //loads incremented src
+ umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+ ld1 {v8.8b},[x6],x2 //loads incremented src
+ umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+ umull v4.8h, v4.8b, v1.8b
+ umlsl v6.8h, v8.8b, v3.8b
+ umlsl v4.8h, v9.8b, v0.8b
+ ld1 {v10.8b},[x6] //loads the incremented src
+ umlal v4.8h, v8.8b, v2.8b
+ sqrshrun v6.8b, v6.8h,#6 //shifts right
+ umlsl v4.8h, v10.8b, v3.8b
+ add x6,x1,x3 //pu1_dst + dst_strd
+ sqrshrun v4.8b, v4.8h,#6 //shifts right
+ st1 {v6.8b},[x1],#8 //stores the loaded value
+
+ st1 {v4.8b},[x6] //stores the loaded value
+
+ bgt inner_loop_ht_2 //inner loop again
+
+ subs x4,x4,#2 //ht - 2
+ add x1,x1,x9 //pu1_dst += (2*dst_strd - 2wd)
+ mov x5,x10 //2wd
+ add x0,x0,x8 //pu1_src += (2*src_strd - 2wd)
+
+ bgt inner_loop_ht_2 //loop again
+
+ b end_loops //jumps to end
+
+outer_loop_wd_2: //called when width is multiple of 2
+ lsl x5,x3,#1 //2*dst_strd
+ mov x12,x10 //2wd
+ sub x9,x5,x10 //2*dst_strd - 2wd
+ lsl x7,x2,#1 //2*src_strd
+ sub x8,x7,x10 //2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+ add x6,x0,x2 //pu1_src + src_strd
+ ld1 {v6.s}[0],[x0] //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0
+ subs x12,x12,#4 //2wd - 4
+ add x0,x0,#4 //pu1_src + 4
+ ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp
+ dup v7.2s, v6.2s[1]
+ ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp
+ umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ dup v7.2s, v7.2s[1]
+ ld1 {v7.s}[1],[x6],x2
+ umlsl v4.8h, v6.8b, v0.8b
+ umlal v4.8h, v7.8b, v2.8b
+ dup v7.2s, v7.2s[1]
+ ld1 {v7.s}[1],[x6]
+ add x6,x1,x3 //pu1_dst + dst_strd
+ umlsl v4.8h, v7.8b, v3.8b
+ sqrshrun v4.8b, v4.8h,#6 //vrshrq_n_s16(vreinterpretq_s16_u16(mul_res1),6)
+ st1 {v4.s}[0],[x1] //stores the loaded value
+ add x1,x1,#4 //pu1_dst += 4
+ st1 {v4.s}[1],[x6] //stores the loaded value
+
+ bgt inner_loop_wd_2 //inner loop again
+
+ //inner loop ends
+ subs x4,x4,#2 //ht - 2
+ add x1,x1,x9 //pu1_dst += 2*dst_strd - 2*wd
+ mov x12,x10 //2wd
+ add x0,x0,x8 //pu1_src += 2*src_strd - 2*wd
+
+ bgt inner_loop_wd_2 //loop again
+
+ b end_loops //jumps to end
+
+core_loop_ht_8: //when wd & ht is multiple of 8
+
+ lsl x12,x3,#2 //4*dst_strd
+ sub x8,x12,x10 //4*dst_strd - 2wd
+ lsl x12,x2,#2 //4*src_strd
+ sub x9,x12,x10 //4*src_strd - 2wd
+
+ bic x5,x10,#7 //x5 ->wd
+ lsr x14, x10, #3 //divide by 8
+ mul x12, x4 , x14 //multiply height by width
+ sub x12, x12,#4 //subtract one iteration (4) for the epilog
+
+prolog:
+ add x6,x0,x2 //pu1_src + src_strd
+ ld1 {v5.8b},[x6],x2 //loads pu1_src
+ subs x5,x5,#8 //2wd - 8
+ ld1 {v4.8b},[x0],#8 //loads the source
+ ld1 {v6.8b},[x6],x2 //load and increment
+ umull v30.8h, v5.8b, v1.8b //mul with coeff 1
+ ld1 {v7.8b},[x6],x2 //load and increment
+ umlsl v30.8h, v4.8b, v0.8b
+ add x7,x1,x3 //pu1_dst
+ umlal v30.8h, v6.8b, v2.8b
+ umlsl v30.8h, v7.8b, v3.8b
+ ld1 {v8.8b},[x6],x2 //load and increment
+
+ umull v28.8h, v6.8b, v1.8b //mul_res 2
+ add x20,x0,x9 //pu1_src += 4*src_strd - 2*wd
+ csel x0, x20, x0,le
+ umlsl v28.8h, v5.8b, v0.8b
+ bic x20,x10,#7 //x5 ->wd
+ csel x5, x20, x5,le
+ umlal v28.8h, v7.8b, v2.8b
+ ld1 {v9.8b},[x6],x2
+ umlsl v28.8h, v8.8b, v3.8b
+ sqrshrun v30.8b, v30.8h,#6
+
+ ld1 {v10.8b},[x6],x2
+ umull v26.8h, v7.8b, v1.8b
+ add x6,x0,x2 //pu1_src + src_strd
+ umlsl v26.8h, v6.8b, v0.8b
+ st1 {v30.8b},[x1],#8 //stores the loaded value
+ umlal v26.8h, v8.8b, v2.8b
+ ld1 {v4.8b},[x0],#8 //loads the source
+ umlsl v26.8h, v9.8b, v3.8b
+ sqrshrun v28.8b, v28.8h,#6
+
+ add x20,x1,x8 //pu1_dst += 4*dst_strd - 2*wd
+ csel x1, x20, x1,le
+ umull v24.8h, v8.8b, v1.8b
+ ld1 {v5.8b},[x6],x2 //loads pu1_src
+ umlsl v24.8h, v7.8b, v0.8b
+ subs x12,x12,#4
+ ld1 {v6.8b},[x6],x2 //load and increment
+ umlal v24.8h, v9.8b, v2.8b
+ ld1 {v7.8b},[x6],x2 //load and increment
+ umlsl v24.8h, v10.8b, v3.8b
+
+ lsl x11,x2,#2
+ st1 {v28.8b},[x7],x3 //stores the loaded value
+ sqrshrun v26.8b, v26.8h,#6
+ sub x20,x2,x2,lsl #3
+ neg x11, x20
+ add x14,x2,x2,lsl #1
+ add x14,x14,x11
+ ble epilog //jumps to epilog
+
+kernel_8:
+
+ umull v30.8h, v5.8b, v1.8b //mul with coeff 1
+ subs x5,x5,#8 //2wd - 8
+ umlsl v30.8h, v4.8b, v0.8b
+ add x20,x0,x9 //pu1_src += 4*src_strd - 2*wd
+ csel x0, x20, x0,le
+ umlal v30.8h, v6.8b, v2.8b
+ lsl x20,x2,#3
+ sub x20,x20,x2
+ csel x11,x20,x11,le
+ //rsble x11,x2,x2,lsl #3
+ umlsl v30.8h, v7.8b, v3.8b
+ st1 {v26.8b},[x7],x3 //stores the loaded value
+ sqrshrun v24.8b, v24.8h,#6
+
+ ld1 {v8.8b},[x6],x2 //load and increment
+
+ umull v28.8h, v6.8b, v1.8b //mul_res 2
+ bic x20,x10,#7 //x5 ->wd
+ csel x5, x20, x5,le
+ umlsl v28.8h, v5.8b, v0.8b
+ st1 {v24.8b},[x7],x3 //stores the loaded value
+
+ umlal v28.8h, v7.8b, v2.8b
+
+ ld1 {v9.8b},[x6],x2
+ sqrshrun v30.8b, v30.8h,#6
+
+ umlsl v28.8h, v8.8b, v3.8b
+ ld1 {v10.8b},[x6],x2
+ add x7,x1,x3 //pu1_dst
+ umull v26.8h, v7.8b, v1.8b
+ add x6,x0,x2 //pu1_src + src_strd
+
+ add x20,x0, x11
+ prfm PLDL1KEEP,[x20]
+
+
+ umlsl v26.8h, v6.8b, v0.8b
+ ld1 {v4.8b},[x0],#8 //loads the source
+
+ umlal v26.8h, v8.8b, v2.8b
+ st1 {v30.8b},[x1],#8 //stores the loaded value
+
+ umlsl v26.8h, v9.8b, v3.8b
+ ld1 {v5.8b},[x6],x2 //loads pu1_src
+
+ add x11,x11,x2
+ sqrshrun v28.8b, v28.8h,#6
+
+ umull v24.8h, v8.8b, v1.8b
+ ld1 {v6.8b},[x6],x2 //load and increment
+ add x20,x1,x8 //pu1_dst += 4*dst_strd - 2*wd
+ csel x1, x20, x1,le
+
+ cmp x11,x14
+ lsl x20,x2,#3
+ sub x20,x20,x2
+ csel x11,x20,x11,gt
+ //rsbgt x11,x2,x2,lsl #3
+
+ umlsl v24.8h, v7.8b, v0.8b
+ subs x12,x12,#4
+
+ umlal v24.8h, v9.8b, v2.8b
+ ld1 {v7.8b},[x6],x2 //load and increment
+
+ umlsl v24.8h, v10.8b, v3.8b
+ st1 {v28.8b},[x7],x3 //stores the loaded value
+ sqrshrun v26.8b, v26.8h,#6
+
+ bgt kernel_8 //jumps to kernel_8
+
+epilog:
+
+ umull v30.8h, v5.8b, v1.8b //mul with coeff 1
+ umlsl v30.8h, v4.8b, v0.8b
+ umlal v30.8h, v6.8b, v2.8b
+ umlsl v30.8h, v7.8b, v3.8b
+ st1 {v26.8b},[x7],x3 //stores the loaded value
+ sqrshrun v24.8b, v24.8h,#6
+
+ ld1 {v8.8b},[x6],x2 //load and increment
+ umull v28.8h, v6.8b, v1.8b //mul_res 2
+ umlsl v28.8h, v5.8b, v0.8b
+ umlal v28.8h, v7.8b, v2.8b
+ umlsl v28.8h, v8.8b, v3.8b
+ st1 {v24.8b},[x7],x3 //stores the loaded value
+ sqrshrun v30.8b, v30.8h,#6
+
+ ld1 {v9.8b},[x6],x2
+ umull v26.8h, v7.8b, v1.8b
+ add x7,x1,x3 //pu1_dst
+ umlsl v26.8h, v6.8b, v0.8b
+ st1 {v30.8b},[x1],#8 //stores the loaded value
+
+ sqrshrun v28.8b, v28.8h,#6
+ umlal v26.8h, v8.8b, v2.8b
+ ld1 {v10.8b},[x6],x2
+ umlsl v26.8h, v9.8b, v3.8b
+
+ umull v24.8h, v8.8b, v1.8b
+ sqrshrun v26.8b, v26.8h,#6
+ st1 {v28.8b},[x7],x3 //stores the loaded value
+ umlsl v24.8h, v7.8b, v0.8b
+ umlal v24.8h, v9.8b, v2.8b
+ st1 {v26.8b},[x7],x3 //stores the loaded value
+ umlsl v24.8h, v10.8b, v3.8b
+
+ sqrshrun v24.8b, v24.8h,#6
+ st1 {v24.8b},[x7],x3 //stores the loaded value
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
new file mode 100644
index 0000000..55e7f54
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
@@ -0,0 +1,356 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_vert_w16inp.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* //author
+//* yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* chroma interprediction filter for 16bit vertical input.
+//*
+//* //par description:
+//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//* the elements pointed by 'pi2_src' and writes to the location pointed by
+//* 'pu1_dst'. the input is 16 bits; the filter output is downshifted by 12
+//* and clipped to lie between 0 and 255.
+//* assumptions : the function is optimized considering the fact that width
+//* and height are multiples of 2.
+//*
+//* //param[in] pi2_src
+//* word16 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pu1_dst
+//x2 => src_strd
+//x3 => dst_strd
+
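+//**************reference sketch**********************************************
+//a minimal c sketch of what the code below computes, based on the header
+//above; illustrative only - the loop structure and the CLIP_U8 helper are
+//assumptions, not part of this file. each output is a 4-tap vertical sum of
+//16-bit inputs (the entry code rewinds the source by one stride, hence
+//k - 1), downshifted by 12 in two steps and clipped to [0, 255]:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)    //cb/cr interleaved
+//        {
+//            word32 sum = 0;
+//            for(k = 0; k < 4; k++)
+//                sum += pi2_src[(row + k - 1) * src_strd + col] * pi1_coeff[k];
+//            sum >>= 6;                       //sqshrn   #6
+//            sum = (sum + 32) >> 6;           //sqrshrun #6 (rounding)
+//            pu1_dst[row * dst_strd + col] = CLIP_U8(sum);
+//        }
+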
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_av8
+
+.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x4, x15 //loads pi1_coeff
+ mov x6, x17 //wd
+ lsl x2,x2,#1 //src_strd = 2* src_strd
+ mov x5,x16 //loads ht
+ ld1 {v0.8b},[x4] //loads pi1_coeff
+ sub x4,x0,x2 //pu1_src - src_strd
+    sxtl v0.8h, v0.8b                    //sign extend the coefficients to 16 bits
+
+    tst x6,#3                            //checks if wd is 2 (not a multiple of 4)
+ dup v12.4h, v0.4h[0] //coeff_0
+ dup v13.4h, v0.4h[1] //coeff_1
+ dup v14.4h, v0.4h[2] //coeff_2
+ dup v15.4h, v0.4h[3] //coeff_3
+
+    bgt core_loop_ht_2                    //jumps to the loop that handles wd == 2
+
+    tst x5,#3                            //checks if ht is a multiple of 4
+    beq core_loop_ht_4                    //jumps to the loop handling ht as a multiple of 4
+
+core_loop_ht_2:
+ lsl x7,x2,#1 //2*src_strd
+ lsl x12,x3,#1 //2*dst_strd
+ lsl x9,x6,#2 //4*wd
+ sub x6,x12,x6,lsl #1 //2*dst_strd - 2*wd
+ sub x8,x7,x9 //2*src_strd - 4*wd
+ mov x12,x9 //4wd
+
+inner_loop_ht_2:
+ add x0,x4,x2 //increments pi2_src
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+    subs x12,x12,#8                        //4wd - 8
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v3.4h},[x0],x2 //loads pi2_src
+ smlal v0.4s, v2.4h, v13.4h
+ ld1 {v6.4h},[x0],x2
+ smlal v8.4s, v3.4h, v13.4h
+ ld1 {v2.4h},[x0]
+ add x7,x1,x3 //pu1_dst + dst_strd
+ smlal v0.4s, v3.4h, v14.4h
+ smlal v8.4s, v6.4h, v14.4h
+ smlal v0.4s, v6.4h, v15.4h
+ smlal v8.4s, v2.4h, v15.4h
+ sqshrn v0.4h, v0.4s,#6 //right shift
+ sqshrn v30.4h, v8.4s,#6 //right shift
+ sqrshrun v0.8b, v0.8h,#6 //rounding shift
+ sqrshrun v30.8b, v30.8h,#6 //rounding shift
+ st1 {v0.s}[0],[x1],#4 //stores the loaded value
+ st1 {v30.s}[0],[x7] //stores the loaded value
+ bgt inner_loop_ht_2 //inner loop -again
+
+ //inner loop ends
+    subs x5,x5,#2                        //decrements ht by 2
+ add x1,x1,x6 //pu1_dst += 2*dst_strd - 2*wd
+ mov x12,x9 //4wd
+ add x4,x4,x8 //pi1_src_tmp1 += 2*src_strd - 4*wd
+ bgt inner_loop_ht_2 //loop again
+
+ b end_loops //jumps to end
+
+core_loop_ht_4:
+    lsl x7,x2,#2                        //4*src_strd
+    lsl x12,x3,#2                        //4*dst_strd
+    lsr x11, x6, #1                        //divide by 2
+    sub x14,x12,x6,lsl #1                //4*dst_strd - 2*wd
+    sub x8,x7,x6,lsl #2                    //4*src_strd - 4*wd
+
+ mul x12, x5 , x11 //multiply height by width
+    sub x12, x12,#4                        //subtract 4 (one iteration) for the epilog
+ lsl x11, x6, #1 //2*wd
+
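+//the ht-multiple-of-4 path below is software pipelined across four output
+//rows (accumulators v30/v28/v26/v24): 'prolog' primes the pipeline,
+//'kernel_4' is the steady state that stores finished rows while starting new
+//ones, and 'epilog' drains the last four rows (hence the 'sub x12, x12, #4'
+//above).
+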
+prolog:
+ add x0,x4,x2 //increments pi2_src
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ ld1 {v1.4h},[x0],x2 //loads pi2_src
+ subs x11,x11,#4
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ ld1 {v3.4h},[x0],x2
+ smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v2.4h, v14.4h
+ add x9,x1,x3 //pu1_dst + dst_strd
+ smlal v30.4s, v3.4h, v15.4h
+
+ ld1 {v4.4h},[x0],x2
+ smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ add x20,x4,x8
+ csel x4, x20, x4,le
+ smlal v28.4s, v2.4h, v13.4h
+ ld1 {v5.4h},[x0],x2
+ smlal v28.4s, v3.4h, v14.4h
+ ld1 {v6.4h},[x0],x2
+ smlal v28.4s, v4.4h, v15.4h
+ lsl x20,x6,#1
+ csel x11, x20, x11,le
+
+ sqshrn v30.4h, v30.4s,#6 //right shift
+
+ smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ add x0,x4,x2
+ smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v4.4h, v14.4h
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ smlal v26.4s, v5.4h, v15.4h
+
+ sqrshrun v30.8b, v30.8h,#6 //rounding shift
+ sqshrn v28.4h, v28.4s,#6 //right shift
+
+ ld1 {v1.4h},[x0],x2 //loads pi2_src
+ smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ st1 {v30.s}[0],[x1],#4 //stores the loaded value
+ smlal v24.4s, v4.4h, v13.4h
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smlal v24.4s, v5.4h, v14.4h
+ ld1 {v3.4h},[x0],x2
+ smlal v24.4s, v6.4h, v15.4h
+ add x20,x1,x14
+ csel x1, x20, x1,le
+
+ sqshrn v26.4h, v26.4s,#6 //right shift
+ subs x12,x12,#4
+ sqrshrun v28.8b, v28.8h,#6 //rounding shift
+
+ beq epilog //jumps to epilog
+
+kernel_4:
+ smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ subs x11,x11,#4
+ smlal v30.4s, v1.4h, v13.4h
+ st1 {v28.s}[0],[x9],x3 //stores the loaded value
+ smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v3.4h, v15.4h
+
+ sqshrn v24.4h, v24.4s,#6 //right shift
+ sqrshrun v26.8b, v26.8h,#6 //rounding shift
+
+ ld1 {v4.4h},[x0],x2
+ smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v4.4h, v15.4h
+ st1 {v26.s}[0],[x9],x3 //stores the loaded value
+ add x20,x4,x8
+ csel x4, x20, x4,le
+ lsl x20,x6,#1
+ csel x11, x20, x11,le
+
+ sqshrn v30.4h, v30.4s,#6 //right shift
+ sqrshrun v24.8b, v24.8h,#6 //rounding shift
+
+ ld1 {v5.4h},[x0],x2
+ smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v6.4h},[x0],x2
+ smlal v26.4s, v3.4h, v13.4h
+ st1 {v24.s}[0],[x9] //stores the loaded value
+ add x0,x4,x2
+ smlal v26.4s, v4.4h, v14.4h
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ smlal v26.4s, v5.4h, v15.4h
+
+ sqshrn v28.4h, v28.4s,#6 //right shift
+ sqrshrun v30.8b, v30.8h,#6 //rounding shift
+
+ ld1 {v1.4h},[x0],x2 //loads pi2_src
+ smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ add x9,x1,x3 //pu1_dst + dst_strd
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smlal v24.4s, v4.4h, v13.4h
+ ld1 {v3.4h},[x0],x2
+ smlal v24.4s, v5.4h, v14.4h
+
+ st1 {v30.s}[0],[x1],#4 //stores the loaded value
+ smlal v24.4s, v6.4h, v15.4h
+
+ sqshrn v26.4h, v26.4s,#6 //right shift
+ sqrshrun v28.8b, v28.8h,#6 //rounding shift
+ add x20,x1,x14
+ csel x1, x20, x1,le
+
+ subs x12,x12,#4
+
+ bgt kernel_4 //jumps to kernel_4
+
+epilog:
+ smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ st1 {v28.s}[0],[x9],x3 //stores the loaded value
+ smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v3.4h, v15.4h
+
+ sqshrn v24.4h, v24.4s,#6 //right shift
+ sqrshrun v26.8b, v26.8h,#6 //rounding shift
+
+ smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v4.4h},[x0],x2
+ smlal v28.4s, v2.4h, v13.4h
+ st1 {v26.s}[0],[x9],x3 //stores the loaded value
+ smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v4.4h, v15.4h
+
+ sqshrn v30.4h, v30.4s,#6 //right shift
+ sqrshrun v24.8b, v24.8h,#6 //rounding shift
+
+ smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v5.4h},[x0],x2
+ smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v4.4h, v14.4h
+ smlal v26.4s, v5.4h, v15.4h
+
+ sqshrn v28.4h, v28.4s,#6 //right shift
+ sqrshrun v30.8b, v30.8h,#6 //rounding shift
+
+ st1 {v24.s}[0],[x9] //stores the loaded value
+ smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v24.4s, v4.4h, v13.4h
+ add x9,x1,x3 //pu1_dst + dst_strd
+ ld1 {v6.4h},[x0],x2
+ smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v6.4h, v15.4h
+ st1 {v30.s}[0],[x1],#4 //stores the loaded value
+
+ sqrshrun v28.8b, v28.8h,#6 //rounding shift
+ sqshrn v26.4h, v26.4s,#6 //right shift
+
+ st1 {v28.s}[0],[x9],x3 //stores the loaded value
+ sqrshrun v26.8b, v26.8h,#6 //rounding shift
+
+ sqshrn v24.4h, v24.4s,#6 //right shift
+ st1 {v26.s}[0],[x9],x3 //stores the loaded value
+ sqrshrun v24.8b, v24.8h,#6 //rounding shift
+
+ st1 {v24.s}[0],[x9] //stores the loaded value
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..b6d0eb2
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@@ -0,0 +1,343 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//* yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* chroma interprediction filter for 16bit vertical input and output.
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the elements pointed to by 'pi2_src' and writes to the location pointed
+//*    to by 'pi2_dst'. the input is 16 bits; the filter output is downshifted
+//*    by 6 and 8192 is subtracted to store it as a 16 bit number. the output
+//*    is used as an input to weighted prediction. assumptions: the function is
+//*    optimized assuming width and height are multiples of 2.
+//*
+//* //param[in] pi2_src
+//* word16 pointer to the source
+//*
+//* //param[out] pi2_dst
+//* word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pi2_src
+//x1 => *pi2_dst
+//x2 => src_strd
+//x3 => dst_strd
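+
+//a minimal c sketch of what the code below computes, based on the header
+//above; illustrative only - the loop structure is an assumption, not part of
+//this file. each output is a 4-tap vertical sum of 16-bit inputs, narrowed
+//with a saturating shift by 6 and stored as a 16-bit value for the
+//weighted-prediction stage:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)    //cb/cr interleaved
+//        {
+//            word32 sum = 0;
+//            for(k = 0; k < 4; k++)
+//                sum += pi2_src[(row + k - 1) * src_strd + col] * pi1_coeff[k];
+//            pi2_dst[row * dst_strd + col] = (word16)(sum >> 6);    //sqshrn #6
+//        }
+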
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_av8
+
+.type ihevc_inter_pred_chroma_vert_w16inp_w16out_av8, %function
+
+ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x4, x15 //loads pi1_coeff
+ mov x6, x17 //wd
+ lsl x2,x2,#1 //src_strd = 2* src_strd
+ mov x5,x16 //loads ht
+ ld1 {v0.8b},[x4] //loads pi1_coeff
+ sub x4,x0,x2 //pu1_src - src_strd
+    sxtl v0.8h, v0.8b                    //sign extend the coefficients to 16 bits
+
+    tst x6,#3                            //checks if wd is 2 (not a multiple of 4)
+ dup v12.4h, v0.4h[0] //coeff_0
+ dup v13.4h, v0.4h[1] //coeff_1
+ dup v14.4h, v0.4h[2] //coeff_2
+ dup v15.4h, v0.4h[3] //coeff_3
+
+    bgt core_loop_ht_2                    //jumps to the loop that handles wd == 2
+
+    tst x5,#3                            //checks if ht is a multiple of 4
+    beq core_loop_ht_4                    //jumps to the loop handling ht as a multiple of 4
+
+core_loop_ht_2:
+ lsl x7,x2,#1 //2*src_strd
+ lsl x3,x3,#1 //2*dst_strd
+ lsl x9,x6,#2 //4*wd
+ sub x6,x3,x6,lsl #1 //2*dst_strd - 2*wd
+ sub x8,x7,x9 //2*src_strd - 4*wd
+ mov x12,x9 //4wd
+
+inner_loop_ht_2:
+ add x0,x4,x2 //increments pi2_src
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ smull v0.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+    subs x12,x12,#8                        //4wd - 8
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smull v8.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v3.4h},[x0],x2 //loads pi2_src
+ smlal v0.4s, v2.4h, v13.4h
+ ld1 {v6.4h},[x0],x2
+ smlal v8.4s, v3.4h, v13.4h
+ ld1 {v2.4h},[x0]
+ add x7,x1,x3 //pu1_dst + dst_strd
+ smlal v0.4s, v3.4h, v14.4h
+ smlal v8.4s, v6.4h, v14.4h
+ smlal v0.4s, v6.4h, v15.4h
+ smlal v8.4s, v2.4h, v15.4h
+ sqshrn v0.4h, v0.4s,#6 //right shift
+ sqshrn v30.4h, v8.4s,#6 //right shift
+ st1 {v0.2s},[x1],#8 //stores the loaded value
+ st1 {v30.2s},[x7] //stores the loaded value
+ bgt inner_loop_ht_2 //inner loop -again
+
+ //inner loop ends
+    subs x5,x5,#2                        //decrements ht by 2
+ add x1,x1,x6,lsl #1 //pu1_dst += 2*dst_strd - 2*wd
+ mov x12,x9 //4wd
+ add x4,x4,x8 //pi1_src_tmp1 += 2*src_strd - 4*wd
+ bgt inner_loop_ht_2 //loop again
+
+ b end_loops //jumps to end
+
+core_loop_ht_4:
+    lsl x7,x2,#2                        //4*src_strd
+    lsl x10,x3,#2                        //4*dst_strd
+    lsr x11, x6, #1                        //divide by 2
+    sub x14,x10,x6,lsl #1                //4*dst_strd - 2*wd
+    sub x8,x7,x6,lsl #2                    //4*src_strd - 4*wd
+
+ mul x12, x5 , x11 //multiply height by width
+    sub x12, x12,#4                        //subtract 4 (one iteration) for the epilog
+ lsl x11, x6, #1 //2*wd
+ lsl x3,x3,#1 //2*dst_strd
+
+prolog:
+ add x0,x4,x2 //increments pi2_src
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ ld1 {v1.4h},[x0],x2 //loads pi2_src
+ subs x11,x11,#4
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ ld1 {v3.4h},[x0],x2
+ smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v2.4h, v14.4h
+ add x9,x1,x3 //pu1_dst + dst_strd
+ smlal v30.4s, v3.4h, v15.4h
+
+ ld1 {v4.4h},[x0],x2
+ smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ add x20,x4,x8
+ csel x4, x20, x4,le
+ lsl x20,x6,#1
+ csel x11, x20, x11,le
+ smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v3.4h, v14.4h
+ ld1 {v5.4h},[x0],x2
+ smlal v28.4s, v4.4h, v15.4h
+
+ sqshrn v30.4h, v30.4s,#6 //right shift
+
+ ld1 {v6.4h},[x0],x2
+ smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v4.4h, v14.4h
+ add x0,x4,x2
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ smlal v26.4s, v5.4h, v15.4h
+
+ sqshrn v28.4h, v28.4s,#6 //right shift
+
+ ld1 {v1.4h},[x0],x2 //loads pi2_src
+ smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ st1 {v30.2s},[x1],#8 //stores the loaded value
+ smlal v24.4s, v4.4h, v13.4h
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smlal v24.4s, v5.4h, v14.4h
+ ld1 {v3.4h},[x0],x2
+ smlal v24.4s, v6.4h, v15.4h
+ add x20,x1,x14,lsl #1
+ csel x1, x20, x1,le
+
+ sqshrn v26.4h, v26.4s,#6 //right shift
+ subs x12,x12,#4
+
+ beq epilog //jumps to epilog
+
+kernel_4:
+ smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ subs x11,x11,#4
+ smlal v30.4s, v1.4h, v13.4h
+ st1 {v28.2s},[x9],x3 //stores the loaded value
+ smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v3.4h, v15.4h
+
+ sqshrn v24.4h, v24.4s,#6 //right shift
+
+ ld1 {v4.4h},[x0],x2
+ smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v28.4s, v2.4h, v13.4h
+ smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v4.4h, v15.4h
+ st1 {v26.2s},[x9],x3 //stores the loaded value
+ add x20,x4,x8
+ csel x4, x20, x4,le
+ lsl x20,x6,#1
+ csel x11, x20, x11,le
+
+ sqshrn v30.4h, v30.4s,#6 //right shift
+
+ ld1 {v5.4h},[x0],x2
+ smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v6.4h},[x0],x2
+ smlal v26.4s, v3.4h, v13.4h
+ st1 {v24.2s},[x9] //stores the loaded value
+ add x0,x4,x2
+ smlal v26.4s, v4.4h, v14.4h
+ ld1 {v0.4h},[x4],#8 //loads pu1_src
+ smlal v26.4s, v5.4h, v15.4h
+
+ sqshrn v28.4h, v28.4s,#6 //right shift
+
+ ld1 {v1.4h},[x0],x2 //loads pi2_src
+ smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v2.4h},[x0],x2 //loads pi2_src
+ smlal v24.4s, v4.4h, v13.4h
+ add x9,x1,x3 //pu1_dst + dst_strd
+ ld1 {v3.4h},[x0],x2
+ smlal v24.4s, v5.4h, v14.4h
+
+ st1 {v30.2s},[x1],#8 //stores the loaded value
+ smlal v24.4s, v6.4h, v15.4h
+
+ sqshrn v26.4h, v26.4s,#6 //right shift
+ add x20,x1,x14,lsl #1
+ csel x1, x20, x1,le
+
+ subs x12,x12,#4
+
+ bgt kernel_4 //jumps to kernel_4
+
+epilog:
+ smull v30.4s, v0.4h, v12.4h //vmull_s16(src_tmp1, coeff_0)
+ st1 {v28.2s},[x9],x3 //stores the loaded value
+ smlal v30.4s, v1.4h, v13.4h
+ smlal v30.4s, v2.4h, v14.4h
+ smlal v30.4s, v3.4h, v15.4h
+
+ sqshrn v24.4h, v24.4s,#6 //right shift
+
+ smull v28.4s, v1.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v4.4h},[x0],x2
+ smlal v28.4s, v2.4h, v13.4h
+ st1 {v26.2s},[x9],x3 //stores the loaded value
+ smlal v28.4s, v3.4h, v14.4h
+ smlal v28.4s, v4.4h, v15.4h
+
+ sqshrn v30.4h, v30.4s,#6 //right shift
+
+ smull v26.4s, v2.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ ld1 {v5.4h},[x0],x2
+ smlal v26.4s, v3.4h, v13.4h
+ smlal v26.4s, v4.4h, v14.4h
+ smlal v26.4s, v5.4h, v15.4h
+
+ sqshrn v28.4h, v28.4s,#6 //right shift
+
+ st1 {v24.2s},[x9] //stores the loaded value
+ smull v24.4s, v3.4h, v12.4h //vmull_s16(src_tmp2, coeff_0)
+ smlal v24.4s, v4.4h, v13.4h
+ add x9,x1,x3 //pu1_dst + dst_strd
+ ld1 {v6.4h},[x0],x2
+ smlal v24.4s, v5.4h, v14.4h
+ smlal v24.4s, v6.4h, v15.4h
+ st1 {v30.2s},[x1],#8 //stores the loaded value
+
+ sqshrn v26.4h, v26.4s,#6 //right shift
+
+ st1 {v28.2s},[x9],x3 //stores the loaded value
+
+ sqshrn v24.4h, v24.4s,#6 //right shift
+ st1 {v26.2s},[x9],x3 //stores the loaded value
+
+ st1 {v24.2s},[x9] //stores the loaded value
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
new file mode 100644
index 0000000..9f5687f
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
@@ -0,0 +1,392 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* //file
+//* ihevc_inter_pred_chroma_vert_w16out.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//* yogeswaran rs / parthiban
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* interprediction chroma filter to store vertical 16 bit output
+//*
+//* //par description:
+//*   applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*   the elements pointed to by 'pu1_src' and writes to the location pointed
+//*   to by 'pi2_dst'. no downshifting or clipping is done, and the output is
+//*   used as an input for weighted prediction. assumptions: the function is
+//*   optimized assuming width is a multiple of 2, 4 or 8, and height is a
+//*   multiple of 2. widths of 4 and 8 are optimized further.
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pi2_dst
+//* word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*****************************************************************************
+//*/
+//void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd)
+//**************variables vs registers*****************************************
+//x0 => *pu1_src
+//x1 => *pi2_dst
+//x2 => src_strd
+//x3 => dst_strd
+
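+//a minimal c sketch of what the code below computes, based on the header
+//above; illustrative only - the loop structure is an assumption, not part of
+//this file. each output is a 4-tap vertical sum of 8-bit inputs kept as a raw
+//16-bit value (no downshift or clipping) for the weighted-prediction stage:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < 2 * wd; col++)    //cb/cr interleaved
+//        {
+//            word16 sum = 0;
+//            for(k = 0; k < 4; k++)
+//                sum += pu1_src[(row + k - 1) * src_strd + col] * pi1_coeff[k];
+//            pi2_dst[row * dst_strd + col] = sum;
+//        }
+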
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_chroma_vert_w16out_av8
+
+.type ihevc_inter_pred_chroma_vert_w16out_av8, %function
+
+ihevc_inter_pred_chroma_vert_w16out_av8:
+
+ // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+
+ mov x4,x16 //loads ht
+ mov x12,x15 //loads pi1_coeff
+ cmp x4,#0 //checks ht == 0
+ mov x6,x17 //loads wd
+ sub x0,x0,x2 //pu1_src - src_strd
+ ld1 {v0.8b},[x12] //loads pi1_coeff
+
+ ble end_loops //jumps to end
+
+ tst x6,#3 //checks (wd & 3)
+ abs v3.8b, v0.8b //vabs_s8(coeff)
+ lsl x10,x6,#1 //2*wd
+ dup v0.8b, v3.8b[0] //coeffabs_0
+ dup v1.8b, v3.8b[1] //coeffabs_1
+ dup v2.8b, v3.8b[2] //coeffabs_2
+ dup v3.8b, v3.8b[3] //coeffabs_3
+
+ bgt outer_loop_wd_2 //jumps to loop handling wd ==2
+
+ tst x4,#7 //checks ht for mul of 8
+ beq core_loop_ht_8 //when height is multiple of 8
+
+    lsl x7,x3,#2                        //4*dst_strd
+ sub x9,x7,x10,lsl #1 //4*dst_strd - 4wd
+ lsl x12,x2,#1 //2*src_strd
+ sub x8,x12,x10 //2*src_strd - 2wd
+ lsl x3, x3, #1
+ mov x5,x10 //2wd
+
+inner_loop_ht_2: //called when wd is multiple of 4 and ht is 4,2
+
+ add x6,x0,x2 //pu1_src +src_strd
+ ld1 {v9.8b},[x6],x2 //loads pu1_src
+ subs x5,x5,#8 //2wd - 8
+ ld1 {v5.8b},[x0],#8 //loads src
+ umull v6.8h, v9.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ ld1 {v4.8b},[x6],x2 //loads incremented src
+ umlsl v6.8h, v5.8b, v0.8b //vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)
+ ld1 {v8.8b},[x6],x2 //loads incremented src
+ umlal v6.8h, v4.8b, v2.8b //vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)
+ umull v4.8h, v4.8b, v1.8b
+ ld1 {v10.8b},[x6] //loads the incremented src
+ umlsl v6.8h, v8.8b, v3.8b
+ umlsl v4.8h, v9.8b, v0.8b
+ umlal v4.8h, v8.8b, v2.8b
+ umlsl v4.8h, v10.8b, v3.8b
+ add x6,x1,x3 //pu1_dst + dst_strd
+ st1 { v6.8h},[x1],#16 //stores the loaded value
+
+ st1 { v4.8h},[x6] //stores the loaded value
+
+ bgt inner_loop_ht_2 //inner loop again
+
+ subs x4,x4,#2 //ht - 2
+ add x1,x1,x9 //pu1_dst += (2*dst_strd - 2wd)
+ mov x5,x10 //2wd
+ add x0,x0,x8 //pu1_src += (2*src_strd - 2wd)
+
+ bgt inner_loop_ht_2 //loop again
+
+ b end_loops //jumps to end
+
+outer_loop_wd_2: //called when width is multiple of 2
+    lsl x5,x3,#2                        //4*dst_strd
+ mov x12,x10 //2wd
+ sub x9,x5,x10,lsl #1 //4*dst_strd - 4wd
+ lsl x7,x2,#1 //2*src_strd
+ sub x8,x7,x10 //2*src_strd - 2wd
+
+inner_loop_wd_2:
+
+ add x6,x0,x2 //pu1_src + src_strd
+    ld1 {v6.s}[0],[x0]                    //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)
+ subs x12,x12,#4 //2wd - 4
+ add x0,x0,#4 //pu1_src + 4
+ ld1 {v6.s}[1],[x6],x2 //loads pu1_src_tmp
+ dup v7.2s, v6.2s[1]
+ ld1 {v7.s}[1],[x6],x2 //loads pu1_src_tmp
+ umull v4.8h, v7.8b, v1.8b //vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)
+ dup v7.2s, v7.2s[1]
+ ld1 {v7.s}[1],[x6],x2
+ umlsl v4.8h, v6.8b, v0.8b
+ umlal v4.8h, v7.8b, v2.8b
+ dup v7.2s, v7.2s[1]
+ ld1 {v7.s}[1],[x6]
+ add x6,x1,x3,lsl #1 //pu1_dst + dst_strd
+ umlsl v4.8h, v7.8b, v3.8b
+ st1 {v4.d}[0],[x1] //stores the loaded value
+ add x1,x1,#8 //pu1_dst += 4
+ st1 {v4.d}[1],[x6] //stores the loaded value
+
+ bgt inner_loop_wd_2 //inner loop again
+
+ //inner loop ends
+ subs x4,x4,#2 //ht - 2
+ add x1,x1,x9 //pu1_dst += 2*dst_strd - 2*wd
+ mov x12,x10 //2wd
+ add x0,x0,x8 //pu1_src += 2*src_strd - 2*wd
+
+ bgt inner_loop_wd_2 //loop again
+
+ b end_loops //jumps to end
+
+core_loop_ht_8: //when wd & ht is multiple of 8
+
+ lsl x12,x3,#3 //4*dst_strd
+ sub x8,x12,x10,lsl #1 //4*dst_strd - 2wd
+ lsl x12,x2,#2 //4*src_strd
+ sub x9,x12,x10 //4*src_strd - 2wd
+
+ bic x5,x10,#7 //x5 ->wd
+ lsr x14, x10, #3 //divide by 8
+ mul x12, x4 , x14 //multiply height by width
+    sub x12, x12,#4                        //subtract 4 (one iteration) for the epilog
+ lsl x3, x3, #1
+
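+//like the other pipelined paths in this directory, this loop overlaps work on
+//four output rows: 'prolog' fills accumulators v30/v28/v26/v24, 'kernel_8'
+//stores completed rows while computing new ones, and 'epilog' drains the last
+//four (the 'sub x12, x12, #4' above reserves them).
+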
+prolog:
+ add x6,x0,x2 //pu1_src + src_strd
+ ld1 {v5.8b},[x6],x2 //loads pu1_src
+ subs x5,x5,#8 //2wd - 8
+ ld1 {v4.8b},[x0],#8 //loads the source
+ ld1 {v6.8b},[x6],x2 //load and increment
+ umull v30.8h, v5.8b, v1.8b //mul with coeff 1
+ ld1 {v7.8b},[x6],x2 //load and increment
+ umlsl v30.8h, v4.8b, v0.8b
+ add x7,x1,x3 //pu1_dst
+ umlal v30.8h, v6.8b, v2.8b
+ umlsl v30.8h, v7.8b, v3.8b
+ ld1 {v8.8b},[x6],x2 //load and increment
+
+ umull v28.8h, v6.8b, v1.8b //mul_res 2
+    add x20,x0,x9                        //pu1_src += 4*src_strd - 2*wd
+ csel x0, x20, x0,le
+ umlsl v28.8h, v5.8b, v0.8b
+ bic x20,x10,#7 //x5 ->wd
+ csel x5, x20, x5,le
+ umlal v28.8h, v7.8b, v2.8b
+ ld1 {v9.8b},[x6],x2
+ umlsl v28.8h, v8.8b, v3.8b
+
+ ld1 {v10.8b},[x6],x2
+ umull v26.8h, v7.8b, v1.8b
+ add x6,x0,x2 //pu1_src + src_strd
+ umlsl v26.8h, v6.8b, v0.8b
+ st1 { v30.16b},[x1],#16 //stores the loaded value
+ umlal v26.8h, v8.8b, v2.8b
+ ld1 {v4.8b},[x0],#8 //loads the source
+ umlsl v26.8h, v9.8b, v3.8b
+
+    add x20,x1,x8                        //pu1_dst += 4*dst_strd - 2*wd
+ csel x1, x20, x1,le
+ umull v24.8h, v8.8b, v1.8b
+ ld1 {v5.8b},[x6],x2 //loads pu1_src
+ umlsl v24.8h, v7.8b, v0.8b
+ subs x12,x12,#4
+ ld1 {v6.8b},[x6],x2 //load and increment
+ umlal v24.8h, v9.8b, v2.8b
+ ld1 {v7.8b},[x6],x2 //load and increment
+ umlsl v24.8h, v10.8b, v3.8b
+    sub x20,x2,x2,lsl #3                //x20 = -7*src_strd
+    neg x11, x20                        //x11 = 7*src_strd (prefetch offset)
+    add x14,x2,x2,lsl #1                //x14 = 3*src_strd
+    add x14,x14,x11                        //x14 = 10*src_strd (prefetch limit)
+ st1 { v28.16b},[x7],x3 //stores the loaded value
+
+ ble epilog //jumps to epilog
+
+kernel_8:
+
+ umull v30.8h, v5.8b, v1.8b //mul with coeff 1
+ subs x5,x5,#8 //2wd - 8
+ umlsl v30.8h, v4.8b, v0.8b
+    add x20,x0,x9                        //pu1_src += 4*src_strd - 2*wd
+ csel x0, x20, x0,le
+ umlal v30.8h, v6.8b, v2.8b
+
+ lsl x20,x2,#3
+ sub x20,x20,x2
+ csel x11,x20,x11,le
+ //rsble x11,x2,x2,lsl #3
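+    //aarch64 has no conditionally executed alu instructions, so the armv7
+    //'rsble' kept above for reference is emulated: the value is computed
+    //unconditionally into the scratch register x20, then committed with
+    //'csel ..., le' only when the earlier 'subs' set the le condition.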
+ umlsl v30.8h, v7.8b, v3.8b
+ st1 { v26.16b},[x7],x3 //stores the loaded value
+
+ ld1 {v8.8b},[x6],x2 //load and increment
+
+ umull v28.8h, v6.8b, v1.8b //mul_res 2
+ bic x20,x10,#7 //x5 ->wd
+ csel x5, x20, x5,le
+ umlsl v28.8h, v5.8b, v0.8b
+ st1 { v24.16b},[x7],x3 //stores the loaded value
+
+ umlal v28.8h, v7.8b, v2.8b
+ ld1 {v9.8b},[x6],x2
+
+ umlsl v28.8h, v8.8b, v3.8b
+ ld1 {v10.8b},[x6],x2
+ add x7,x1,x3 //pu1_dst
+ umull v26.8h, v7.8b, v1.8b
+ add x6,x0,x2 //pu1_src + src_strd
+ add x20,x0, x11
+ prfm PLDL1KEEP,[x20]
+
+ umlsl v26.8h, v6.8b, v0.8b
+ ld1 {v4.8b},[x0],#8 //loads the source
+
+ add x11,x11,x2
+ umlal v26.8h, v8.8b, v2.8b
+ st1 { v30.16b},[x1],#16 //stores the loaded value
+
+ umlsl v26.8h, v9.8b, v3.8b
+ ld1 {v5.8b},[x6],x2 //loads pu1_src
+
+ umull v24.8h, v8.8b, v1.8b
+ ld1 {v6.8b},[x6],x2 //load and increment
+    add x20,x1,x8                        //pu1_dst += 4*dst_strd - 2*wd
+ csel x1, x20, x1,le
+
+ cmp x11,x14
+
+ lsl x20,x2,#3
+ sub x20,x20,x2
+ csel x11,x20,x11,gt
+ //rsbgt x11,x2,x2,lsl #3
+
+ umlsl v24.8h, v7.8b, v0.8b
+ subs x12,x12,#4
+
+
+ umlal v24.8h, v9.8b, v2.8b
+ ld1 {v7.8b},[x6],x2 //load and increment
+
+ umlsl v24.8h, v10.8b, v3.8b
+ st1 { v28.16b},[x7],x3 //stores the loaded value
+
+ bgt kernel_8 //jumps to kernel_8
+
+epilog:
+
+ umull v30.8h, v5.8b, v1.8b //mul with coeff 1
+ umlsl v30.8h, v4.8b, v0.8b
+ umlal v30.8h, v6.8b, v2.8b
+ umlsl v30.8h, v7.8b, v3.8b
+ st1 { v26.16b},[x7],x3 //stores the loaded value
+
+ ld1 {v8.8b},[x6],x2 //load and increment
+ umull v28.8h, v6.8b, v1.8b //mul_res 2
+ umlsl v28.8h, v5.8b, v0.8b
+ umlal v28.8h, v7.8b, v2.8b
+ umlsl v28.8h, v8.8b, v3.8b
+ st1 { v24.16b},[x7],x3 //stores the loaded value
+
+ ld1 {v9.8b},[x6],x2
+ umull v26.8h, v7.8b, v1.8b
+ add x7,x1,x3 //pu1_dst
+ umlsl v26.8h, v6.8b, v0.8b
+ st1 { v30.16b},[x1],#16 //stores the loaded value
+ umlal v26.8h, v8.8b, v2.8b
+ ld1 {v10.8b},[x6],x2
+ umlsl v26.8h, v9.8b, v3.8b
+
+ umull v24.8h, v8.8b, v1.8b
+ st1 { v28.16b},[x7],x3 //stores the loaded value
+ umlsl v24.8h, v7.8b, v0.8b
+ umlal v24.8h, v9.8b, v2.8b
+ st1 { v26.16b},[x7],x3 //stores the loaded value
+ umlsl v24.8h, v10.8b, v3.8b
+
+ st1 { v24.16b},[x7],x3 //stores the loaded value
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_horz.s b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
new file mode 100644
index 0000000..1e246da
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
@@ -0,0 +1,605 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//* ihevc_inter_pred_filters_luma_horz.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//* parthiban v
+//*
+//* //par list of functions:
+//*
+//* - ihevc_inter_pred_luma_horz()
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+//
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//*  interprediction luma filter for horizontal input
+//*
+//* //par description:
+//*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//*     to the elements pointed to by 'pu1_src' and writes to the location
+//*     pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped to
+//*     8 bits. assumptions: the function is optimized assuming width is a
+//*     multiple of 4 or 8, and height is a multiple of 2.
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_horz (
+// uword8 *pu1_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
+//**************variables vs registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => *pi1_coeff
+// x5 => ht
+// x6 => wd
+
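+//a minimal c sketch of what the code below computes, based on the header
+//above; illustrative only - the loop structure and the CLIP_U8 helper are
+//assumptions, not part of this file. each output is an 8-tap horizontal sum
+//(the entry code rewinds pu1_src by 3, hence k - 3), rounded, downshifted by
+//6 and clipped to 8 bits:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 sum = 0;
+//            for(k = 0; k < 8; k++)
+//                sum += pu1_src[row * src_strd + col + k - 3] * pi1_coeff[k];
+//            pu1_dst[row * dst_strd + col] = CLIP_U8((sum + 32) >> 6);    //sqrshrun #6
+//        }
+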
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_horz_av8
+
+.type ihevc_inter_pred_luma_horz_av8, %function
+
+ihevc_inter_pred_luma_horz_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ //str x1,[sp,#-4]
+ // mov x7,#8192
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+start_loop_count:
+ // ldr x1,[sp,#-4]
+
+
+ mov x4,x15 //loads pi1_coeff
+ mov x8,x16 //loads ht
+ mov x10,x17 //loads wd
+
+ ld1 {v0.8b},[x4] //coeff = vld1_s8(pi1_coeff)
+ mov x11,#1
+ subs x14,x8,#0 //checks for ht == 0
+
+ abs v2.8b, v0.8b //vabs_s8(coeff)
+
+ //ble end_loops
+
+
+ dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub x12,x0,#3 //pu1_src - 3
+ dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add x4,x12,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
+ dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ sub x20,x10,x2,lsl #1 //2*src_strd - wd
+ neg x9, x20
+ dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ sub x20,x10,x3,lsl #1 //2*dst_strd - wd
+ neg x8, x20
+ dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+ dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+ // tst x10,#7 //checks wd for multiples
+ dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+ dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+ mov x7,x1
+
+ cmp x10,#4
+ ble outer_loop_4
+
+ cmp x10,#24
+ mov x20,#16
+ csel x10, x20, x10,eq
+ add x20, x8,#8
+ csel x8, x20, x8,eq
+ add x20, x9,#8
+ csel x9, x20, x9,eq
+
+ cmp x10,#16
+ bge outer_loop_16
+
+ cmp x10,#12
+ add x20, x8,#4
+ csel x8, x20, x8,eq
+ add x20, x9,#4
+ csel x9, x20, x9,eq
+ b outer_loop_8
+
+
+outer_loop8_residual:
+ sub x12,x0,#3 //pu1_src - 3
+ mov x1,x7
+ mov x14,#32
+ add x1, x1,#16
+ add x12, x12,#16
+ mov x10,#8
+ add x8, x8,#8
+ add x9, x9,#8
+
+outer_loop_8:
+
+ add x6,x1,x3 //pu1_dst + dst_strd
+ add x4,x12,x2 //pu1_src + src_strd
+ subs x5,x10,#0 //checks wd
+
+ ble end_inner_loop_8
+
+inner_loop_8:
+ ld1 {v0.2s},[x12],x11 //vector load pu1_src
+ ld1 {v1.2s},[x12],x11
+ ld1 {v2.2s},[x12],x11
+ ld1 {v3.2s},[x12],x11
+
+
+
+
+
+ // vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ // vext.u8 d3,d0,d1,#3 //vector extract of src[0_3]
+ // vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ // vext.u8 d5,d0,d1,#5 //vector extract of src[0_5]
+ // vext.u8 d6,d0,d1,#6 //vector extract of src [0_6]
+ // vext.u8 d7,d0,d1,#7 //vector extract of src[0_7]
+ // vext.u8 d1,d0,d1,#1 //vector extract of src[0_1]
+ // vext.u8 d14,d12,d13,#2
+
+ //vext.u8 d15,d12,d13,#3 //vector extract of src[0_3]
+ // vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ // vext.u8 d17,d12,d13,#5 //vector extract of src[0_5]
+ //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+ //vext.u8 d19,d12,d13,#7 //vector extract of src[0_7]
+ //vext.u8 d13,d12,d13,#1 //vector extract of src[0_1]
+ ld1 {v4.2s},[x12],x11
+ umull v8.8h, v1.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 {v5.2s},[x12],x11
+ umlal v8.8h, v3.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v6.2s},[x12],x11
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ ld1 {v7.2s},[x12],x11
+ umlsl v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ ld1 {v12.2s},[x4],x11 //vector load pu1_src + src_strd
+ umlal v8.8h, v4.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ ld1 {v13.2s},[x4],x11
+ umlsl v8.8h, v5.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ ld1 {v14.2s},[x4],x11
+ umlal v8.8h, v6.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ ld1 {v15.2s},[x4],x11
+ umlsl v8.8h, v7.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ ld1 {v16.2s},[x4],x11 //vector load pu1_src + src_strd
+
+ umull v10.8h, v15.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v17.2s},[x4],x11
+ umlsl v10.8h, v14.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ ld1 {v18.2s},[x4],x11
+ umlal v10.8h, v16.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ ld1 {v19.2s},[x4],x11 //vector load pu1_src + src_strd
+ umlsl v10.8h, v17.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ sqrshrun v20.8b, v8.8h,#6 //right shift and saturating narrow result 1
+ umlal v10.8h, v18.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ umlsl v10.8h, v19.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ st1 {v20.8b},[x1],#8 //store the result pu1_dst
+ umlsl v10.8h, v12.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlal v10.8h, v13.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+
+ sqrshrun v8.8b, v10.8h,#6 //right shift and saturating narrow result 2
+ subs x5,x5,#8 //decrement the wd loop
+ st1 {v8.8b},[x6],#8 //store the result pu1_dst
+ cmp x5,#4
+ bgt inner_loop_8
+
+end_inner_loop_8:
+ subs x14,x14,#2 //decrement the ht loop
+ add x12,x12,x9 //increment the src pointer by 2*src_strd-wd
+ add x1,x1,x8 //increment the dst pointer by 2*dst_strd-wd
+ bgt outer_loop_8
+
+
+
+
+
+ mov x10,x17 //loads wd
+ cmp x10,#12
+
+ beq outer_loop4_residual
+
+
+end_loops:
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+outer_loop_16:
+ mov x15, #-7
+ stp x0,x7, [sp, #-16]!
+
+ add x6,x1,x3 //pu1_dst + dst_strd
+ add x4,x12,x2 //pu1_src + src_strd
+ and x0, x12, #31
+ sub x5,x10,#0 //checks wd
+ //ble end_loops1
+ add x20,x12, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ ld1 { v0.2s},[x12],#8 //vector load pu1_src
+ ld1 { v1.2s},[x12],x15 //vector load pu1_src
+ add x20,x4, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ ld1 { v2.2s},[x12],#8
+ ld1 { v3.2s},[x12],x15
+ ld1 { v4.2s},[x12],#8
+ ld1 { v5.2s},[x12],x15
+ ld1 { v6.2s},[x12],#8
+ ld1 { v7.2s},[x12],x15
+ ld1 { v12.2s},[x12],#8
+ ld1 { v13.2s},[x12],x15
+ umull v8.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 { v14.2s},[x12],#8
+ ld1 { v15.2s},[x12],x15
+ umlal v8.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 { v16.2s},[x12],#8
+ ld1 { v17.2s},[x12],x15
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ ld1 { v18.2s},[x12],#8
+ ld1 { v19.2s},[x12],x15
+ umlsl v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v8.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ umlsl v8.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ umlal v8.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ umlsl v8.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+
+inner_loop_16:
+
+
+ subs x5,x5,#16
+ umull v20.8h, v3.8b, v25.8b
+
+ add x12, x12,#8
+ umlsl v20.8h, v1.8b, v24.8b
+
+ sub x20,x14,#2
+ csel x14, x20, x14,eq
+ umlal v20.8h, v7.8b, v27.8b
+
+ ld1 { v0.2s},[x4],#8 //vector load pu1_src
+ ld1 { v1.2s},[x4],x15 //vector load pu1_src
+
+ umlsl v20.8h, v5.8b, v26.8b
+
+ ld1 { v2.2s},[x4],#8
+ ld1 { v3.2s},[x4],x15
+
+ umlal v20.8h, v13.8b, v28.8b
+
+ ld1 { v4.2s},[x4],#8
+ ld1 { v5.2s},[x4],x15
+ umlal v20.8h, v17.8b, v30.8b
+
+ ld1 { v6.2s},[x4],#8
+ ld1 { v7.2s},[x4],x15
+ umlsl v20.8h, v15.8b, v29.8b
+
+ ld1 { v12.2s},[x4],#8
+ ld1 { v13.2s},[x4],x15
+ umlsl v20.8h, v19.8b, v31.8b
+
+ ld1 { v14.2s},[x4],#8
+ ld1 { v15.2s},[x4],x15
+ sqrshrun v8.8b, v8.8h,#6 //right shift and saturating narrow result 1
+
+ ld1 { v16.2s},[x4],#8
+ ld1 { v17.2s},[x4],x15
+ umull v10.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 { v18.2s},[x4],#8
+ ld1 { v19.2s},[x4],x15
+ umlal v10.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ add x4, x4,#8
+ umlsl v10.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+
+ add x20,x12,x9 //increment the src pointer by 2*src_strd-wd
+ csel x12, x20, x12,eq
+ umlsl v10.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ add x20,x12,x2 //pu1_src + src_strd
+ csel x4, x20, x4,eq
+ sqrshrun v9.8b, v20.8h,#6
+
+ umlal v10.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+
+// and x7, x12, #31
+ umlsl v10.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+
+ umlal v10.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+
+ umlsl v10.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+ umull v22.8h, v3.8b, v25.8b
+
+ umlsl v22.8h, v1.8b, v24.8b
+
+ st1 { v8.8b},[x1],#8 //store the result pu1_dst
+ st1 { v9.8b},[x1],#8 //store the result pu1_dst
+ umlal v22.8h, v7.8b, v27.8b
+
+ add x20,x1,x8
+ csel x1, x20, x1,eq
+ sqrshrun v10.8b, v10.8h,#6 //right shift and saturating narrow result 2
+
+// cmp x7, x0
+ umlsl v22.8h, v5.8b, v26.8b
+
+ add x20,x12, x2, lsl #2
+ prfm PLDL1KEEP,[x20]
+ umlal v22.8h, v13.8b, v28.8b
+
+ add x20,x4, x2, lsl #2
+ prfm PLDL1KEEP,[x20]
+ umlal v22.8h, v17.8b, v30.8b
+
+// mov x0, x7
+ umlsl v22.8h, v15.8b, v29.8b
+
+ cmp x14,#0
+ umlsl v22.8h, v19.8b, v31.8b
+
+ beq epilog_16
+ ld1 { v0.2s},[x12],#8 //vector load pu1_src
+ ld1 { v1.2s},[x12],x15 //vector load pu1_src
+ ld1 { v2.2s},[x12],#8
+ ld1 { v3.2s},[x12],x15
+ ld1 { v4.2s},[x12],#8
+ ld1 { v5.2s},[x12],x15
+ ld1 { v6.2s},[x12],#8
+ ld1 { v7.2s},[x12],x15
+ ld1 { v12.2s},[x12],#8
+ ld1 { v13.2s},[x12],x15
+ sqrshrun v11.8b, v22.8h,#6
+ umull v8.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 { v14.2s},[x12],#8
+ ld1 { v15.2s},[x12],x15
+ umlal v8.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 { v16.2s},[x12],#8
+ ld1 { v17.2s},[x12],x15
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ ld1 { v18.2s},[x12],#8
+ ld1 { v19.2s},[x12],x15
+ umlsl v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v8.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ cmp x5,#0
+ umlsl v8.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ csel x5, x10, x5,eq
+ umlal v8.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ st1 { v10.8b},[x6],#8 //store the result pu1_dst
+ st1 { v11.8b},[x6],#8 //store the result pu1_dst
+ umlsl v8.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ add x20,x1,x3 //pu1_dst + dst_strd
+ csel x6, x20, x6,eq
+ b inner_loop_16
+
+
+epilog_16:
+ sqrshrun v11.8b, v22.8h,#6
+ st1 { v10.8b},[x6],#8 //store the result pu1_dst
+ st1 { v11.8b},[x6],#8 //store the result pu1_dst
+
+ ldp x0,x7, [sp], #16
+ mov x10,x17
+ cmp x10,#24
+
+ beq outer_loop8_residual
+
+
+
+end_loops1:
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+outer_loop4_residual:
+ sub x12,x0,#3 //pu1_src - 3
+ mov x1,x7
+ add x1, x1,#8
+ mov x10,#4
+ add x12, x12,#8
+ mov x14,#16
+ add x8, x8,#4
+ add x9, x9,#4
+
+outer_loop_4:
+ add x6,x1,x3 //pu1_dst + dst_strd
+ add x4,x12,x2 //pu1_src + src_strd
+
+ subs x5,x10,#0 //checks wd
+ ble end_inner_loop_4
+
+inner_loop_4:
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11
+ ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd
+ ld1 {v23.2s},[x4],x11
+
+ zip1 v0.2s, v20.2s, v22.2s
+    zip2 v12.2s, v20.2s, v22.2s            //vector zip of the i and ii iterations in a single register
+ zip1 v1.2s, v21.2s, v23.2s
+ zip2 v13.2s, v21.2s, v23.2s
+
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11
+ ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd
+ ld1 {v23.2s},[x4],x11
+
+ zip1 v2.2s, v20.2s, v22.2s
+ zip2 v14.2s, v20.2s, v22.2s
+ zip1 v3.2s, v21.2s, v23.2s
+ zip2 v15.2s, v21.2s, v23.2s
+
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11
+ ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd
+ ld1 {v23.2s},[x4],x11
+
+ zip1 v4.2s, v20.2s, v22.2s
+ zip2 v16.2s, v20.2s, v22.2s
+ zip1 v5.2s, v21.2s, v23.2s
+ zip2 v17.2s, v21.2s, v23.2s
+
+ ld1 {v20.2s},[x12],x11 //vector load pu1_src
+ ld1 {v21.2s},[x12],x11
+ ld1 {v22.2s},[x4],x11 //vector load pu1_src + src_strd
+ ld1 {v23.2s},[x4],x11
+
+ zip1 v6.2s, v20.2s, v22.2s
+ zip2 v18.2s, v20.2s, v22.2s
+ zip1 v7.2s, v21.2s, v23.2s
+ zip2 v19.2s, v21.2s, v23.2s
+
+ //add x12,x12,#4 //increment the input pointer
+ sub x12,x12,#4
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ //vext.u8 d3,d0,d1,#3 //vector extract of src[0_3]
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+
+ //vext.u8 d5,d0,d1,#5 //vector extract of src[0_5]
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+ //vext.u8 d7,d0,d1,#7 //vector extract of src[0_7]
+ //vext.u8 d1,d0,d1,#1 //vector extract of src[0_1]
+
+ sub x4,x4,#4
+ // add x4,x4,#4 //increment the input pointer
+ // vext.u8 d14,d12,d13,#2 //vector extract of src[0_2]
+ // vext.u8 d15,d12,d13,#3 //vector extract of src[0_3]
+ // vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ // vext.u8 d17,d12,d13,#5 //vector extract of src[0_5]
+ // vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+ // vext.u8 d19,d12,d13,#7 //vector extract of src[0_7]
+ //vext.u8 d13,d12,d13,#1 //vector extract of src[0_1]
+
+    umull v8.8h, v1.8b, v25.8b            //arithmetic operations for the ii iteration at the same time
+ umlsl v8.8h, v0.8b, v24.8b
+ umlsl v8.8h, v2.8b, v26.8b
+ umlal v8.8h, v3.8b, v27.8b
+ umlal v8.8h, v4.8b, v28.8b
+ umlsl v8.8h, v5.8b, v29.8b
+ umlal v8.8h, v6.8b, v30.8b
+ umlsl v8.8h, v7.8b, v31.8b
+
+ sqrshrun v8.8b, v8.8h,#6 //narrow right shift and saturating the result
+    st1 {v8.s}[0],[x1],#4                //store the i iteration result which is in the lower part of the register
+    st1 {v8.s}[1],[x6],#4                //store the ii iteration result which is in the upper part of the register
+ subs x5,x5,#4 //decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4:
+    subs x14,x14,#2                        //decrement the ht by 2
+ add x12,x12,x9 //increment the input pointer 2*src_strd-wd
+ add x1,x1,x8 //increment the output pointer 2*dst_strd-wd
+ bgt outer_loop_4
+ //subs x7,x7,#1
+ // bgt start_loop_count
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert.s b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
new file mode 100644
index 0000000..48dc30f
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
@@ -0,0 +1,522 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//* ihevc_inter_pred_filters_luma_vert.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//* parthiban v
+//*
+//* //par list of functions:
+//*
+//* - ihevc_inter_pred_luma_vert()
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+
+
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* interprediction luma filter for vertical input
+//*
+//* //par description:
+//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//*    the elements pointed to by 'pu1_src' and writes to the location pointed
+//*    to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
+//*    assumptions: the function is optimized assuming width is a multiple of
+//*    4 or 8, and height is a multiple of 2.
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_vert (
+// uword8 *pu1_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
+//**************variables vs registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x6 => dst_strd
+// x12 => *pi1_coeff
+// x5 => ht
+// x3 => wd
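+
+//a minimal c sketch of what the code below computes, based on the header
+//above; illustrative only - the loop structure and the CLIP_U8 helper are
+//assumptions, not part of this file. each output is an 8-tap vertical sum
+//(the entry code rewinds pu1_src by 3 rows, hence k - 3), rounded,
+//downshifted by 6 and clipped to 8 bits:
+//
+//    for(row = 0; row < ht; row++)
+//        for(col = 0; col < wd; col++)
+//        {
+//            word32 sum = 0;
+//            for(k = 0; k < 8; k++)
+//                sum += pu1_src[(row + k - 3) * src_strd + col] * pi1_coeff[k];
+//            pu1_dst[row * dst_strd + col] = CLIP_U8((sum + 32) >> 6);    //sqrshrun #6
+//        }
+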
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_av8
+
+.type ihevc_inter_pred_luma_vert_av8, %function
+
+ihevc_inter_pred_luma_vert_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x12,x15 //load pi1_coeff
+ mov x6,x3
+ mov x5,x17 //load wd
+ ld1 {v0.8b},[x12] //coeff = vld1_s8(pi1_coeff)
+    sub x12,x2,x2,lsl #2                //x12 = -3*src_strd
+    abs v0.8b, v0.8b                //vabs_s8(coeff)
+    add x0,x0,x12                    //pu1_src -= 3*src_strd (rewind to the first tap row)
+ mov x3,x16 //load ht
+ subs x7,x3,#0 //x3->ht
+ //ble end_loops //end loop jump
+ dup v22.8b, v0.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ cmp x5,#8
+ dup v23.8b, v0.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.8b, v0.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.8b, v0.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.8b, v0.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.8b, v0.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.8b, v0.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.8b, v0.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+ blt core_loop_wd_4 //core loop wd 4 jump
+ stp x0,x1, [sp, #-16]!
+
+ bic x4,x5,#7 //x5 ->wd
+ sub x20,x4,x6,lsl #2 //x6->dst_strd x5 ->wd
+ neg x9, x20
+ sub x20,x4,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ lsr x3, x5, #3 //divide by 8
+ mul x7, x7, x3 //multiply height by width
+    sub x7, x7,#4                        //subtract 4 (one iteration) for the epilog
+
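+//the wd >= 8 path below is software pipelined: 'prolog' primes accumulators
+//v8/v10/v12/v14 for four output rows, 'kernel_8' is the steady state, and
+//'epilog'/'epilog_end' drain the pipeline (the 'sub x7, x7, #4' above
+//reserves those final iterations).
+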
+prolog:
+
+ and x10, x0, #31
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ subs x4,x4,#8
+ ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+
+ ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+ add x20,x0,x8
+ csel x0, x20, x0,le
+ umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+ bic x20,x5,#7 //x5 ->wd
+ csel x4, x20, x4,le
+ umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+ prfm PLDL1KEEP,[x3]
+ umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ add x20,x3, x2
+ prfm PLDL1KEEP,[x20]
+ umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ add x20,x3, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+
+ add x3, x3, x2
+ umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+
+ add x20,x3, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v12.8h, v3.8b, v23.8b
+ ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v12.8h, v2.8b, v22.8b
+ ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v12.8h, v4.8b, v24.8b
+ umlal v12.8h, v5.8b, v25.8b
+ umlal v12.8h, v6.8b, v26.8b
+ umlsl v12.8h, v7.8b, v27.8b
+ umlal v12.8h, v16.8b, v28.8b
+ umlsl v12.8h, v17.8b, v29.8b
+ add x14,x1,x6
+ st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)//
+ sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ add x20,x1,x9
+ csel x1, x20, x1,le
+
+ umull v14.8h, v4.8b, v23.8b
+ subs x7,x7,#4
+ umlsl v14.8h, v3.8b, v22.8b
+ umlsl v14.8h, v5.8b, v24.8b
+ umlal v14.8h, v6.8b, v25.8b
+ ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v7.8b, v26.8b
+ ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v16.8b, v27.8b
+ ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v17.8b, v28.8b
+ ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v18.8b, v29.8b
+ ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+ st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v12.8b, v12.8h,#6
+
+
+ blt epilog_end //jumps to epilog_end
+ beq epilog //jumps to epilog
+
+kernel_8:
+
+ subs x4,x4,#8
+ umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+
+ add x20,x0,x8
+ csel x0, x20, x0,le
+ umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+
+ bic x20,x5,#7 //x5 ->wd
+ csel x4, x20, x4,le
+ umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+
+ ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+
+ ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+
+ ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+
+ umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+
+ umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v12.8b},[x14],x6
+
+// and x11, x0, #31
+ sqrshrun v14.8b, v14.8h,#6
+
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+ ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+ umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+ ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+
+ st1 {v14.8b},[x14],x6
+ umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+
+ add x14,x1,#0
+ umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+
+ add x1, x1, #8
+ umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+
+ umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+ add x20,x1,x9
+ csel x1, x20, x1,le
+ sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+// cmp x11, x10
+ umull v12.8h, v3.8b, v23.8b
+
+ add x10, x3, x2, lsl #3 // 10*strd - 8+2
+ umlsl v12.8h, v2.8b, v22.8b
+
+ add x10, x10, x2 // 11*strd
+ umlsl v12.8h, v4.8b, v24.8b
+
+ ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlal v12.8h, v5.8b, v25.8b
+
+ umlal v12.8h, v6.8b, v26.8b
+ st1 {v8.8b},[x14],x6 //vst1_u8(pu1_dst,sto_res)//
+
+ prfm PLDL1KEEP,[x10] //11+ 0
+ umlsl v12.8h, v7.8b, v27.8b
+
+ add x20,x10, x2
+ prfm PLDL1KEEP,[x20] //11+ 1*strd
+ umlal v12.8h, v16.8b, v28.8b
+
+ add x20,x10, x2, lsl #1
+ prfm PLDL1KEEP,[x20] //11+ 2*strd
+ umlsl v12.8h, v17.8b, v29.8b
+
+ add x10, x10, x2 //12*strd
+ sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ add x20,x10, x2, lsl #1
+ prfm PLDL1KEEP,[x20] //11+ 3*strd
+ umull v14.8h, v4.8b, v23.8b
+
+// mov x10, x11
+ umlsl v14.8h, v3.8b, v22.8b
+
+ subs x7,x7,#4
+ umlsl v14.8h, v5.8b, v24.8b
+
+ umlal v14.8h, v6.8b, v25.8b
+ ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v7.8b, v26.8b
+ ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v16.8b, v27.8b
+ ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v17.8b, v28.8b
+ ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v18.8b, v29.8b
+ ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+ sqrshrun v12.8b, v12.8h,#6
+ st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+
+
+
+ bgt kernel_8 //jumps to kernel_8
+
+epilog:
+
+ umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v12.8b},[x14],x6
+
+ sqrshrun v14.8b, v14.8h,#6
+
+ ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v14.8b},[x14],x6
+
+ sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umull v12.8h, v3.8b, v23.8b
+ umlsl v12.8h, v2.8b, v22.8b
+ umlsl v12.8h, v4.8b, v24.8b
+ umlal v12.8h, v5.8b, v25.8b
+ umlal v12.8h, v6.8b, v26.8b
+ umlsl v12.8h, v7.8b, v27.8b
+ umlal v12.8h, v16.8b, v28.8b
+ umlsl v12.8h, v17.8b, v29.8b
+ add x14,x1,x6
+ st1 {v8.8b},[x1],#8 //vst1_u8(pu1_dst,sto_res)//
+ sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v14.8h, v4.8b, v23.8b
+ umlsl v14.8h, v3.8b, v22.8b
+ umlsl v14.8h, v5.8b, v24.8b
+ umlal v14.8h, v6.8b, v25.8b
+ umlal v14.8h, v7.8b, v26.8b
+ umlsl v14.8h, v16.8b, v27.8b
+ umlal v14.8h, v17.8b, v28.8b
+ umlsl v14.8h, v18.8b, v29.8b
+
+ st1 {v10.8b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v12.8b, v12.8h,#6
+
+epilog_end:
+ st1 {v12.8b},[x14],x6
+ sqrshrun v14.8b, v14.8h,#6
+
+ st1 {v14.8b},[x14],x6
+
+
+end_loops:
+ tst x5,#7
+ ldp x0,x1, [sp],#16
+
+ // ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp
+ bne lbl409
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+lbl409:
+ mov x5, #4
+ add x0, x0, #8
+ add x1, x1, #8
+ mov x7, #16
+
+core_loop_wd_4:
+ sub x20,x5,x6,lsl #2 //x6->dst_strd x5 ->wd
+ neg x9, x20
+ sub x20,x5,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ movi v4.8b, #0
+
+outer_loop_wd_4:
+ subs x12,x5,#0
+ ble end_inner_loop_wd_4 //outer loop jump
+
+inner_loop_wd_4:
+ add x3,x0,x2
+ ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+ subs x12,x12,#4
+ dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+ ld1 {v4.s}[0],[x0] //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)//
+ umull v0.8h, v5.8b, v23.8b //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)//
+
+ dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ add x0,x0,#4
+ ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+ umlsl v0.8h, v4.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)//
+
+ dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+ umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
+
+ umull v8.8h, v7.8b, v23.8b
+ dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
+ umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
+ ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+ umlsl v8.8h, v6.8b, v22.8b
+ umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
+
+ dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ umlsl v8.8h, v4.8b, v24.8b
+ ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+ umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
+
+ dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ umlal v8.8h, v5.8b, v25.8b
+ ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+ umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
+
+ dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ umlal v8.8h, v6.8b, v26.8b
+ ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+ umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
+
+ dup v4.2s, v7.2s[1]
+ add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
+
+ umlsl v8.8h, v7.8b, v27.8b
+ ld1 {v4.s}[1],[x3],x2
+ umlal v8.8h, v4.8b, v28.8b
+ dup v5.2s, v4.2s[1]
+ sqrshrun v0.8b, v0.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v5.s}[1],[x3]
+ add x3,x1,x6
+ st1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)//
+
+ umlsl v8.8h, v5.8b, v29.8b
+ st1 {v0.s}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)//
+ sqrshrun v8.8b, v8.8h,#6
+
+ st1 {v8.s}[0],[x3],x6
+ add x1,x1,#4
+ st1 {v8.s}[1],[x3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs x7,x7,#4
+ add x1,x1,x9
+ add x0,x0,x8
+ bgt outer_loop_wd_4
+
+ // ldmfd sp!, {x4-x12, x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
new file mode 100644
index 0000000..64a00b2
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
@@ -0,0 +1,407 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//* ihevc_inter_pred_filters_luma_vert_w16inp.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//* yogeswaran rs
+//*
+//* //par list of functions:
+//*
+//* - ihevc_inter_pred_filters_luma_vert_w16inp()
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+//
+
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* luma vertical filter for 16bit input.
+//*
+//* //par description:
+//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//* the elements pointed to by 'pi2_src' and writes to the location pointed
+//* to by 'pu1_dst'. the input is 16 bits; the filter output is downshifted
+//* by 12 and clipped to lie between 0 and 255. assumptions: the function is
+//* optimized assuming the width is a multiple of 4 and the height is a
+//* multiple of 2.
+//*
+//* //param[in] pi2_src
+//* word16 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
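+// below is a hedged c reference sketch of what this routine computes
+// (an illustration, not the code from ihevc_inter_pred_filters.c).
+// int16_t/uint8_t/int8_t stand in for the codec's WORD16/UWORD8/WORD8
+// typedefs, and the intermediate 16-bit saturation of sqshrn is omitted
+// for brevity.
+//
+// #include <stdint.h>
+//
+// static uint8_t clip_u8(int32_t x)
+// {
+//     return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
+// }
+//
+// void luma_vert_w16inp_ref(const int16_t *pi2_src, uint8_t *pu1_dst,
+//                           int32_t src_strd, int32_t dst_strd,
+//                           const int8_t *pi1_coeff, int32_t ht, int32_t wd)
+// {
+//     pi2_src -= 3 * src_strd;                       /* back up three rows */
+//     for (int32_t y = 0; y < ht; y++)
+//     {
+//         for (int32_t x = 0; x < wd; x++)
+//         {
+//             int32_t acc = 0;
+//             for (int32_t k = 0; k < 8; k++)        /* 8-tap vertical mac */
+//                 acc += pi1_coeff[k] * pi2_src[x + k * src_strd];
+//             acc >>= 6;                             /* sqshrn   #6 */
+//             pu1_dst[x] = clip_u8((acc + 32) >> 6); /* sqrshrun #6 */
+//         }
+//         pi2_src += src_strd;
+//         pu1_dst += dst_strd;
+//     }
+// }
+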
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_w16inp_av8
+
+.type ihevc_inter_pred_luma_vert_w16inp_av8, %function
+
+ihevc_inter_pred_luma_vert_w16inp_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x12,x15 //load pi1_coeff
+ mov x6,x3
+ mov x5,x17 //load wd
+ ld1 {v0.8b},[x12] //coeff = vld1_s8(pi1_coeff)
+ lsl x2, x2, #1
+ sub x12,x2,x2,lsl #2 //x12 = -3*src_strd
+ //abs v0.8b, v0.8b //vabs_s8(coeff)
+ add x0,x0,x12 //pu1_src -= 3*src_strd
+ mov x3,x16 //load ht
+ subs x7,x3,#0 //x3->ht
+ //ble end_loops //end loop jump
+ sxtl v0.8h, v0.8b
+ dup v22.4h, v0.4h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ dup v23.4h, v0.4h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.4h, v0.4h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.4h, v0.4h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.4h, v0.4h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.4h, v0.4h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.4h, v0.4h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.4h, v0.4h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+
+ sub x20,x5,x6,lsl #2 //x6->dst_strd x5 ->wd
+ neg x9, x20
+ sub x20,x5,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ sub x8,x8,x5
+ lsr x3, x5, #2 //divide by 4
+ mul x7, x7, x3 //multiply height by width
+ sub x7, x7,#4 //reserve one iteration (4 rows) for the epilog
+ mov x4,x5 //x5 ->wd
+ //lsl x2, x2, #1
+
+prolog:
+
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ ld1 {v1.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ subs x4,x4,#4
+ ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+ ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+
+ smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ add x20,x0,x8,lsl #0
+ csel x0, x20, x0,le
+ smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ csel x4, x5, x4,le //x5 ->wd
+ smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ sqshrn v8.4h, v8.4s,#6
+
+ ld1 {v1.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smull v12.4s, v3.4h, v23.4h
+ ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ smlal v12.4s, v2.4h, v22.4h
+ ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smlal v12.4s, v4.4h, v24.4h
+ smlal v12.4s, v5.4h, v25.4h
+ smlal v12.4s, v6.4h, v26.4h
+ smlal v12.4s, v7.4h, v27.4h
+ smlal v12.4s, v16.4h, v28.4h
+ smlal v12.4s, v17.4h, v29.4h
+ add x14,x1,x6
+ sqshrn v10.4h, v10.4s,#6
+ sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ smull v14.4s, v4.4h, v23.4h
+ smlal v14.4s, v3.4h, v22.4h
+ smlal v14.4s, v5.4h, v24.4h
+ smlal v14.4s, v6.4h, v25.4h
+ ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v7.4h, v26.4h
+ ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v16.4h, v27.4h
+ ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v17.4h, v28.4h
+ ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v18.4h, v29.4h
+ ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+ st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
+ sqshrn v12.4h, v12.4s,#6
+ sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ add x20,x1,x9
+ csel x1, x20, x1,le
+
+ subs x7,x7,#4
+
+ blt epilog_end //jumps to epilog_end
+ beq epilog //jumps to epilog
+
+kernel_8:
+
+ smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ subs x4,x4,#4
+ smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ add x20,x0,x8,lsl #0
+ csel x0, x20, x0,le
+ smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+
+ sqshrn v14.4h, v14.4s,#6
+ sqrshrun v12.8b, v12.8h,#6
+ ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+
+ smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v12.s}[0],[x14],x6
+
+ smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+
+ smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+ sqshrn v8.4h, v8.4s,#6
+ sqrshrun v14.8b, v14.8h,#6
+
+ smull v12.4s, v3.4h, v23.4h
+ csel x4, x5, x4,le //x5 ->wd
+
+ smlal v12.4s, v2.4h, v22.4h
+ ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+
+ smlal v12.4s, v4.4h, v24.4h
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+
+ smlal v12.4s, v5.4h, v25.4h
+
+ smlal v12.4s, v6.4h, v26.4h
+ st1 {v14.s}[0],[x14],x6
+
+ smlal v12.4s, v7.4h, v27.4h
+ ld1 {v1.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+
+ smlal v12.4s, v16.4h, v28.4h
+ add x14,x1,x6
+
+ smlal v12.4s, v17.4h, v29.4h
+ ld1 {v0.4h},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+
+ sqshrn v10.4h, v10.4s,#6
+ sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ ld1 {v2.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+
+ smull v14.4s, v4.4h, v23.4h
+ smlal v14.4s, v3.4h, v22.4h
+ smlal v14.4s, v5.4h, v24.4h
+ ld1 {v3.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+ smlal v14.4s, v6.4h, v25.4h
+ ld1 {v4.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v7.4h, v26.4h
+ ld1 {v5.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v16.4h, v27.4h
+ ld1 {v6.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v17.4h, v28.4h
+ ld1 {v7.4h},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ smlal v14.4s, v18.4h, v29.4h
+ st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
+
+ sqshrn v12.4h, v12.4s,#6
+ add x20,x1,x9
+ csel x1, x20, x1,le
+
+ sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ subs x7,x7,#4
+
+ bgt kernel_8 //jumps to kernel_8
+
+epilog:
+
+ smull v8.4s, v1.4h, v23.4h //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ smlal v8.4s, v0.4h, v22.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v8.4s, v2.4h, v24.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v8.4s, v3.4h, v25.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v8.4s, v4.4h, v26.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v8.4s, v5.4h, v27.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v8.4s, v6.4h, v28.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v8.4s, v7.4h, v29.4h //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v10.s}[0],[x14],x6
+
+ sqshrn v14.4h, v14.4s,#6
+ sqrshrun v12.8b, v12.8h,#6
+
+ ld1 {v16.4h},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ smull v10.4s, v2.4h, v23.4h //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ smlal v10.4s, v1.4h, v22.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v10.4s, v3.4h, v24.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v10.4s, v4.4h, v25.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v10.4s, v5.4h, v26.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v10.4s, v6.4h, v27.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v10.4s, v7.4h, v28.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v10.4s, v16.4h, v29.4h //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v12.s}[0],[x14],x6
+
+ sqshrn v8.4h, v8.4s,#6
+ sqrshrun v14.8b, v14.8h,#6
+
+ ld1 {v17.4h},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ smull v12.4s, v3.4h, v23.4h
+ smlal v12.4s, v2.4h, v22.4h
+ smlal v12.4s, v4.4h, v24.4h
+ smlal v12.4s, v5.4h, v25.4h
+ smlal v12.4s, v6.4h, v26.4h
+ smlal v12.4s, v7.4h, v27.4h
+ smlal v12.4s, v16.4h, v28.4h
+ smlal v12.4s, v17.4h, v29.4h
+ st1 {v14.s}[0],[x14],x6
+ sqshrn v10.4h, v10.4s,#6
+ sqrshrun v8.8b, v8.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v18.4h},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ smull v14.4s, v4.4h, v23.4h
+ smlal v14.4s, v3.4h, v22.4h
+ smlal v14.4s, v5.4h, v24.4h
+ smlal v14.4s, v6.4h, v25.4h
+ smlal v14.4s, v7.4h, v26.4h
+ smlal v14.4s, v16.4h, v27.4h
+ smlal v14.4s, v17.4h, v28.4h
+ smlal v14.4s, v18.4h, v29.4h
+ sqshrn v12.4h, v12.4s,#6
+ sqrshrun v10.8b, v10.8h,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ add x14,x1,x6
+ st1 {v8.s}[0],[x1],#4 //vst1_u8(pu1_dst,sto_res)//
+
+epilog_end:
+ st1 {v10.s}[0],[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ sqrshrun v12.8b, v12.8h,#6
+
+ st1 {v12.s}[0],[x14],x6
+ sqshrn v14.4h, v14.4s,#6
+ sqrshrun v14.8b, v14.8h,#6
+
+ st1 {v14.s}[0],[x14],x6
+
+
+end_loops:
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
diff --git a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
new file mode 100644
index 0000000..da316ae
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
@@ -0,0 +1,483 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* //file
+//* ihevc_inter_pred_filters_luma_vert.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using rvct
+//*
+//* //author
+//* parthiban v
+//*
+//* //par list of functions:
+//*
+//* - ihevc_inter_pred_luma_vert()
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+
+
+//void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
+//**************variables vs registers*****************************************
+// x0 => *pu1_src
+// x1 => *pi2_dst
+// x2 => src_strd
+// x6 => dst_strd
+// x12 => *pi1_coeff
+// x3 => ht
+// x5 => wd
+
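+// a hedged c sketch of this variant (illustrative only; the function and
+// type names are stand-ins for the codec's typedefs): the same 8-tap
+// vertical mac, but on 8-bit input with unclipped 16-bit output. for
+// valid hevc coefficients the sum stays within 16 bits, which is why the
+// assembly below accumulates with umull/umlal/umlsl and stores without
+// any downshift.
+//
+// #include <stdint.h>
+//
+// void luma_vert_w16out_ref(const uint8_t *pu1_src, int16_t *pi2_dst,
+//                           int32_t src_strd, int32_t dst_strd,
+//                           const int8_t *pi1_coeff, int32_t ht, int32_t wd)
+// {
+//     pu1_src -= 3 * src_strd;                   /* back up three rows */
+//     for (int32_t y = 0; y < ht; y++)
+//     {
+//         for (int32_t x = 0; x < wd; x++)
+//         {
+//             int32_t acc = 0;
+//             for (int32_t k = 0; k < 8; k++)    /* 8-tap vertical mac */
+//                 acc += pi1_coeff[k] * pu1_src[x + k * src_strd];
+//             pi2_dst[x] = (int16_t)acc;         /* no shift, no clip */
+//         }
+//         pu1_src += src_strd;
+//         pi2_dst += dst_strd;
+//     }
+// }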
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_w16out_av8
+
+.type ihevc_inter_pred_luma_vert_w16out_av8, %function
+
+ihevc_inter_pred_luma_vert_w16out_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x12,x15 //load pi1_coeff
+ mov x6,x3
+ mov x5,x17 //load wd
+ ld1 {v0.8b},[x12] //coeff = vld1_s8(pi1_coeff)
+ sub x12,x2,x2,lsl #2 //x12 = -3*src_strd
+ abs v0.8b, v0.8b //vabs_s8(coeff)
+ add x0,x0,x12 //pu1_src -= 3*src_strd
+ mov x3,x16 //load ht
+ subs x7,x3,#0 //x3->ht
+ //ble end_loops_16out //end loop jump
+ dup v22.8b, v0.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ cmp x5,#8
+ dup v23.8b, v0.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.8b, v0.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.8b, v0.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.8b, v0.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.8b, v0.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.8b, v0.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.8b, v0.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+ blt core_loop_wd_4_16out //core loop wd 4 jump
+ stp x0,x1, [sp, #-16]!
+
+ bic x4,x5,#7 //x5 ->wd
+ sub x20,x4,x6,lsl #2 //x6->dst_strd x5 ->wd
+ neg x9, x20
+ sub x20,x4,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ lsl x6, x6, #1
+ lsr x3, x5, #3 //divide by 8
+ mul x7, x7, x3 //multiply height by width
+ sub x7, x7,#4 //reserve one iteration (4 rows) for the epilog
+
+prolog_16out:
+
+ and x10, x0, #31
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+
+ ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ subs x4,x4,#8
+ ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+
+ add x20,x0,x8
+ csel x0, x20, x0,le
+ umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+ bic x20,x5,#7 //x5 ->wd
+ csel x4, x20, x4,le
+ umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+ ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+ prfm PLDL1KEEP,[x3]
+ umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ add x20,x3, x2
+ prfm PLDL1KEEP,[x20]
+ umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ add x20,x3, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ add x3, x3, x2
+ umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ add x20,x3, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ umull v12.8h, v3.8b, v23.8b
+ ld1 {v1.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v12.8h, v2.8b, v22.8b
+ ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v12.8h, v4.8b, v24.8b
+ ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlal v12.8h, v5.8b, v25.8b
+ umlal v12.8h, v6.8b, v26.8b
+ umlsl v12.8h, v7.8b, v27.8b
+ umlal v12.8h, v16.8b, v28.8b
+ umlsl v12.8h, v17.8b, v29.8b
+ add x14,x1,x6
+ st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
+ //vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ add x20,x1,x9,lsl #1
+ csel x1, x20, x1,le
+
+ umull v14.8h, v4.8b, v23.8b
+ subs x7,x7,#4
+ umlsl v14.8h, v3.8b, v22.8b
+ umlsl v14.8h, v5.8b, v24.8b
+ umlal v14.8h, v6.8b, v25.8b
+ ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v7.8b, v26.8b
+ ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v16.8b, v27.8b
+ ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v17.8b, v28.8b
+ ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v18.8b, v29.8b
+ ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+
+ st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ //vqrshrun.s16 d12,q6,#6
+
+
+ blt epilog_end_16out
+ beq epilog_16out //jumps to epilog
+
+kernel_8_16out:
+
+ subs x4,x4,#8
+ umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+
+ add x20,x0,x8
+ csel x0, x20, x0,le
+ umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+
+ ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+
+ ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+
+ bic x20,x5,#7 //x5 ->wd
+ csel x4, x20, x4,le
+ umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+
+ ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+
+ st1 {v12.16b},[x14],x6
+ umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+
+// and x11, x0, #31
+ umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+
+ st1 {v14.16b},[x14],x6
+ umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+
+ add x14,x1,x6
+ umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+
+ ld1 {v0.8b},[x0],#8 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+
+ ld1 {v1.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+
+ st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
+ umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+
+ add x20,x1,x9,lsl #1
+ csel x1, x20, x1,le
+ umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+
+// cmp x11, x10
+ umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+ add x10, x3, x2, lsl #3 // 10*strd - 8+2
+ umull v12.8h, v3.8b, v23.8b
+
+ add x10, x10, x2 // 11*strd
+ umlsl v12.8h, v2.8b, v22.8b
+
+ prfm PLDL1KEEP,[x10] //11+ 0
+ umlsl v12.8h, v4.8b, v24.8b
+
+ add x20,x10, x2
+ prfm PLDL1KEEP,[x20] //11+ 1*strd
+ umlal v12.8h, v5.8b, v25.8b
+
+ add x20,x10, x2, lsl #1
+ prfm PLDL1KEEP,[x20] //11+ 2*strd
+ umlal v12.8h, v6.8b, v26.8b
+
+ add x10, x10, x2 //12*strd
+ umlsl v12.8h, v7.8b, v27.8b
+
+ add x20,x10, x2, lsl #1
+ prfm PLDL1KEEP,[x20] //11+ 3*strd
+ umlal v12.8h, v16.8b, v28.8b
+
+// mov x10, x11
+ umlsl v12.8h, v17.8b, v29.8b
+
+ ld1 {v2.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v14.8h, v4.8b, v23.8b
+
+ subs x7,x7,#4
+ umlsl v14.8h, v3.8b, v22.8b
+
+ st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ umlsl v14.8h, v5.8b, v24.8b
+
+ ld1 {v3.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v6.8b, v25.8b
+
+ ld1 {v4.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v7.8b, v26.8b
+
+ ld1 {v5.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v16.8b, v27.8b
+
+ ld1 {v6.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umlal v14.8h, v17.8b, v28.8b
+
+ ld1 {v7.8b},[x3],x2 //src_tmp4 = vld1_u8(pu1_src_tmp)//
+ umlsl v14.8h, v18.8b, v29.8b
+
+
+ bgt kernel_8_16out //jumps to kernel_8
+
+epilog_16out:
+
+ umull v8.8h, v1.8b, v23.8b //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
+ umlsl v8.8h, v0.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)//
+ umlsl v8.8h, v2.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)//
+ umlal v8.8h, v3.8b, v25.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ umlal v8.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ umlsl v8.8h, v5.8b, v27.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)//
+ umlal v8.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ umlsl v8.8h, v7.8b, v29.8b //mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v12.16b},[x14],x6
+
+ //vqrshrun.s16 d14,q7,#6
+
+ ld1 {v16.8b},[x3],x2 //src_tmp1 = vld1_u8(pu1_src_tmp)//
+ umull v10.8h, v2.8b, v23.8b //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
+ umlsl v10.8h, v1.8b, v22.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)//
+ umlsl v10.8h, v3.8b, v24.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)//
+ umlal v10.8h, v4.8b, v25.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ umlal v10.8h, v5.8b, v26.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ umlsl v10.8h, v6.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)//
+ umlal v10.8h, v7.8b, v28.8b //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ umlsl v10.8h, v16.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v14.16b},[x14],x6
+
+ //vqrshrun.s16 d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v17.8b},[x3],x2 //src_tmp2 = vld1_u8(pu1_src_tmp)//
+ umull v12.8h, v3.8b, v23.8b
+ umlsl v12.8h, v2.8b, v22.8b
+ umlsl v12.8h, v4.8b, v24.8b
+ umlal v12.8h, v5.8b, v25.8b
+ umlal v12.8h, v6.8b, v26.8b
+ umlsl v12.8h, v7.8b, v27.8b
+ umlal v12.8h, v16.8b, v28.8b
+ umlsl v12.8h, v17.8b, v29.8b
+ add x14,x1,x6
+ st1 {v8.16b},[x1],#16 //vst1_u8(pu1_dst,sto_res)//
+ //vqrshrun.s16 d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v18.8b},[x3],x2 //src_tmp3 = vld1_u8(pu1_src_tmp)//
+ umull v14.8h, v4.8b, v23.8b
+ umlsl v14.8h, v3.8b, v22.8b
+ umlsl v14.8h, v5.8b, v24.8b
+ umlal v14.8h, v6.8b, v25.8b
+ umlal v14.8h, v7.8b, v26.8b
+ umlsl v14.8h, v16.8b, v27.8b
+ umlal v14.8h, v17.8b, v28.8b
+ umlsl v14.8h, v18.8b, v29.8b
+
+ st1 {v10.16b},[x14],x6 //vst1_u8(pu1_dst_tmp,sto_res)//
+ //vqrshrun.s16 d12,q6,#6
+
+epilog_end_16out:
+ st1 {v12.16b},[x14],x6
+ //vqrshrun.s16 d14,q7,#6
+
+ st1 {v14.16b},[x14],x6
+
+
+end_loops_16out:
+ tst x5,#7
+ ldp x0,x1, [sp], #16
+
+ // ldmeqfd sp!,{x4-x12,x15} //reload the registers from sp
+ bne lbl355
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+lbl355:
+ mov x5, #4
+ add x0, x0, #8
+ add x1, x1, #16
+ mov x7, #16
+ lsr x6, x6, #1
+
+
+core_loop_wd_4_16out:
+ sub x20,x5,x6,lsl #2 //x6->dst_strd x5 ->wd
+ neg x9, x20
+ sub x20,x5,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ movi v4.8b, #0
+ lsl x6, x6, #1
+
+outer_loop_wd_4_16out:
+ subs x12,x5,#0
+ ble end_inner_loop_wd_4_16out //outer loop jump
+
+inner_loop_wd_4_16out:
+ add x3,x0,x2
+ ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+ subs x12,x12,#4
+ dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+ ld1 {v4.s}[0],[x0] //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)//
+ umull v0.8h, v5.8b, v23.8b //mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)//
+
+ dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ add x0,x0,#4
+ ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+ umlsl v0.8h, v4.8b, v22.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)//
+
+ dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+ umlsl v0.8h, v6.8b, v24.8b //mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)//
+
+ umull v8.8h, v7.8b, v23.8b
+ dup v4.2s, v7.2s[1] //src_tmp1 = vdup_lane_u32(src_tmp4, 1)//
+ umull v2.8h, v7.8b, v25.8b //mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)//
+ ld1 {v4.s}[1],[x3],x2 //src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)//
+ umlsl v8.8h, v6.8b, v22.8b
+ umlal v0.8h, v4.8b, v26.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)//
+
+ dup v5.2s, v4.2s[1] //src_tmp2 = vdup_lane_u32(src_tmp1, 1)//
+ umlsl v8.8h, v4.8b, v24.8b
+ ld1 {v5.s}[1],[x3],x2 //src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)//
+ umlsl v2.8h, v5.8b, v27.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)//
+
+ dup v6.2s, v5.2s[1] //src_tmp3 = vdup_lane_u32(src_tmp2, 1)//
+ umlal v8.8h, v5.8b, v25.8b
+ ld1 {v6.s}[1],[x3],x2 //src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)//
+ umlal v0.8h, v6.8b, v28.8b //mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)//
+
+ dup v7.2s, v6.2s[1] //src_tmp4 = vdup_lane_u32(src_tmp3, 1)//
+ umlal v8.8h, v6.8b, v26.8b
+ ld1 {v7.s}[1],[x3],x2 //src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)//
+ umlsl v2.8h, v7.8b, v29.8b //mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)//
+
+ dup v4.2s, v7.2s[1]
+ add v0.8h, v0.8h , v2.8h //mul_res1 = vaddq_u16(mul_res1, mul_res2)//
+
+ umlsl v8.8h, v7.8b, v27.8b
+ ld1 {v4.s}[1],[x3],x2
+ umlal v8.8h, v4.8b, v28.8b
+ dup v5.2s, v4.2s[1]
+ //vqrshrun.s16 d0,q0,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v5.s}[1],[x3]
+ add x3,x1,x6
+ st1 {v0.d}[0],[x1],#8 //vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)//
+
+ umlsl v8.8h, v5.8b, v29.8b
+ st1 {v0.d}[1],[x3],x6 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)//
+ //vqrshrun.s16 d8,q4,#6
+
+ st1 {v8.d}[0],[x3],x6
+ //add x1,x1,#4
+ st1 {v8.d}[1],[x3]
+ bgt inner_loop_wd_4_16out
+
+end_inner_loop_wd_4_16out:
+ subs x7,x7,#4
+ add x1,x1,x9,lsl #1
+ add x0,x0,x8
+ bgt outer_loop_wd_4_16out
+
+ // ldmfd sp!, {x4-x12, x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
diff --git a/common/arm64/ihevc_inter_pred_luma_copy.s b/common/arm64/ihevc_inter_pred_luma_copy.s
new file mode 100644
index 0000000..dccbb2b
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_copy.s
@@ -0,0 +1,199 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* interprediction luma function for copy
+//*
+//* //par description:
+//* copies the array of width 'wd' and height 'ht' from the location pointed
+//* to by 'pu1_src' to the location pointed to by 'pu1_dst'
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_inter_pred_luma_copy (
+// uword8 *pu1_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
+//**************variables vs registers*****************************************
+// x0 => *pu1_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x11 => ht
+// x16 => wd
+
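+// a hedged c sketch of the copy (illustrative stand-in, not the
+// reference code): a plain two-dimensional byte copy. pi1_coeff is
+// unused here; the assembly below only specializes the inner loop for
+// widths that are multiples of 4, 8 or 16.
+//
+// #include <stdint.h>
+// #include <string.h>
+//
+// void luma_copy_ref(const uint8_t *pu1_src, uint8_t *pu1_dst,
+//                    int32_t src_strd, int32_t dst_strd,
+//                    const int8_t *pi1_coeff, int32_t ht, int32_t wd)
+// {
+//     (void)pi1_coeff;                           /* unused for copy */
+//     for (int32_t y = 0; y < ht; y++)
+//     {
+//         memcpy(pu1_dst, pu1_src, (size_t)wd);
+//         pu1_src += src_strd;
+//         pu1_dst += dst_strd;
+//     }
+// }
+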
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_copy_av8
+
+.type ihevc_inter_pred_luma_copy_av8, %function
+
+ihevc_inter_pred_luma_copy_av8:
+ // stmfd sp!, {x8-x16, lr} //stack stores the values of the arguments
+ stp x19,x20,[sp, #-16]!
+ mov x16,x6 //loads wd
+ mov x11,x5 //loads ht
+ cmp x11,#0 //checks ht == 0
+ ble end_loops
+ tst x16,#15 //checks if wd is a multiple of 16
+ beq core_loop_wd_16
+ tst x16,#7 //checks if wd is a multiple of 8
+ beq core_loop_wd_8
+ sub x15,x16,#4
+
+outer_loop_wd_4:
+ subs x8,x16,#0 //checks wd == 0
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ ld1 {v0.s}[0],[x0] //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add x9,x0,x2 //pu1_src_tmp += src_strd
+ add x10,x1,x3 //pu1_dst_tmp += dst_strd
+ st1 {v0.s}[0],[x1] //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ ld1 {v0.s}[0],[x9],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add x0,x0,#4 //pu1_src += 4
+ st1 {v0.s}[0],[x10],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ ld1 {v0.s}[0],[x9],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ subs x8,x8,#4 //(wd -4)
+ st1 {v0.s}[0],[x10],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+ ld1 {v0.s}[0],[x9],x2 //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
+ add x1,x1,#4 //pu1_dst += 4
+ st1 {v0.s}[0],[x10],x3 //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
+
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs x11,x11,#4 //ht - 4
+ sub x0,x9,x15 //pu1_src = pu1_src_tmp
+ sub x1,x10,x15 //pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_4
+
+end_loops:
+ // ldmfd sp!,{x8-x16,pc} //reload the registers from sp
+ ldp x19,x20,[sp],#16
+ ret
+
+
+core_loop_wd_8:
+ sub x15,x16,#8
+
+outer_loop_wd_8:
+ subs x8,x16,#0 //checks wd
+ ble end_inner_loop_wd_8
+
+inner_loop_wd_8:
+ add x9,x0,x2 //pu1_src_tmp += src_strd
+ ld1 {v0.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ add x10,x1,x3 //pu1_dst_tmp += dst_strd
+ st1 {v0.8b},[x1],#8 //vst1_u8(pu1_dst_tmp, tmp_src)
+ ld1 {v1.8b},[x9],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v1.8b},[x10],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ subs x8,x8,#8 //wd - 8(loop condition)
+ ld1 {v2.8b},[x9],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v2.8b},[x10],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ ld1 {v3.8b},[x9],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v3.8b},[x10],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_8
+
+end_inner_loop_wd_8:
+ subs x11,x11,#4 //ht -= 4
+ sub x0,x9,x15 //pu1_src = pu1_src_tmp
+ sub x1,x10,x15 //pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_8
+
+ // ldmfd sp!,{x8-x16,pc} //reload the registers from sp
+ ldp x19,x20,[sp],#16
+ ret
+
+core_loop_wd_16:
+ sub x15,x16,#16
+
+outer_loop_wd_16:
+ subs x8,x16,#0 //checks wd
+ ble end_inner_loop_wd_16
+
+inner_loop_wd_16:
+ add x9,x0,x2 //pu1_src_tmp += src_strd
+ ld1 {v0.16b},[x0],#16 //vld1_u8(pu1_src_tmp)
+ add x10,x1,x3 //pu1_dst_tmp += dst_strd
+ st1 {v0.16b},[x1],#16 //vst1_u8(pu1_dst_tmp, tmp_src)
+ ld1 {v1.16b},[x9],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v1.16b},[x10],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ subs x8,x8,#16 //wd - 16 (loop condition)
+ ld1 {v2.16b},[x9],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v2.16b},[x10],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ ld1 {v3.16b},[x9],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v3.16b},[x10],x3 //vst1_u8(pu1_dst_tmp, tmp_src)
+ bgt inner_loop_wd_16
+
+end_inner_loop_wd_16:
+ subs x11,x11,#4 //ht -= 4
+ sub x0,x9,x15 //pu1_src = pu1_src_tmp
+ sub x1,x10,x15 //pu1_dst = pu1_dst_tmp
+ bgt outer_loop_wd_16
+
+ // ldmfd sp!,{x8-x16,pc} //reload the registers from sp
+ ldp x19,x20,[sp],#16
+ ret
+
diff --git a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
new file mode 100644
index 0000000..86ffdba
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
@@ -0,0 +1,272 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* interprediction luma function for copy
+//*
+//* //par description:
+//* copies the array of width 'wd' and height 'ht' from the location pointed
+//* to by 'pu1_src' to the location pointed to by 'pi2_dst', upshifting each
+//* sample by 6 bits into the 16-bit destination
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_copy_w16out (
+// uword8 *pu1_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
+//**************variables vs registers*****************************************
+// x0 => *pu1_src
+// x1 => *pi2_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x7 => ht
+// x12 => wd
+
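+// a hedged c sketch (illustrative only): this copy variant widens each
+// sample to 16 bits and upshifts it by 6, matching the uxtl + shl #6
+// pairs in the assembly below.
+//
+// #include <stdint.h>
+//
+// void luma_copy_w16out_ref(const uint8_t *pu1_src, int16_t *pi2_dst,
+//                           int32_t src_strd, int32_t dst_strd,
+//                           const int8_t *pi1_coeff, int32_t ht, int32_t wd)
+// {
+//     (void)pi1_coeff;                           /* unused for copy */
+//     for (int32_t y = 0; y < ht; y++)
+//     {
+//         for (int32_t x = 0; x < wd; x++)
+//             pi2_dst[x] = (int16_t)(pu1_src[x] << 6);
+//         pu1_src += src_strd;
+//         pi2_dst += dst_strd;
+//     }
+// }
+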
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_copy_w16out_av8
+
+.type ihevc_inter_pred_luma_copy_w16out_av8, %function
+
+ihevc_inter_pred_luma_copy_w16out_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+ mov x12,x17 //loads wd
+ mov x7,x16 //loads ht
+ cmp x7,#0 //ht condition(ht == 0)
+ ble end_loops //loop
+ tst x12,#7 //conditional check for wd (multiples)
+ beq core_loop_wd_8
+ sub x11,x12,#4
+ lsl x6, x3,#1
+ adds x6, x6,#0
+
+outer_loop_wd_4:
+ subs x4,x12,#0 //wd conditional subtract
+ ble end_inner_loop_wd_4
+
+inner_loop_wd_4:
+ ld1 {v0.8b},[x0] //vld1_u8(pu1_src_tmp)
+ add x5,x0,x2 //pu1_src +src_strd
+ uxtl v0.8h, v0.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ add x10,x1,x6
+ subs x4,x4,#4 //wd - 4
+ shl v0.2d, v0.2d,#6 //vshlq_n_s64(temp, 6)
+ ld1 {v22.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ add x0,x0,#4 //pu1_src += 4
+ st1 {v0.d}[0],[x1] //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ add x1,x1,#8
+ uxtl v22.8h, v22.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ ld1 {v24.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ shl v22.2d, v22.2d,#6 //vshlq_n_s64(temp, 6)
+ uxtl v24.8h, v24.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ st1 {v22.d}[0],[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ shl v24.2d, v24.2d,#6 //vshlq_n_s64(temp, 6)
+ ld1 {v26.8b},[x5],x2 //vld1_u8(pu1_src_tmp)
+ st1 {v24.d}[0],[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ uxtl v26.8h, v26.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ shl v26.2d, v26.2d,#6 //vshlq_n_s64(temp, 6)
+ st1 {v26.d}[0],[x10],x6 //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4:
+ subs x7,x7,#4 //ht - 4
+ sub x0,x5,x11
+ sub x1,x10,x11,lsl #1
+ bgt outer_loop_wd_4
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+
+ ret
+
+
+core_loop_wd_8:
+ //sub x11,x12,#8
+ lsl x5, x3,#1
+ adds x5, x5,#0
+ sub x20,x12,x3, lsl #2 // x11 = (dst_strd * 4) - width
+ neg x11, x20
+ sub x20,x12,x2,lsl #2 //x2->src_strd
+ neg x8, x20
+ lsr x4, x12, #3 // divide by 8
+ mul x7, x7, x4
+ sub x4,x12,#0 //wd conditional check
+ sub x7,x7,#4 //reserve one iteration (4 rows) for the epilog
+
+prolog:
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+ add x10,x1,x5
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ subs x4,x4,#8 //wd decrements by 8
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
+ add x20,x0,x8
+ csel x0, x20, x0,le
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+ add x20,x1,x11,lsl #1
+ csel x1, x20, x1,le
+ sub x20,x12,#0 //wd conditional check
+ csel x4, x20, x4,le
+
+ subs x7,x7,#4 //ht - 4
+
+ blt epilog_end //jumps to epilog_end
+ beq epilog //jumps to epilog
+
+
+
+outer_loop_wd_8:
+
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ subs x4,x4,#8 //wd decrements by 8
+ add x20,x0,x8
+ csel x0, x20, x0,le
+
+ add x6,x0,x2 //pu1_src_tmp += src_strd
+
+ ld1 {v8.8b},[x0],#8 //vld1_u8(pu1_src_tmp)
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ ld1 {v10.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ ld1 {v12.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ ld1 {v14.8b},[x6],x2 //vld1_u8(pu1_src_tmp)
+ add x10,x1,x5
+
+ shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+
+ add x20,x1,x11,lsl #1
+ csel x1, x20, x1,le
+ sub x20,x12,#0 //wd conditional check
+ csel x4, x20, x4,le
+
+ subs x7,x7,#4 //ht - 4
+ bgt outer_loop_wd_8
+
+epilog:
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v16.8h, v8.8b //vmovl_u8(vld1_u8(pu1_src_tmp))
+
+ st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v18.8h, v10.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ uxtl v20.8h, v12.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+
+ uxtl v22.8h, v14.8b //vmovl_u8(vld1_u8(pu1_src_tmp)
+ //add x6,x0,x2 //pu1_src_tmp += src_strd
+
+ shl v0.8h, v16.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v2.8h, v18.8h,#6 //vshlq_n_s16(tmp, 6)
+ shl v4.8h, v20.8h,#6 //vshlq_n_s16(tmp, 6)
+ add x10,x1,x5
+ shl v6.8h, v22.8h,#6 //vshlq_n_s16(tmp, 6)
+
+ st1 {v0.8h},[x1],#16 //vst1q_s16(pi2_dst_tmp, tmp)
+epilog_end:
+ st1 {v2.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ st1 {v4.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+ st1 {v6.8h},[x10],x5 //vst1q_s16(pi2_dst_tmp, tmp)
+
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
diff --git a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
new file mode 100644
index 0000000..f7b6644
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
@@ -0,0 +1,678 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+
+///**
+//******************************************************************************
+//* //file
+//* ihevc_inter_pred_luma_horz_w16out.s
+//*
+//* //brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+
+//* rvct
+//*
+//* //author
+//* parthiban v
+//*
+//* //par list of functions:
+//*
+//* - ihevc_inter_pred_luma_horz_w16out()
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* //brief
+//* interprediction luma filter for horizontal 16bit output
+//*
+//* //par description:
+//* applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+//* to the elements pointed to by 'pu1_src' and writes to the location
+//* pointed to by 'pi2_dst'. no downshifting or clipping is done, and the
+//* output is used as an input for vertical filtering or weighted
+//* prediction. assumptions: the function is optimized assuming the width
+//* is a multiple of 4 or 8. if the width is a multiple of 4, the height
+//* should be a multiple of 2; width 8 is optimized further.
+//*
+//* //param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* //param[out] pi2_dst
+//* word16 pointer to the destination
+//*
+//* //param[in] src_strd
+//* integer source stride
+//*
+//* //param[in] dst_strd
+//* integer destination stride
+//*
+//* //param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* //param[in] ht
+//* integer height of the array
+//*
+//* //param[in] wd
+//* integer width of the array
+//*
+//* //returns
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
+// word16 *pi2_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word8 *pi1_coeff,
+// word32 ht,
+// word32 wd )
+
+
+//x0 - free
+//x1 - dst_ptr
+//x2 - src_strd
+//x3 - dst_strd
+//x8 - src_ptr2
+//x9 - inner loop counter
+//x10 - dst_ptr2
+//x11 - free
+//x12 - dst_strd2
+//x13 - src_strd1
+//x14 - wd
+//x15 - #1
+//x16 - src_ptr1
+//x19 - loop_counter
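+
+// a hedged c sketch of the horizontal pass (an illustration, not the
+// reference code): an 8-tap mac across columns with unclipped 16-bit
+// output that later feeds vertical filtering or weighted prediction.
+//
+// #include <stdint.h>
+//
+// void luma_horz_w16out_ref(const uint8_t *pu1_src, int16_t *pi2_dst,
+//                           int32_t src_strd, int32_t dst_strd,
+//                           const int8_t *pi1_coeff, int32_t ht, int32_t wd)
+// {
+//     for (int32_t y = 0; y < ht; y++)
+//     {
+//         for (int32_t x = 0; x < wd; x++)
+//         {
+//             int32_t acc = 0;
+//             for (int32_t k = 0; k < 8; k++)    /* taps at x-3 .. x+4 */
+//                 acc += pi1_coeff[k] * pu1_src[x + k - 3];
+//             pi2_dst[x] = (int16_t)acc;         /* no shift, no clip */
+//         }
+//         pu1_src += src_strd;
+//         pi2_dst += dst_strd;
+//     }
+// }
+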
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_horz_w16out_av8
+
+.type ihevc_inter_pred_luma_horz_w16out_av8, %function
+
+ihevc_inter_pred_luma_horz_w16out_av8:
+
+ // stmfd sp!, {x8-x16, x19} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ mov x20,#1
+ bic x19, x19, x20 // clearing bit[0], so that it goes back to mode
+ mov x8,x4 //loads pi1_coeff
+ mov x11,x5 //loads ht
+
+
+ ld1 {v0.8b},[x8] //coeff = vld1_s8(pi1_coeff)
+ sub x19,x11,#0 //x19 = ht
+ abs v2.8b, v0.8b //vabs_s8(coeff)
+ mov x15,#1
+ //ble end_loops
+ mov x14,x6 //loads wd
+ dup v24.8b, v2.8b[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
+ sub x16,x0,#3 //pu1_src - 3
+ dup v25.8b, v2.8b[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
+ add x8,x16,x2 //pu1_src_tmp2_8 = pu1_src + src_strd
+ dup v26.8b, v2.8b[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
+ sub x20,x14,x2,lsl #1 //2*src_strd - wd
+ neg x13, x20
+ dup v27.8b, v2.8b[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
+ sub x20,x14,x3 //dst_strd - wd
+ neg x12, x20
+ dup v28.8b, v2.8b[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
+
+ dup v29.8b, v2.8b[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
+ and x11,x19,#1 //calculating ht_residue ht_residue = (ht & 1)
+ dup v30.8b, v2.8b[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
+ sub x19,x19,x11 //decrement height by ht_residue(residue value is calculated outside)
+ dup v31.8b, v2.8b[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)
+
+ cmp x11,#1
+ beq odd_height_decision
+
+even_height_decision:
+ mov x11,x1
+ cmp x14,#4
+ ble outer_loop_4
+
+ cmp x14,#24
+ mov x20,#16
+ csel x14, x20, x14,eq
+ add x20, x12,#8
+ csel x12, x20, x12,eq
+ add x20, x13,#8
+ csel x13, x20, x13,eq
+
+ cmp x14,#16
+ bge outer_loop_16_branch
+
+ cmp x14,#12
+ add x20, x12,#4
+ csel x12, x20, x12,eq
+ add x20, x13,#4
+ csel x13, x20, x13,eq
+outer_loop_8_branch:
+ b outer_loop_8
+
+outer_loop_16_branch:
+ b outer_loop_16
+
+
+odd_height_decision:
+ cmp x14,#24
+ beq outer_loop_8_branch
+ cmp x14,#12
+ beq outer_loop_4
+ b even_height_decision
+
+outer_loop4_residual:
+ sub x16,x0,#3 //pu1_src - 3
+ mov x1,x11
+ add x1, x1,#16
+ mov x14,#4
+ add x16, x16,#8
+ mov x19,#16
+ add x12, x12,#4
+ add x13, x13,#4
+
+outer_loop_4:
+ add x10,x1,x3,lsl #1 //pu1_dst + dst_strd
+ add x8,x16,x2 //pu1_src + src_strd
+
+ subs x9,x14,#0 //checks wd
+ ble end_inner_loop_4
+
+inner_loop_4:
+ mov x15,#1
+ ld1 {v20.2s},[x16],x15 //vector load pu1_src
+ ld1 {v21.2s},[x16],x15
+ ld1 {v22.2s},[x8],x15 //vector load pu1_src + src_strd
+ ld1 {v23.2s},[x8],x15
+
+ zip1 v0.2s, v20.2s, v22.2s
+    zip2 v12.2s, v20.2s, v22.2s         //vector zip the i iteration and ii iteration in a single register
+ zip1 v1.2s, v21.2s, v23.2s
+ zip2 v13.2s, v21.2s, v23.2s
+
+ ld1 {v20.2s},[x16],x15
+ ld1 {v21.2s},[x16],x15
+ ld1 {v22.2s},[x8],x15
+ ld1 {v23.2s},[x8],x15
+
+ zip1 v2.2s, v20.2s, v22.2s
+ zip2 v14.2s, v20.2s, v22.2s
+ zip1 v3.2s, v21.2s, v23.2s
+ zip2 v15.2s, v21.2s, v23.2s
+
+ ld1 {v20.2s},[x16],x15
+ ld1 {v21.2s},[x16],x15
+ ld1 {v22.2s},[x8],x15
+ ld1 {v23.2s},[x8],x15
+
+ zip1 v4.2s, v20.2s, v22.2s
+ zip2 v16.2s, v20.2s, v22.2s
+ zip1 v5.2s, v21.2s, v23.2s
+ zip2 v17.2s, v21.2s, v23.2s
+
+ ld1 {v20.2s},[x16],x15
+ ld1 {v21.2s},[x16],x15
+ ld1 {v22.2s},[x8],x15
+ ld1 {v23.2s},[x8],x15
+
+ //add x16,x16,#4 //increment the input pointer
+ sub x16,x16,#4
+ //vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ //vext.u8 d3,d0,d1,#3 //vector extract of src[0_3]
+ //vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+
+ //vext.u8 d5,d0,d1,#5 //vector extract of src[0_5]
+ //vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+ //vext.u8 d7,d0,d1,#7 //vector extract of src[0_7]
+ //vext.u8 d1,d0,d1,#1 //vector extract of src[0_1]
+ sub x8,x8,#4
+ // add x8,x8,#4 //increment the input pointer
+ // vext.u8 d14,d12,d13,#2 //vector extract of src[0_2]
+ // vext.u8 d15,d12,d13,#3 //vector extract of src[0_3]
+ // vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ // vext.u8 d17,d12,d13,#5 //vector extract of src[0_5]
+ // vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+ // vext.u8 d19,d12,d13,#7 //vector extract of src[0_7]
+ //vext.u8 d13,d12,d13,#1 //vector extract of src[0_1]
+
+
+
+
+
+
+ zip1 v6.2s, v20.2s, v22.2s
+ zip2 v18.2s, v20.2s, v22.2s
+ zip1 v7.2s, v21.2s, v23.2s
+ zip2 v19.2s, v21.2s, v23.2s
+
+    umull v8.8h, v1.8b, v25.8b          //arithmetic operations for the ii iteration at the same time
+ umlsl v8.8h, v0.8b, v24.8b
+ umlsl v8.8h, v2.8b, v26.8b
+ umlal v8.8h, v3.8b, v27.8b
+ umlal v8.8h, v4.8b, v28.8b
+ umlsl v8.8h, v5.8b, v29.8b
+ umlal v8.8h, v6.8b, v30.8b
+ umlsl v8.8h, v7.8b, v31.8b
+
+ // vqrshrun.s16 d8,q4,#6 //narrow right shift and saturating the result
+    st1 {v8.d}[0],[x1],#8               //store the i iteration result which is in the lower part of the register
+    st1 {v8.d}[1],[x10],#8              //store the ii iteration result which is in the upper part of the register
+ subs x9,x9,#4 //decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4:
+    subs x19,x19,#2                     //decrement the ht by 2
+ add x16,x16,x13 //increment the input pointer 2*src_strd-wd
+ add x1,x10,x12,lsl #1 //increment the output pointer 2*dst_strd-wd
+ bgt outer_loop_4
+
+
+height_residue_4:
+
+ mov x11,x5 //loads ht
+ and x11,x11,#1 //calculating ht_residue ht_residue = (ht & 1)
+ cmp x11,#0
+ //beq end_loops
+ // ldmeqfd sp!,{x8-x16,pc} //reload the registers from sp
+ bne lbl280
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+lbl280:
+
+outer_loop_height_residue_4:
+
+
+ subs x9,x14,#0 //checks wd
+ ble end_inner_loop_height_residue_4
+
+inner_loop_height_residue_4:
+ mov x15, #1
+ ld1 {v0.2s},[x16],x15 //vector load pu1_src
+ ld1 {v1.2s},[x16],x15
+
+
+
+
+
+
+ // vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ // vext.u8 d3,d0,d1,#3 //vector extract of src[0_3]
+ // vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+
+
+
+ //add x16,x16,#4 //increment the input pointer
+ // vext.u8 d5,d0,d1,#5 //vector extract of src[0_5]
+ // vext.u8 d6,d0,d1,#6 //vector extract of src[0_6]
+ // vext.u8 d7,d0,d1,#7 //vector extract of src[0_7]
+ // vext.u8 d1,d0,d1,#1 //vector extract of src[0_1]
+ ld1 {v2.2s},[x16],x15
+    umull v8.8h, v1.8b, v25.8b          //arithmetic operations for the ii iteration at the same time
+ ld1 {v3.2s},[x16],x15
+ umlsl v8.8h, v0.8b, v24.8b
+ ld1 {v4.2s},[x16],x15
+ umlsl v8.8h, v2.8b, v26.8b
+ ld1 {v5.2s},[x16],x15
+ umlal v8.8h, v3.8b, v27.8b
+ ld1 {v6.2s},[x16],x15
+ umlal v8.8h, v4.8b, v28.8b
+ ld1 {v7.2s},[x16],x15
+ umlsl v8.8h, v5.8b, v29.8b
+ sub x16,x16,#4
+ umlal v8.8h, v6.8b, v30.8b
+    umlsl v8.8h, v7.8b, v31.8b          //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ subs x9,x9,#4 //decrement the wd by 4
+ st1 {v8.d}[0],[x1],#8
+ bgt inner_loop_height_residue_4
+
+end_inner_loop_height_residue_4:
+    subs x11,x11,#1                     //decrement the ht by 1
+ sub x20,x14,x2
+ neg x13, x20
+ add x16,x16,x13 //increment the input pointer src_strd-wd
+ add x1,x1,x12 //increment the output pointer dst_strd-wd
+ bgt outer_loop_height_residue_4
+
+ // ldmfd sp!,{x8-x16,pc} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+outer_loop8_residual:
+ sub x16,x0,#3 //pu1_src - 3
+ mov x1,x11
+ mov x19,#32
+ add x1, x1,#32
+ add x16, x16,#16
+ mov x14,#8
+ add x12, x12,#8
+ add x13, x13,#8
+
+outer_loop_8:
+
+ add x10,x1,x3,lsl #1 //pu1_dst + dst_strd
+ add x8,x16,x2 //pu1_src + src_strd
+ subs x9,x14,#0 //checks wd
+
+ ble end_inner_loop_8
+
+inner_loop_8:
+ mov x15, #1
+ ld1 {v0.2s},[x16],x15 //vector load pu1_src
+ ld1 {v1.2s},[x16],x15
+ ld1 {v2.2s},[x16],x15
+ ld1 {v3.2s},[x16],x15
+
+
+
+
+
+ // vext.u8 d2,d0,d1,#2 //vector extract of src[0_2]
+ // vext.u8 d3,d0,d1,#3 //vector extract of src[0_3]
+ // vext.u8 d4,d0,d1,#4 //vector extract of src[0_4]
+ // vext.u8 d5,d0,d1,#5 //vector extract of src[0_5]
+ // vext.u8 d6,d0,d1,#6 //vector extract of src [0_6]
+ // vext.u8 d7,d0,d1,#7 //vector extract of src[0_7]
+ // vext.u8 d1,d0,d1,#1 //vector extract of src[0_1]
+ // vext.u8 d14,d12,d13,#2
+
+ //vext.u8 d15,d12,d13,#3 //vector extract of src[0_3]
+ // vext.u8 d16,d12,d13,#4 //vector extract of src[0_4]
+ // vext.u8 d17,d12,d13,#5 //vector extract of src[0_5]
+ //vext.u8 d18,d12,d13,#6 //vector extract of src[0_6]
+ //vext.u8 d19,d12,d13,#7 //vector extract of src[0_7]
+ //vext.u8 d13,d12,d13,#1 //vector extract of src[0_1]
+ ld1 {v4.2s},[x16],x15
+ umull v8.8h, v1.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 {v5.2s},[x16],x15
+ umlal v8.8h, v3.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v6.2s},[x16],x15
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ ld1 {v7.2s},[x16],x15
+ umlsl v8.8h, v2.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ ld1 {v12.2s},[x8],x15 //vector load pu1_src + src_strd
+ umlal v8.8h, v4.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ ld1 {v13.2s},[x8],x15
+ umlsl v8.8h, v5.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ ld1 {v14.2s},[x8],x15
+ umlal v8.8h, v6.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ ld1 {v15.2s},[x8],x15
+ umlsl v8.8h, v7.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ ld1 {v16.2s},[x8],x15 //vector load pu1_src + src_strd
+
+ umull v10.8h, v15.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v17.2s},[x8],x15
+ umlsl v10.8h, v14.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ ld1 {v18.2s},[x8],x15
+ umlal v10.8h, v16.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ ld1 {v19.2s},[x8],x15 //vector load pu1_src + src_strd
+ umlsl v10.8h, v17.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ // vqrshrun.s16 d20,q4,#6 //right shift and saturating narrow result 1
+ umlal v10.8h, v18.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ umlsl v10.8h, v19.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ st1 {v8.8h},[x1],#16 //store the result pu1_dst
+ umlsl v10.8h, v12.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ umlal v10.8h, v13.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+
+
+ // vqrshrun.s16 d8,q5,#6 //right shift and saturating narrow result 2
+ subs x9,x9,#8 //decrement the wd loop
+ st1 {v10.8h},[x10],#16 //store the result pu1_dst
+ cmp x9,#4
+ bgt inner_loop_8
+
+end_inner_loop_8:
+ subs x19,x19,#2 //decrement the ht loop
+ add x16,x16,x13 //increment the src pointer by 2*src_strd-wd
+ add x1,x10,x12,lsl #1 //increment the dst pointer by 2*dst_strd-wd
+ bgt outer_loop_8
+
+
+
+
+
+ mov x14,x6 //loads wd
+ cmp x14,#12
+
+ beq outer_loop4_residual
+
+ mov x11,x5 //loads ht
+ and x11,x11,#1
+ cmp x11,#1
+ beq height_residue_4
+
+//end_loops
+
+ // ldmfd sp!,{x8-x16,pc} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+outer_loop_16:
+ mov x15, #-7
+ stp x0,x11,[sp,#-16]!
+ add x10,x1,x3,lsl #1 //pu1_dst + dst_strd
+ add x8,x16,x2 //pu1_src + src_strd
+ and x0, x16, #31
+ sub x9,x14,#0 //checks wd
+ //ble end_loops1
+ add x20,x16, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ ld1 {v0.2s},[x16],#8 //vector load pu1_src
+ ld1 {v1.2s},[x16],x15 //vector load pu1_src
+ add x20,x8, x2, lsl #1
+ prfm PLDL1KEEP,[x20]
+ ld1 {v2.2s},[x16],#8
+ ld1 {v3.2s},[x16],x15
+ ld1 {v4.2s},[x16],#8
+ ld1 {v5.2s},[x16],x15
+ ld1 {v6.2s},[x16],#8
+ ld1 {v7.2s},[x16],x15
+ ld1 {v12.2s},[x16],#8
+ ld1 {v13.2s},[x16],x15
+ umull v8.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 {v14.2s},[x16],#8
+ ld1 {v15.2s},[x16],x15
+ umlal v8.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v16.2s},[x16],#8
+ ld1 {v17.2s},[x16],x15
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ ld1 {v18.2s},[x16],#8
+ ld1 {v19.2s},[x16],x15
+ umlsl v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v8.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ umlsl v8.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ umlal v8.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ umlsl v8.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+
+inner_loop_16:
+
+
+ subs x9,x9,#16
+ umull v20.8h, v3.8b, v25.8b
+
+ add x16, x16,#8
+ umlsl v20.8h, v1.8b, v24.8b
+
+ ld1 {v0.2s},[x8],#8 //vector load pu1_src
+ ld1 {v1.2s},[x8],x15 //vector load pu1_src
+ umlal v20.8h, v7.8b, v27.8b
+
+ ld1 {v2.2s},[x8],#8
+ ld1 {v3.2s},[x8],x15
+ umlsl v20.8h, v5.8b, v26.8b
+
+ ld1 {v4.2s},[x8],#8
+ ld1 {v5.2s},[x8],x15
+ umlal v20.8h, v13.8b, v28.8b
+
+ ld1 {v6.2s},[x8],#8
+ ld1 {v7.2s},[x8],x15
+ umlal v20.8h, v17.8b, v30.8b
+
+ ld1 {v12.2s},[x8],#8
+ ld1 {v13.2s},[x8],x15
+ umlsl v20.8h, v15.8b, v29.8b
+
+ ld1 {v14.2s},[x8],#8
+ ld1 {v15.2s},[x8],x15
+ umlsl v20.8h, v19.8b, v31.8b
+
+ ld1 {v16.2s},[x8],#8
+ ld1 {v17.2s},[x8],x15
+ umull v10.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+
+ ld1 {v18.2s},[x8],#8
+ ld1 {v19.2s},[x8],x15
+ umlal v10.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+
+ add x8, x8,#8
+ umlsl v10.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ add x20,x16, x2, lsl #2
+ prfm PLDL1KEEP,[x20]
+ add x20,x8, x2, lsl #2
+ prfm PLDL1KEEP,[x20]
+ st1 {v8.16b},[x1],#16 //store the result pu1_dst
+ umlsl v10.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+
+ add x20,x16,x13 //increment the src pointer by 2*src_strd-wd
+ csel x16, x20, x16,eq
+ umlal v10.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+
+ add x20,x16,x2 //pu1_src + src_strd
+ csel x8, x20, x8,eq
+ umlsl v10.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+
+// and x11, x16, #31
+ umlal v10.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+
+ sub x20,x19,#2
+ csel x19, x20, x19,eq
+ umlsl v10.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+
+ //cmp x11, x0
+ umull v22.8h, v3.8b, v25.8b
+
+// add x20,x16, x2, lsl #2
+ prfm PLDL1KEEP,[x20]
+ umlsl v22.8h, v1.8b, v24.8b
+
+ st1 {v20.8h},[x1],#16
+ umlal v22.8h, v7.8b, v27.8b
+
+// add x20,x8, x2, lsl #2
+ prfm PLDL1KEEP,[x20]
+ umlsl v22.8h, v5.8b, v26.8b
+
+// mov x0, x11
+ umlal v22.8h, v13.8b, v28.8b
+
+ cmp x19,#0
+ umlal v22.8h, v17.8b, v30.8b
+
+ st1 {v10.8h},[x10],#16
+ umlsl v22.8h, v15.8b, v29.8b
+
+ umlsl v22.8h, v19.8b, v31.8b
+
+ beq epilog_16
+
+ ld1 {v0.2s},[x16],#8 //vector load pu1_src
+ ld1 {v1.2s},[x16],x15 //vector load pu1_src
+ ld1 {v2.2s},[x16],#8
+ ld1 {v3.2s},[x16],x15
+ ld1 {v4.2s},[x16],#8
+ ld1 {v5.2s},[x16],x15
+ ld1 {v6.2s},[x16],#8
+ ld1 {v7.2s},[x16],x15
+ ld1 {v12.2s},[x16],#8
+ ld1 {v13.2s},[x16],x15
+ umull v8.8h, v2.8b, v25.8b //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
+ ld1 {v14.2s},[x16],#8
+ ld1 {v15.2s},[x16],x15
+ umlal v8.8h, v6.8b, v27.8b //mul_res = vmull_u8(src[0_3], coeffabs_3)//
+ ld1 {v16.2s},[x16],#8
+ ld1 {v17.2s},[x16],x15
+ umlsl v8.8h, v0.8b, v24.8b //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
+ ld1 {v18.2s},[x16],#8
+ ld1 {v19.2s},[x16],x15
+ umlsl v8.8h, v4.8b, v26.8b //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
+ umlal v8.8h, v12.8b, v28.8b //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
+ cmp x9,#0
+ umlsl v8.8h, v14.8b, v29.8b //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
+ mov x20,x14
+ csel x9, x20, x9,eq
+ umlal v8.8h, v16.8b, v30.8b //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
+ st1 {v22.16b},[x10],#16 //store the result pu1_dst
+ umlsl v8.8h, v18.8b, v31.8b //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
+ add x20,x10,x12,lsl #1
+ csel x1, x20, x1,eq
+ add x20,x1,x3,lsl #1 //pu1_dst + dst_strd
+ csel x10, x20, x10,eq
+ b inner_loop_16
+
+
+epilog_16:
+// vqrshrun.s16 d11,q11,#6
+ st1 {v22.16b},[x10],#16 //store the result pu1_dst
+
+ ldp x0,x11,[sp],#16
+ mov x14,x6
+ cmp x14,#24
+ beq outer_loop8_residual
+ add x1,x10,x12,lsl #1
+ mov x11,x5 //loads ht
+ and x11,x11,#1
+ cmp x11,#1
+ beq height_residue_4
+
+end_loops1:
+
+ // ldmfd sp!,{x8-x16,pc} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
new file mode 100644
index 0000000..b94ec3c
--- /dev/null
+++ b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@@ -0,0 +1,418 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//******************************************************************************
+//* @file
+//* ihevc_inter_pred_luma_vert_w16inp_w16out.s
+//*
+//* @brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//* - ihevc_inter_pred_luma_vert_w16inp_w16out()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
+///* include reconstruction */
+//
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma vertical filter for 16-bit input and 16-bit output.
+//*
+//* @par description:
+//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+//* the elements pointed to by 'pi2_src' and writes to the location pointed
+//* to by 'pi2_dst'. the input is 16 bits; the filter output is downshifted
+//* by 6 and kept as 16 bits without clipping. assumptions: the function is
+//* optimized assuming width is a multiple of 4 and height a multiple of 2.
+//*
+//* @param[in] pi2_src
+//* word16 pointer to the source
+//*
+//* @param[out] pi2_dst
+//* word16 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* word8 pointer to the filter coefficients
+//*
+//* @param[in] ht
+//* integer height of the array
+//*
+//* @param[in] wd
+//* integer width of the array
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
+//                                    word16 *pi2_dst,
+//                                    word32 src_strd,
+//                                    word32 dst_strd,
+//                                    word8 *pi1_coeff,
+//                                    word32 ht,
+//                                    word32 wd)
+//**************variables vs registers*****************************************
+// x0 => *pi2_src
+// x1 => *pi2_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => *pi1_coeff
+// x5 => ht
+// x6 => wd
+
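+//hedged scalar sketch of the arithmetic below (illustrative only): the
+//kernel folds in an offset of (8 << 16) via v30 and downshifts by 6, so per
+//output sample, with taps from three rows above to four rows below:
+//
+//    word32 sum = 0;
+//    for(word32 k = 0; k < 8; k++)
+//        sum += pi1_coeff[k] * pi2_src[(row + k - 3) * src_strd + col];
+//    sum -= (8 << 16);                                   //offset in v30
+//    pi2_dst[row * dst_strd + col] = (word16)(sum >> 6); //no clipping
+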
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8
+
+.type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function
+
+ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
+
+ //stmfd sp!, {r4-r12, r14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19,x20,[sp, #-16]!
+
+ mov x15,x4 // pi1_coeff
+ mov x16,x5 // ht
+ mov x17,x6 // wd
+
+
+ mov x12,x15 //load pi1_coeff
+ lsl x6,x3,#1
+ mov x5,x17 //load wd
+ ld1 {v0.8b},[x12] //coeff = ld1_s8(pi1_coeff)
+ lsl x2, x2,#1
+    sub x12,x2,x2,lsl #2                //x12 = -3 * src_strd (in bytes)
+    //vabs.s8 d0,d0                     //vabs_s8(coeff)
+    add x0,x0,x12                       //pi2_src -= 3 * src_strd (filter taps start 3 rows above)
+ mov x3,x16 //load ht
+ subs x7,x3,#0 //r3->ht
+ //ble end_loops //end loop jump
+ sxtl v0.8h,v0.8b
+ dup v22.4h,v0.h[0] //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
+ dup v23.4h,v0.h[1] //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
+ dup v24.4h,v0.h[2] //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
+ dup v25.4h,v0.h[3] //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
+ dup v26.4h,v0.h[4] //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
+ dup v27.4h,v0.h[5] //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
+ dup v28.4h,v0.h[6] //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
+ dup v29.4h,v0.h[7] //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
+ movi v30.4s,#8, lsl #16
+
+ sub x9,x5,x6,lsl #2 //r6->dst_strd r5 ->wd
+ neg x9,x9
+ sub x8,x5,x2,lsl #2 //r2->src_strd
+ neg x8,x8
+ sub x8,x8,x5
+ sub x9,x9,x5
+    lsr x3, x5, #2                      //wd / 4
+    mul x7, x7, x3                      //multiply height by (width / 4)
+    sub x7, x7, #4                      //subtract one iteration's worth (4) for the epilog
+ mov x4,x5 //r5 ->wd
+ //mov r2, r2, lsl #1
+
+prolog:
+
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+ ld1 {v0.4h},[x0], #8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+ subs x4,x4,#4
+ ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
+ smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+ smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+ smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
+ smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+
+ ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+
+ smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ add x20,x0,x8,lsl #0
+ csel x0,x20,x0,le
+ smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ csel x4,x5,x4,le
+ smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+ smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+ smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ sub v8.4s, v8.4s, v30.4s
+
+ ld1 {v1.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smull v12.4s,v3.4h,v23.4h
+ ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+ smlal v12.4s,v2.4h,v22.4h
+ ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smlal v12.4s,v4.4h,v24.4h
+ smlal v12.4s,v5.4h,v25.4h
+ smlal v12.4s,v6.4h,v26.4h
+ smlal v12.4s,v7.4h,v27.4h
+ smlal v12.4s,v16.4h,v28.4h
+ smlal v12.4s,v17.4h,v29.4h
+ add x14,x1,x6
+ sub v10.4s, v10.4s, v30.4s
+ shrn v8.4h, v8.4s, #6
+ //vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ smull v14.4s,v4.4h,v23.4h
+ smlal v14.4s,v3.4h,v22.4h
+ smlal v14.4s,v5.4h,v24.4h
+ smlal v14.4s,v6.4h,v25.4h
+ ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v7.4h,v26.4h
+ ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v16.4h,v27.4h
+ ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v17.4h,v28.4h
+ ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v18.4h,v29.4h
+ ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
+
+ st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+ sub v12.4s, v12.4s, v30.4s
+ shrn v10.4h, v10.4s, #6
+ //vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ add x20, x1, x9
+ csel x1, x20, x1, le
+
+ subs x7,x7,#4
+
+
+ blt epilog_end //jumps to epilog_end
+ beq epilog //jumps to epilog
+
+kernel_8:
+
+ smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ subs x4,x4,#4
+ smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ add x20,x0,x8,lsl #0
+ csel x0,x20,x0,le
+ smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
+
+    sub v14.4s, v14.4s, v30.4s
+ shrn v12.4h, v12.4s, #6
+ //vqrshrun d12,q6,#6
+ ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+
+ smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ st1 {v12.2s},[x14],x6
+
+ smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+
+ smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+
+ sub v8.4s, v8.4s, v30.4s
+ shrn v14.4h, v14.4s, #6
+ //vqrshrun d14,q7,#6
+
+ smull v12.4s,v3.4h,v23.4h
+ csel x4,x5,x4,le
+
+ smlal v12.4s,v2.4h,v22.4h
+ ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+
+ smlal v12.4s,v4.4h,v24.4h
+ add x3,x0,x2 //pu1_src_tmp += src_strd//
+
+ smlal v12.4s,v5.4h,v25.4h
+
+ smlal v12.4s,v6.4h,v26.4h
+ st1 {v14.2s},[x14],x6
+
+ smlal v12.4s,v7.4h,v27.4h
+ ld1 {v1.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+
+ smlal v12.4s,v16.4h,v28.4h
+ add x14,x1,x6
+
+ smlal v12.4s,v17.4h,v29.4h
+ ld1 {v0.4h},[x0],#8 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+
+ sub v10.4s, v10.4s, v30.4s
+ shrn v8.4h, v8.4s, #6
+ //vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ ld1 {v2.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+
+ smull v14.4s,v4.4h,v23.4h
+ smlal v14.4s,v3.4h,v22.4h
+ smlal v14.4s,v5.4h,v24.4h
+ ld1 {v3.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
+
+ smlal v14.4s,v6.4h,v25.4h
+ ld1 {v4.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v7.4h,v26.4h
+ ld1 {v5.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v16.4h,v27.4h
+ ld1 {v6.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v17.4h,v28.4h
+ ld1 {v7.4h},[x3],x2 //src_tmp4 = ld1_u8(pu1_src_tmp)//
+ smlal v14.4s,v18.4h,v29.4h
+ st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+
+ sub v12.4s, v12.4s, v30.4s
+ shrn v10.4h, v10.4s, #6
+ add x20, x1, x9
+ csel x1, x20, x1, le
+
+ //vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+ subs x7,x7,#4
+
+ bgt kernel_8 //jumps to kernel_8
+
+epilog:
+
+ smull v8.4s,v1.4h,v23.4h //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
+ smlal v8.4s,v0.4h,v22.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
+ smlal v8.4s,v2.4h,v24.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
+ smlal v8.4s,v3.4h,v25.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
+ smlal v8.4s,v4.4h,v26.4h //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
+ smlal v8.4s,v5.4h,v27.4h //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
+ smlal v8.4s,v6.4h,v28.4h //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
+ smlal v8.4s,v7.4h,v29.4h //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
+ st1 {v10.2s},[x14],x6
+
+ sub v14.4s, v14.4s, v30.4s
+ shrn v12.4h, v12.4s, #6
+ //vqrshrun d12,q6,#6
+
+ ld1 {v16.4h},[x3],x2 //src_tmp1 = ld1_u8(pu1_src_tmp)//
+ smull v10.4s,v2.4h,v23.4h //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
+ smlal v10.4s,v1.4h,v22.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
+ smlal v10.4s,v3.4h,v24.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
+ smlal v10.4s,v4.4h,v25.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
+ smlal v10.4s,v5.4h,v26.4h //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
+ smlal v10.4s,v6.4h,v27.4h //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
+ smlal v10.4s,v7.4h,v28.4h //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
+ smlal v10.4s,v16.4h,v29.4h //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
+ st1 {v12.2s},[x14],x6
+
+ sub v8.4s, v8.4s, v30.4s
+ shrn v14.4h, v14.4s, #6
+ //vqrshrun d14,q7,#6
+
+ ld1 {v17.4h},[x3],x2 //src_tmp2 = ld1_u8(pu1_src_tmp)//
+ smull v12.4s,v3.4h,v23.4h
+ smlal v12.4s,v2.4h,v22.4h
+ smlal v12.4s,v4.4h,v24.4h
+ smlal v12.4s,v5.4h,v25.4h
+ smlal v12.4s,v6.4h,v26.4h
+ smlal v12.4s,v7.4h,v27.4h
+ smlal v12.4s,v16.4h,v28.4h
+ smlal v12.4s,v17.4h,v29.4h
+ st1 {v14.2s},[x14],x6
+ sub v10.4s, v10.4s, v30.4s
+ shrn v8.4h, v8.4s, #6
+ //vqrshrun d8,q4,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ ld1 {v18.4h},[x3],x2 //src_tmp3 = ld1_u8(pu1_src_tmp)//
+ smull v14.4s,v4.4h,v23.4h
+ smlal v14.4s,v3.4h,v22.4h
+ smlal v14.4s,v5.4h,v24.4h
+ smlal v14.4s,v6.4h,v25.4h
+ smlal v14.4s,v7.4h,v26.4h
+ smlal v14.4s,v16.4h,v27.4h
+ smlal v14.4s,v17.4h,v28.4h
+ smlal v14.4s,v18.4h,v29.4h
+ sub v12.4s, v12.4s, v30.4s
+ shrn v10.4h, v10.4s, #6
+ //vqrshrun d10,q5,#6 //sto_res = vqmovun_s16(sto_res_tmp)//
+
+ add x14,x1,x6
+ st1 {v8.2s},[x1],#8 //st1_u8(pu1_dst,sto_res)//
+
+epilog_end:
+ st1 {v10.2s},[x14],x6 //st1_u8(pu1_dst_tmp,sto_res)//
+ shrn v12.4h, v12.4s, #6
+ //vqrshrun d12,q6,#6
+
+ st1 {v12.2s},[x14],x6
+ sub v14.4s, v14.4s, v30.4s
+ shrn v14.4h, v14.4s, #6
+ //vqrshrun d14,q7,#6
+
+ st1 {v14.2s},[x14],x6
+
+
+end_loops:
+
+ //ldmfd sp!,{r4-r12,r15} //reload the registers from sp
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_dc.s b/common/arm64/ihevc_intra_pred_chroma_dc.s
new file mode 100644
index 0000000..2fdee98
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_dc.s
@@ -0,0 +1,300 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_dc.s
+//*
+//* @brief
+//* contains function definitions for intra prediction dc filtering.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for dc input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+// pi1_coeff
+
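+//hedged scalar sketch of the dc value computed below for one chroma plane
+//(names are illustrative): top and left neighbours are summed per plane,
+//'nt' is added for rounding, and the shift equals log2(2 * nt), matching
+//the clz-derived shift kept in x12.
+//
+//    word32 sum = nt;                 //rounding term
+//    for(word32 i = 0; i < nt; i++)
+//        sum += left[i] + top[i];     //one plane of the interleaved cb/cr refs
+//    dc = sum >> (log2nt + 1);        //log2nt + 1 == log2(2 * nt)
+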
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_dc_av8
+
+.type ihevc_intra_pred_chroma_dc_av8, %function
+
+ihevc_intra_pred_chroma_dc_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x9, #0
+ mov v17.s[0], w9
+ mov v17.s[1], w9
+
+ clz w5,w4 //counts leading zeros
+
+ add x6, x0, x4,lsl #1 //&src[2nt]
+ mov v18.s[0], w9
+ mov v18.s[1], w9
+    sub x20, x5, #32
+    neg x5, x20                         //x5 = 32 - clz(nt) = log2nt + 1
+ add x7, x0, x4, lsl #2 //&src[4nt]
+ mov x12,x5
+ add x8, x7, #2 //&src[4nt+2]
+
+ cmp x4, #4
+ beq dc_4 //nt=4 loop
+
+
+add_loop:
+ ld2 {v30.8b, v31.8b}, [x6], #16 //load from src[nt]
+ lsl x10,x4,#1 //2nt
+
+ uaddlp v2.4h, v30.8b
+ subs x10, x10,#0x10
+
+ ld2 {v26.8b, v27.8b}, [x8],#16 //load from src[2nt+1]
+
+ uaddlp v3.4h, v31.8b
+ uaddlp v2.2s, v2.4h
+ uaddlp v3.2s, v3.4h
+
+ uadalp v17.1d, v2.2s
+
+ uadalp v18.1d, v3.2s
+
+ uaddlp v2.4h, v26.8b
+ uaddlp v3.4h, v27.8b
+
+ uaddlp v2.2s, v2.4h
+ uaddlp v3.2s, v3.4h
+
+ uadalp v17.1d, v2.2s
+ uadalp v18.1d, v3.2s
+
+ beq epil_add_loop
+
+core_loop_add:
+ ld2 {v30.8b, v31.8b}, [x6],#16 //load from src[nt]
+ uaddlp v28.4h, v30.8b
+ uaddlp v3.4h, v31.8b
+
+ ld2 {v26.8b, v27.8b}, [x8],#16 //load from src[2nt+1]
+
+ uaddlp v3.2s, v3.4h
+ uaddlp v29.2s, v28.4h
+
+ uadalp v18.1d, v3.2s
+ uadalp v17.1d, v29.2s
+
+ uaddlp v3.4h, v27.8b
+ uaddlp v28.4h, v26.8b
+
+ uaddlp v3.2s, v3.4h
+ uaddlp v29.2s, v28.4h
+
+ uadalp v18.1d, v3.2s
+ uadalp v17.1d, v29.2s
+
+
+epil_add_loop:
+
+ smov x1, v18.2s[0]
+ smov x11, v17.2s[0]
+
+ add x1,x1,x4
+ add x11,x11,x4
+
+ lsr x1,x1,x12
+ lsr x11,x11,x12
+
+ dup v17.8b,w1
+ dup v16.8b,w11
+
+prologue_cpy_32:
+
+ add x5, x2, x3
+ subs x9, x4, #8
+ lsl x6, x3, #2
+ csel x11, x6, x11,eq
+ add x8, x5, x3
+ add x10, x8, x3
+
+ beq epilogue_copy
+
+ st2 {v16.8b, v17.8b}, [x2],#16
+ add x6, x6, #-16
+
+ st2 {v16.8b, v17.8b}, [x5],#16
+ st2 {v16.8b, v17.8b}, [x8],#16
+ mov x20,#16
+ csel x11, x20, x11,ne
+ st2 {v16.8b, v17.8b}, [x10],#16
+
+
+ st2 {v16.8b, v17.8b}, [x2], x6
+ st2 {v16.8b, v17.8b}, [x5], x6
+ st2 {v16.8b, v17.8b}, [x8], x6
+ st2 {v16.8b, v17.8b}, [x10], x6
+
+kernel_copy:
+ st2 {v16.8b, v17.8b}, [x2],#16
+ st2 {v16.8b, v17.8b}, [x5],#16
+ st2 {v16.8b, v17.8b}, [x8],#16
+ st2 {v16.8b, v17.8b}, [x10],#16
+
+ st2 {v16.8b, v17.8b}, [x2], x6
+ st2 {v16.8b, v17.8b}, [x5], x6
+ st2 {v16.8b, v17.8b}, [x8], x6
+ st2 {v16.8b, v17.8b}, [x10], x6
+
+ st2 {v16.8b, v17.8b}, [x2],#16
+ st2 {v16.8b, v17.8b}, [x5],#16
+ st2 {v16.8b, v17.8b}, [x8],#16
+ st2 {v16.8b, v17.8b}, [x10],#16
+
+ st2 {v16.8b, v17.8b}, [x2], x6
+ st2 {v16.8b, v17.8b}, [x5], x6
+ st2 {v16.8b, v17.8b}, [x8], x6
+ st2 {v16.8b, v17.8b}, [x10], x6
+
+epilogue_copy:
+ st2 {v16.8b, v17.8b}, [x2],x11
+ st2 {v16.8b, v17.8b}, [x5],x11
+ st2 {v16.8b, v17.8b}, [x8],x11
+ st2 {v16.8b, v17.8b}, [x10],x11
+
+ st2 {v16.8b, v17.8b}, [x2]
+ st2 {v16.8b, v17.8b}, [x5]
+ st2 {v16.8b, v17.8b}, [x8]
+ st2 {v16.8b, v17.8b}, [x10]
+ b end_func
+
+dc_4:
+ ld2 {v30.8b, v31.8b},[x6] //load from src[nt]
+ shl d3, d30,#32
+
+ ld2 {v26.8b, v27.8b},[x8] //load from src[2nt+1]
+ shl d2, d31,#32
+
+ uaddlp v3.4h, v3.8b
+ uaddlp v2.4h, v2.8b
+ uaddlp v3.2s, v3.4h
+ uaddlp v2.2s, v2.4h
+ uadalp v17.1d, v3.2s
+ uadalp v18.1d, v2.2s
+
+ shl d3, d26,#32
+ shl d2, d27,#32
+ uaddlp v3.4h, v3.8b
+ uaddlp v2.4h, v2.8b
+ uaddlp v3.2s, v3.4h
+ uaddlp v2.2s, v2.4h
+ uadalp v17.1d, v3.2s
+ uadalp v18.1d, v2.2s
+
+ smov x10, v17.2s[0]
+ smov x11, v18.2s[0]
+
+ add x10,x10,x4
+ add x11,x11,x4
+ lsr x10,x10,x12
+ lsr x11,x11,x12
+ orr x10,x10,x11,lsl #8
+ dup v0.4h,w10
+
+ st1 {v0.8b},[x2],x3
+ st1 {v0.8b},[x2],x3
+ st1 {v0.8b},[x2],x3
+ st1 {v0.8b},[x2]
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_horz.s b/common/arm64/ihevc_intra_pred_chroma_horz.s
new file mode 100644
index 0000000..da41e59
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_horz.s
@@ -0,0 +1,361 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_horz.s
+//*
+//* @brief
+//* contains function definition for intra prediction interpolation filters
+//*
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//* - ihevc_intra_pred_chroma_horz()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* intra prediction interpolation filter for horizontal chroma variable.
+//*
+//* @par description:
+//* horizontal intraprediction(mode 10) with reference samples location
+//* pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst' refer
+//* to section 8.4.4.2.6 in the standard (special case)
+//*
+//* @param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* integer transform block size
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
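+//hedged sketch of plain horizontal prediction for interleaved chroma (the
+//reference indexing is an assumption based on the two_nt-relative
+//addressing in this kernel): every output row repeats that row's left
+//neighbour cb/cr pair across the row.
+//
+//    for(word32 row = 0; row < nt; row++)
+//        for(word32 col = 0; col < nt; col++)
+//        {
+//            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * (two_nt - 1 - row)];     //cb
+//            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * (two_nt - 1 - row) + 1]; //cr
+//        }
+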
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_horz_av8
+
+.type ihevc_intra_pred_chroma_horz_av8, %function
+
+ihevc_intra_pred_chroma_horz_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ lsl x6,x4,#2 //four_nt
+
+ add x12,x0,x6 //*pu1_ref[four_nt]
+ cmp x4,#4 //if nt == 4
+ beq core_loop_4
+
+ cmp x4,#8 //if nt == 8
+ beq core_loop_8
+
+ //cmp x4,#16 @if nt == 16
+ //beq core_loop_16
+
+ sub x12,x12,#16 //move to 16th value pointer
+ add x9,x2,#16
+
+core_loop_16:
+ ld1 { v0.8h},[x12] //load 16 values. d1[7] will have the 1st value.
+ sub x12,x12,#16
+ ld1 { v10.8h},[x12] //load 16 values. d1[7] will have the 1st value.
+
+ dup v2.8h, v0.4h[7] //duplicate the i value.
+
+ dup v4.8h, v0.4h[6] //duplicate the ii value.
+ dup v6.8h, v0.4h[5] //duplicate the iii value.
+ st1 { v2.8h},[x2],x3 //store in 1st row 0-16 columns
+ st1 { v2.8h},[x9],x3 //store in 1st row 16-32 columns
+
+ dup v8.8h, v0.4h[4]
+ st1 { v4.8h},[x2],x3
+ st1 { v4.8h},[x9],x3
+
+ dup v2.8h, v0.4h[3]
+ st1 { v6.8h},[x2],x3
+ st1 { v6.8h},[x9],x3
+
+ dup v4.8h, v0.4h[2]
+ st1 { v8.8h},[x2],x3
+ st1 { v8.8h},[x9],x3
+
+ dup v6.8h, v0.4h[1]
+ st1 { v2.8h},[x2],x3
+ st1 { v2.8h},[x9],x3
+
+ dup v8.8h, v0.4h[0]
+ st1 { v4.8h},[x2],x3
+ st1 { v4.8h},[x9],x3
+
+ dup v2.8h, v10.4h[7]
+ st1 { v6.8h},[x2],x3
+ st1 { v6.8h},[x9],x3
+
+ dup v4.8h, v10.4h[6]
+ st1 { v8.8h},[x2],x3
+ st1 { v8.8h},[x9],x3
+
+ dup v6.8h, v10.4h[5]
+ st1 { v2.8h},[x2],x3
+ st1 { v2.8h},[x9],x3
+
+ dup v8.8h, v10.4h[4]
+ st1 { v4.8h},[x2],x3
+ st1 { v4.8h},[x9],x3
+
+ dup v2.8h, v10.4h[3]
+ st1 { v6.8h},[x2],x3
+ st1 { v6.8h},[x9],x3
+
+ dup v4.8h, v10.4h[2]
+ st1 { v8.8h},[x2],x3
+ st1 { v8.8h},[x9],x3
+
+ dup v6.8h, v10.4h[1]
+ st1 { v2.8h},[x2],x3
+ st1 { v2.8h},[x9],x3
+ sub x12,x12,#16 //move to 16th value pointer
+
+ dup v8.8h, v10.4h[0]
+ st1 { v4.8h},[x2],x3
+ st1 { v4.8h},[x9],x3
+
+ subs x4,x4,#16 //decrement the loop count by 16
+ st1 { v6.8h},[x2],x3
+ st1 { v6.8h},[x9],x3
+
+ st1 { v8.8h},[x2],x3
+ st1 { v8.8h},[x9],x3
+ bgt core_loop_16
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+ b endloop
+
+core_loop_8:
+ ldrb w14,[x12],#1 //pu1_ref[two_nt]
+ sxtw x14,w14
+ //vld1.8 {q15},[x12] @pu1_ref[two_nt + 1 + col]
+
+ dup v28.8b,w14
+ sub x12,x12,#17
+ ld1 { v0.16b},[x12]
+
+ sub x12,x12,#16
+// ld1 { v30.16b},[x12]
+ dup v10.8h, v0.4h[7]
+ //vmovl.u8 q13,d26
+
+ dup v2.8h, v0.4h[6]
+ //vsubl.u8 q12,d30,d28
+
+ dup v4.8h, v0.4h[5]
+ //vshr.s16 q12,q12,#1
+
+ dup v6.8h, v0.4h[4]
+ //vqadd.s16 q11,q13,q12
+
+ dup v8.8h, v0.4h[3]
+ //vqmovun.s16 d22,q11
+
+ st1 { v10.8h},[x2],x3
+
+ dup v10.8h, v0.4h[2]
+ //vsubl.u8 q12,d31,d28
+
+ dup v12.8h, v0.4h[1]
+ //vshr.s16 q12,q12,#1
+
+ dup v14.8h, v0.4h[0]
+ //vqadd.s16 q11,q13,q12
+
+ dup v16.8h, v0.4h[3]
+ //vqmovun.s16 d22,q11
+
+ st1 { v2.8h},[x2],x3
+ //sub x2,x2,#8
+
+ st1 { v4.8h},[x2],x3
+
+ st1 { v6.8h},[x2],x3
+ st1 { v8.8h},[x2],x3
+ st1 { v10.8h},[x2],x3
+
+ //vdup.8 q1,d0[2]
+ st1 { v12.8h},[x2],x3
+
+ //vdup.8 q2,d0[1]
+ st1 { v14.8h},[x2],x3
+
+ //vdup.8 q3,d0[0]
+ //vst1.8 {q7},[x2],x3
+
+ //vdup.8 q4,d0[3]
+ //vst1.8 {q8},[x2],x3
+
+ //vdup.8 q5,d0[2]
+ //vst1.8 {q1},[x2],x3
+
+ //vdup.8 q6,d0[1]
+ //vst1.8 {q2},[x2],x3
+
+ //vdup.8 q7,d0[0]
+ //vst1.8 {q3},[x2],x3
+
+ //vst1.8 {q4},[x2],x3
+ //vst1.8 {q5},[x2],x3
+ //vst1.8 {q6},[x2],x3
+ //vst1.8 {q7},[x2],x3
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+ b endloop
+
+
+core_loop_4:
+ ldrb w14,[x12] //pu1_ref[two_nt]
+ sxtw x14,w14
+ add x12,x12,#1 //pu1_ref[two_nt + 1]
+ //vld1.8 {d30},[x12] @pu1_ref[two_nt + 1 + col]
+
+ sub x12,x12,#9
+ ld1 {v0.8b},[x12]
+ sub x12,x12,#8
+ ld1 {v30.8b},[x12]
+ dup v26.4h, v0.4h[3]
+ dup v28.8b,w14
+
+ dup v3.4h, v0.4h[2]
+ uxtl v26.8h, v26.8b
+
+ dup v4.4h, v0.4h[1]
+ usubl v24.8h, v30.8b, v28.8b
+
+ dup v5.4h, v0.4h[0]
+ sshr v24.8h, v24.8h,#1
+
+ dup v6.4h, v0.4h[3]
+ sqadd v22.8h, v26.8h , v24.8h
+
+ dup v7.4h, v0.4h[2]
+ sqxtun v22.8b, v22.8h
+
+ st1 {v6.8b},[x2],x3
+ st1 {v3.8b},[x2],x3
+
+ dup v8.4h, v0.4h[1]
+ st1 {v4.8b},[x2],x3
+ st1 {v5.8b},[x2],x3
+
+ dup v9.4h, v0.4h[0]
+ //vst1.8 {d6},[x2],x3
+ //vst1.8 {d7},[x2],x3
+
+ //vst1.8 {d8},[x2],x3
+ //vst1.8 {d9},[x2],x3
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+ b endloop
+
+
+//core_loop_4
+ ldrb w14,[x12] //pu1_ref[two_nt]
+ sxtw x14,w14
+ add x12,x12,#1 //pu1_ref[two_nt + 1]
+ ld1 {v30.8b},[x12] //pu1_ref[two_nt + 1 + col]
+
+ sub x12,x12,#5
+ ld1 {v0.8b},[x12]
+ dup v28.8b,w14
+ dup v26.8b, v0.8b[3]
+ uxtl v26.8h, v26.8b
+
+ dup v3.8b, v0.8b[2]
+ usubl v24.8h, v30.8b, v28.8b
+
+ dup v4.8b, v0.8b[1]
+ sshr v24.8h, v24.8h,#1
+
+ dup v5.8b, v0.8b[0]
+ sqadd v22.8h, v26.8h , v24.8h
+
+ sqxtun v22.8b, v22.8h
+
+ st1 {v22.s}[0],[x2],x3
+ st1 {v3.s}[0],[x2],x3
+ st1 {v4.s}[0],[x2],x3
+ st1 {v5.s}[0],[x2],x3
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+endloop:
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode2.s b/common/arm64/ihevc_intra_pred_chroma_mode2.s
new file mode 100644
index 0000000..d2c0730
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode2.s
@@ -0,0 +1,312 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_mode2.s
+//*
+//* @brief
+//* contains function definitions for intra prediction mode 2 filtering.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for mode 2 input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode2(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+// pi1_coeff
+
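+//hedged sketch of mode 2 (the fully diagonal 45-degree mode) for
+//interleaved chroma; the exact reference index is an assumption consistent
+//with the reversing (rev64) copies in the kernel:
+//
+//    for(word32 row = 0; row < nt; row++)
+//        for(word32 col = 0; col < nt; col++)
+//        {
+//            word32 idx = two_nt - 2 - row - col; //walk down-left along the diagonal
+//            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * idx];
+//            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * idx + 1];
+//        }
+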
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode2_av8
+
+.type ihevc_intra_pred_chroma_mode2_av8, %function
+
+ihevc_intra_pred_chroma_mode2_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x8,#-4
+
+ cmp x4,#4
+ beq mode2_4
+
+ add x0,x0,x4,lsl #2
+
+ sub x0,x0,#0x12 //src[1]
+ add x10,x0,#-2
+
+prologue_cpy_32:
+
+ ld2 {v0.8b, v1.8b},[x0],x8
+
+ mov x11,x4
+ rev64 v16.8b, v0.8b
+ rev64 v17.8b, v1.8b
+
+ ld2 {v2.8b, v3.8b},[x10],x8
+ mov x6, x2
+
+ ld2 {v4.8b, v5.8b},[x0],x8
+ ld2 {v6.8b, v7.8b},[x10],x8
+ lsr x1, x4, #3
+
+ ld2 {v8.8b, v9.8b},[x0],x8
+ ld2 {v10.8b, v11.8b},[x10],x8
+ ld2 {v12.8b, v13.8b},[x0],x8
+ mul x1, x4, x1
+
+ ld2 {v14.8b, v15.8b},[x10],x8
+ add x7,x6,x3
+
+ rev64 v18.8b, v2.8b
+ rev64 v19.8b, v3.8b
+ lsl x5, x3, #2
+
+ rev64 v20.8b, v4.8b
+ rev64 v21.8b, v5.8b
+ add x9,x7,x3
+
+ rev64 v22.8b, v6.8b
+ rev64 v23.8b, v7.8b
+
+ rev64 v24.8b, v8.8b
+ rev64 v25.8b, v9.8b
+
+ rev64 v26.8b, v10.8b
+ subs x1,x1,#8
+
+ rev64 v27.8b, v11.8b
+
+ rev64 v28.8b, v12.8b
+ rev64 v29.8b, v13.8b
+
+ rev64 v30.8b, v14.8b
+ add x14,x9,x3
+ rev64 v31.8b, v15.8b
+
+ beq epilogue_mode2
+
+ sub x12,x4,#8
+
+kernel_mode2:
+
+ st2 {v16.8b, v17.8b},[x6],x5
+ st2 {v18.8b, v19.8b},[x7],x5
+ subs x11,x11,#8
+ st2 {v20.8b, v21.8b},[x9],x5
+ st2 {v22.8b, v23.8b},[x14],x5
+ st2 {v24.8b, v25.8b},[x6],x5
+ add x20,x2,#16
+ csel x2, x20, x2,gt
+ st2 {v26.8b, v27.8b},[x7],x5
+ st2 {v28.8b, v29.8b},[x9],x5
+ st2 {v30.8b, v31.8b},[x14],x5
+
+ ld2 {v0.8b, v1.8b},[x0],x8
+ csel x11, x4, x11,le
+
+ ld2 {v2.8b, v3.8b},[x10],x8
+ ld2 {v4.8b, v5.8b},[x0],x8
+ add x20, x2, x3, lsl #2
+ csel x2, x20, x2,le
+ ld2 {v6.8b, v7.8b},[x10],x8
+ rev64 v16.8b, v0.8b
+
+ ld2 {v8.8b, v9.8b},[x0],x8
+ ld2 {v10.8b, v11.8b},[x10],x8
+ sub x20, x6,#16
+ csel x2, x20, x2,le
+ ld2 {v12.8b, v13.8b},[x0],x8
+ rev64 v17.8b, v1.8b
+ ld2 {v14.8b, v15.8b},[x10],x8
+
+ subs x12,x12,#8
+ mov x6, x2
+ add x20, x0, x4,lsl #1
+ csel x0, x20, x0,le
+ add x7, x6, x3
+
+ rev64 v18.8b, v2.8b
+ sub x20, x0, #16
+ csel x0, x20, x0,le
+ rev64 v19.8b, v3.8b
+
+ rev64 v20.8b, v4.8b
+ csel x12, x4, x12,le
+ rev64 v21.8b, v5.8b
+
+ rev64 v22.8b, v6.8b
+ add x9, x7, x3
+ rev64 v23.8b, v7.8b
+
+ rev64 v24.8b, v8.8b
+ add x10,x0,#-2
+ rev64 v25.8b, v9.8b
+
+ rev64 v26.8b, v10.8b
+ subs x1, x1, #8
+ rev64 v27.8b, v11.8b
+
+ rev64 v28.8b, v12.8b
+ rev64 v29.8b, v13.8b
+
+ rev64 v30.8b, v14.8b
+ add x14, x9, x3
+ rev64 v31.8b, v15.8b
+
+ bne kernel_mode2
+
+epilogue_mode2:
+
+ st2 {v16.8b, v17.8b},[x6],x5
+ st2 {v18.8b, v19.8b},[x7],x5
+ st2 {v20.8b, v21.8b},[x9],x5
+ st2 {v22.8b, v23.8b},[x14],x5
+ st2 {v24.8b, v25.8b},[x6],x5
+ st2 {v26.8b, v27.8b},[x7],x5
+ st2 {v28.8b, v29.8b},[x9],x5
+ st2 {v30.8b, v31.8b},[x14],x5
+
+ b end_func
+
+mode2_4:
+
+ lsl x12,x4,#1
+ add x0,x0,x12
+ sub x0,x0,#2
+
+ ld2 {v12.8b, v13.8b},[x0],x8
+ shl d0, d12,#32
+ add x10,x0,#2
+ shl d1, d13,#32
+
+ rev64 v0.8b, v0.8b
+ ld2 {v14.8b, v15.8b},[x10],x8
+ shl d2, d14,#32
+
+ rev64 v1.8b, v1.8b
+ shl d3, d15,#32
+ zip1 v0.8b, v0.8b, v1.8b
+ zip2 v1.8b, v0.8b, v1.8b
+ st1 {v0.8b},[x2],x3
+
+ rev64 v2.8b, v2.8b
+ ld2 {v16.8b, v17.8b},[x0],x8
+ shl d4, d16,#32
+ rev64 v3.8b, v3.8b
+ shl d5, d17,#32
+ zip1 v2.8b, v2.8b, v3.8b
+ zip2 v3.8b, v2.8b, v3.8b
+ rev64 v4.8b, v4.8b
+ rev64 v5.8b, v5.8b
+ st1 {v2.8b},[x2],x3
+
+
+ ld2 {v18.8b, v19.8b},[x10],x8
+ shl d6, d18,#32
+
+ zip1 v4.8b, v4.8b, v5.8b
+ zip2 v5.8b, v4.8b, v5.8b
+ shl d7, d19,#32
+ rev64 v6.8b, v6.8b
+ st1 {v4.8b},[x2],x3
+
+ rev64 v7.8b, v7.8b
+ zip1 v6.8b, v6.8b, v7.8b
+ zip2 v7.8b, v6.8b, v7.8b
+ st1 {v6.8b},[x2],x3
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
new file mode 100644
index 0000000..52fc702
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
@@ -0,0 +1,198 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_mode_18_34.s
+//*
+//* @brief
+//* contains function definitions for intra prediction modes 18 and 34.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for modes 18 and 34
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode_18_34(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+// pi1_coeff
+
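+//hedged sketch: modes 18 and 34 are pure 45-degree copies, so each output
+//row is the reference advanced (mode 34) or retreated (mode 18) by one
+//cb/cr pair per row; 'base' is an assumed starting offset into pu1_ref.
+//
+//    word32 step = (mode == 34) ? 2 : -2;  //matches the +/-2 selected into x6
+//    for(word32 row = 0; row < nt; row++)
+//        memcpy(pu1_dst + row * dst_strd,
+//               pu1_ref + base + (row + 1) * step,
+//               2 * nt);                    //one interleaved row per copy
+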
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode_18_34_av8
+
+.type ihevc_intra_pred_chroma_mode_18_34_av8, %function
+
+ihevc_intra_pred_chroma_mode_18_34_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+
+ cmp x4,#4
+ beq mode2_4
+
+ mov x12,x4
+ mov x11,x4
+ add x0,x0,x4,lsl #2
+
+ cmp x5,#0x22
+ mov x10,x2
+
+ add x0,x0,#4
+
+ sub x20,x0,#4
+ csel x0, x20, x0,ne
+ mov x20,#2
+ csel x6, x20, x6,eq
+ mov x20,#-2
+ csel x6, x20, x6,ne
+ mov x8,x0
+
+
+kernel:
+
+
+ ld1 {v0.8b, v1.8b},[x8],x6
+ st1 {v0.8b, v1.8b},[x10],x3
+ ld1 {v2.8b, v3.8b},[x8],x6
+ st1 {v2.8b, v3.8b},[x10],x3
+ ld1 {v4.8b, v5.8b},[x8],x6
+ st1 {v4.8b, v5.8b},[x10],x3
+ ld1 {v6.8b, v7.8b},[x8],x6
+ st1 {v6.8b, v7.8b},[x10],x3
+ ld1 {v8.8b, v9.8b},[x8],x6
+ st1 {v8.8b, v9.8b},[x10],x3
+ ld1 {v10.8b, v11.8b},[x8],x6
+ st1 {v10.8b, v11.8b},[x10],x3
+ ld1 {v12.8b, v13.8b},[x8],x6
+ st1 {v12.8b, v13.8b},[x10],x3
+ ld1 {v14.8b, v15.8b},[x8],x6
+ st1 {v14.8b, v15.8b},[x10],x3
+
+ subs x12,x12,#8
+ bne kernel
+
+ cmp x11,#16
+ add x8,x0,#16
+ add x10,x2,#16
+ sub x11, x11,#16
+ mov x12,#16
+ beq kernel
+ b end_func
+
+mode2_4:
+
+ add x0,x0,#20
+ cmp x5,#0x22
+ sub x20,x0,#4
+ csel x0, x20, x0,ne
+
+ mov x20,#2
+ csel x8, x20, x8,eq
+ mov x20,#-2
+ csel x8, x20, x8,ne
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.2s},[x2],x3
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.2s},[x2],x3
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.2s},[x2],x3
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.2s},[x2],x3
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
new file mode 100644
index 0000000..1df4ad0
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
@@ -0,0 +1,551 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_mode_27_to_33.s
+//*
+//* @brief
+//* contains function definition for intra prediction interpolation filters
+//*
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//* - ihevc_intra_pred_chroma_mode_27_to_33()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
+//* reference neighboring samples location pointed by 'pu1_ref' to the tu
+//* block location pointed by 'pu1_dst'
+//*
+//* @par description:
+//*
+//*
+//* @param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* @param[in] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* integer transform block size
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//.if intra_pred_chroma_27_to_33 == c
+//void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+
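+//A rough scalar sketch (illustrative only, not part of the build) of the
+//per-row computation the neon code below unrolls eight rows at a time.
+//ref_main is an assumed name for the interleaved u/v reference row the
+//code derives from pu1_ref, and intra_pred_ang is the entry loaded from
+//gai4_ihevc_ang_table[mode]:
+//
+//for (word32 row = 0; row < nt; row++) {
+//    word32 pos   = (row + 1) * intra_pred_ang;
+//    word32 idx   = pos >> 5;                  //integer reference step
+//    word32 fract = pos & 31;                  //1/32-pel phase
+//    for (word32 i = 0; i < 2 * nt; i++)       //u and v bytes together
+//        pu1_dst[row * dst_strd + i] =
+//            (uword8)(((32 - fract) * ref_main[2 * idx + i] +
+//                      fract * ref_main[2 * idx + i + 2] + 16) >> 5);
+//}
+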
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_mode_27_to_33_av8
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_chroma_mode_27_to_33_av8, %function
+
+ihevc_intra_pred_chroma_mode_27_to_33_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+ ldr x6, [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+ lsl x7,x4,#2 //four_nt
+
+ add x8,x6,x5,lsl #2 //*gai4_ihevc_ang_table[mode]
+ ldr w9, [x8] //intra_pred_ang = gai4_ihevc_ang_table[mode]
+ sxtw x9,w9
+ adrp x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+ ldr x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+ add x6,x1,#1
+
+ tst x4,#7
+ add x8,x0,x7 //pu1_ref + four_nt
+ mov x14,#0 //row
+ mov x12,x4
+ bne core_loop_4
+ lsl x4,x4,#1
+ b core_loop_8
+
+core_loop_8:
+ add x8,x8,#2 //pu1_ref_main_idx += (four_nt + 1)
+ dup v0.8b,w9 //intra_pred_ang
+    lsr x12, x4, #4             //divide by 8 (x4 holds 2*nt at this point)
+
+ movi v1.8b, #32
+ mul x7, x4, x12
+
+ movi v6.8h, #31
+
+ mov x1,x8
+ mov x5,x4
+ mov x11,#2
+
+prologue:
+ ld1 {v3.8b},[x6] //loads the row value
+ umull v2.8h, v3.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v5.8b, v2.8h,#5 //idx = pos >> 5
+
+ dup v31.8b, v4.8b[0]
+ add x0,x2,x3
+
+ smov x14, v5.2s[0] //(i row)extract idx to the r register
+ lsl x14,x14,#1
+
+ dup v29.8b, v4.8b[1] //(ii)
+ and x9,x14,#0xff //(i row) get the last byte
+
+ add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
+
+ asr x14,x14,#8 //(ii)shift by 8
+ ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ and x9,x14,#0xff //(ii)get the last byte
+
+ asr x14,x14,#8 //(iii)
+ ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ and x9,x14,#0xff //(iii)
+ sub v30.8b, v1.8b , v31.8b //32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr x14,x14,#8 //(iv)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ and x9,x14,#0xff //(iv)
+
+ dup v25.8b, v4.8b[3] //(iv)
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+
+ dup v31.8b, v4.8b[4] //(v)
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ smov x14, v5.2s[1] //extract idx to the r register
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ lsl x14,x14,#1
+
+ st1 {v10.8b},[x2],#8 //(i row)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ and x9,x14,#0xff //(v)
+ dup v29.8b, v4.8b[5] //(vi)
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+
+ asr x14,x14,#8 //(vi)
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and x9,x14,#0xff //(vi)
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+ dup v27.8b, v4.8b[6] //(vii)
+ asr x14,x14,#8 //(vii)
+
+ and x9,x14,#0xff //(vii)
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ asr x14,x14,#8 //(viii)
+ dup v25.8b, v4.8b[7] //(viii)
+ and x9,x14,#0xff //(viii)
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs x7,x7,#8
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x20,x8,#8
+ csel x8, x20, x8,gt
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ sub x20,x4,#8
+ csel x4, x20, x4,gt
+
+ st1 {v10.8b},[x0],x3 //(v)
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ ld1 {v5.8b},[x6] //loads the row value
+ umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+ smov x14, v3.2s[0] //(i)extract idx to the r register
+ lsl x14,x14,#1
+ and x9,x14,#0xff //(i)
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+ asr x14,x14,#8 //(ii)
+ dup v31.8b, v4.8b[0]
+ subs x4,x4,#8
+
+ ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+ and x9,x14,#0xff //(ii)
+ add x20,x6,#8 //increment the row value
+ csel x6, x20, x6,le
+
+ ld1 {v9.8b},[x10] //(i)ref_main_idx_1
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ ld1 {v5.8b},[x6] //loads the row value
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr x14,x14,#8 //(iii)
+
+ dup v29.8b, v4.8b[1] //(ii)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+ and x9,x14,#0xff //(iii)
+
+ st1 {v14.8b},[x0],x3 //(vi)
+ sub v30.8b, v1.8b , v31.8b //(i)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ asr x14,x14,#8 //(iv)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and x9,x14,#0xff //(iv)
+
+ smov x14, v3.2s[1] //extract idx to the r register
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ csel x4, x5, x4,le //reload nt
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v25.8b, v4.8b[3] //(iv)
+ umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+
+ st1 {v22.8b},[x0] //(viii)
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ lsl x14,x14,#1
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x0,x2,x3
+
+ dup v31.8b, v4.8b[4] //(v)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+ and x9,x14,#0xff //(v)
+
+ st1 {v10.8b},[x2],#8 //(i)
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+
+ dup v29.8b, v4.8b[5] //(vi)
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ asr x14,x14,#8 //(vi)
+
+ dup v27.8b, v4.8b[6] //(vii)
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and x9,x14,#0xff //(vi)
+
+ dup v25.8b, v4.8b[7] //(viii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ asr x14,x14,#8 //(vii)
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+ and x9,x14,#0xff //(vii)
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+ asr x14,x14,#8 //(viii)
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and x9,x14,#0xff //(viii)
+
+ smov x14, v3.2s[0] //(i)extract idx to the r register
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+ csel x8, x1, x8,le //reload the source to pu1_src+2nt
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x20,x8,#8 //increment the source next set 8 columns in same row
+ csel x8, x20, x8,gt
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+ lsl x20, x3,#3
+ csel x12,x20,x12,le
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ sub x20,x12,x5
+ csel x12, x20, x12,le
+
+ st1 {v10.8b},[x0],x3 //(v)
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt
+ csel x2, x20, x2,le
+
+ xtn v4.8b, v4.8h
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+ lsl x14,x14,#1
+
+ and x9,x14,#0xff //(i)
+ subs x7,x7,#8
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ st1 {v14.8b},[x0],x3 //(vi)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.8b},[x0],x3 //(viii)
+ b end_loops
+
+core_loop_4:
+ add x10,x8,#2 //pu1_ref_main_idx += (four_nt + 1)
+ add x11,x8,#4 //pu1_ref_main_idx_1 += (four_nt + 2)
+ mov x8,#0
+
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+ add x20,x10,#2 //pu1_ref_main_idx += 2
+ csel x10, x20, x10,gt
+ add x11,x10,#2 //pu1_ref_main_idx_1 += 2
+ dup v0.8b,w5 //dup_const_fract
+ sub x20,x5,#32
+ neg x4, x20
+ dup v1.8b,w4 //dup_const_32_fract
+
+//inner_loop_4
+ ld1 {v2.8b},[x10] //ref_main_idx
+ add x8,x8,#1
+ mov x14,x5 //fract_prev = fract
+
+ ld1 {v3.8b},[x11] //ref_main_idx_1
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+    add x20,x10,#2              //pu1_ref_main_idx += 2
+    csel x10, x20, x10,gt
+    add x11,x10,#2              //pu1_ref_main_idx_1 += 2
+
+ dup v6.8b,w5 //dup_const_fract
+ umull v4.8h, v2.8b, v1.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v7.8b,w4 //dup_const_32_fract
+ umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v8.8b},[x10] //ref_main_idx
+ add x8,x8,#1
+
+ ld1 {v9.8b},[x11] //ref_main_idx_1
+ rshrn v4.8b, v4.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ mov x14,x5 //fract_prev = fract
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+    add x20,x10,#2              //pu1_ref_main_idx += 2
+    csel x10, x20, x10,gt
+    add x11,x10,#2              //pu1_ref_main_idx_1 += 2
+
+ dup v12.8b,w5 //dup_const_fract
+ umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v13.8b,w4 //dup_const_32_fract
+ umlal v10.8h, v9.8b, v6.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v14.8b},[x10] //ref_main_idx
+ add x8,x8,#1
+
+ st1 {v4.8b},[x2],x3
+ rshrn v10.8b, v10.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v15.8b},[x11] //ref_main_idx_1
+ mov x14,x5 //fract_prev = fract
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+    add x20,x10,#2              //pu1_ref_main_idx += 2
+    csel x10, x20, x10,gt
+    add x11,x10,#2              //pu1_ref_main_idx_1 += 2
+
+ dup v18.8b,w5 //dup_const_fract
+ umull v16.8h, v14.8b, v13.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v19.8b,w4 //dup_const_32_fract
+ umlal v16.8h, v15.8b, v12.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v20.8b},[x10] //ref_main_idx
+
+ st1 {v10.8b},[x2],x3
+ rshrn v16.8b, v16.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+ ld1 {v21.8b},[x11] //ref_main_idx_1
+
+ umull v22.8h, v20.8b, v19.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v18.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v16.8b},[x2],x3
+ rshrn v22.8b, v22.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.8b},[x2],x3
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
new file mode 100644
index 0000000..3c8746c
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
@@ -0,0 +1,495 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_mode_3_to_9.s
+//*
+//* @brief
+//* contains function definitions for intra prediction chroma modes 3 to 9.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for modes 3 to 9
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+
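+//A rough scalar sketch (illustrative only, not part of the build) of the
+//sampling rule behind the tbl-based kernels below. ref_left is an assumed
+//name for the left reference walked top-to-bottom; the real code instead
+//indexes pu1_ref downwards from the top-left corner and gathers pairs of
+//bytes with tbl, so its index arithmetic differs in detail:
+//
+//for (word32 col = 0; col < nt; col++) {
+//    word32 pos   = (col + 1) * intra_pred_ang;  //ang > 0 for modes 3..9
+//    word32 idx   = pos >> 5;
+//    word32 fract = pos & 31;
+//    for (word32 row = 0; row < nt; row++)
+//        for (word32 c = 0; c < 2; c++)          //u then v
+//            pu1_dst[row * dst_strd + 2 * col + c] =
+//                (uword8)(((32 - fract) * ref_left[2 * (idx + row) + c] +
+//                          fract * ref_left[2 * (idx + row + 1) + c] +
+//                          16) >> 5);
+//}
+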
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode_3_to_9_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_3_9
+
+.type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function
+
+ihevc_intra_pred_chroma_mode_3_to_9_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x7, :got:gai4_ihevc_ang_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+ adrp x8, :got:gai4_ihevc_inv_ang_table
+ ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+ add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
+ ldr w7, [x7] //intra_pred_ang
+ sxtw x7,w7
+ dup v30.8b,w7 //intra_pred_ang
+
+ adrp x14, :got:col_for_intra_chroma
+ ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
+
+prologue_8_16_32:
+ lsr x10, x4, #3
+ ld1 {v31.8b},[x14],#8
+ mul x10, x4, x10 //block counter (dec by #8)
+
+ lsl x11, x4, #1 //col counter to be inc/dec by #8
+ smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+
+ sub x7, x5, #3
+ adrp x12, :got:idx_neg_idx_chroma_3_9 //load most idx table
+ ldr x12, [x12, #:got_lo12:idx_neg_idx_chroma_3_9]
+
+ add x12, x12, x7, lsl #4
+ mov x8, x12
+
+ mov x7, #8
+ sub x7, x7, x3, lsl #3 //x7 = 8-8x3
+
+ ldr w9, [x8]
+ sxtw x9,w9
+ lsl x9, x9, #1
+ add x1, x0, x4, lsl #2 //pu1_ref + 4*nt
+
+ xtn v6.8b, v22.8h
+ dup v26.8b,w9 //most idx added to final idx values
+ sub x1, x1, #26 //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+
+ sub x6, x1, x9
+
+ ld1 {v0.16b, v1.16b}, [x6] //stores the 32 values reqd based on indices values (from most idx)
+ sshr v22.8h, v22.8h,#5
+
+ movi v29.8b, #31 //contains #31 for vand operation
+
+ movi v28.8b, #32
+
+ sqxtn v8.8b, v22.8h
+ shl v8.8b, v8.8b,#1 // 2 * idx
+
+ and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
+ movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
+
+ mov x0,#0x302 // idx value for v is +1 of u
+ dup v27.4h,w0
+ mov x0,#0
+
+ movi v9.8b, #22 //row 0 to 7
+
+ sub v8.8b, v8.8b , v27.8b //ref_main_idx (sub row)
+ sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0)
+ add v8.8b, v8.8b , v9.8b //to compensate the pu1_src idx incremented by 8
+ sub v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+ sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
+ sub v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1)
+
+ movi v29.8b, #4
+
+ tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+ sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
+ sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
+ sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
+ sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+ sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
+ sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+ tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+ sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
+ sub v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ cmp x4,#4
+ beq end_func
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ umull v22.8h, v16.8b, v7.8b //mul (row 5)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
+ sub v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)
+
+ st1 {v24.8b},[x2], x3 //st (row 4)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 5)
+
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v22.8b},[x2], x3 //st (row 5)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 6)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 7)
+
+ st1 {v20.8b},[x2], x3 //st (row 6)
+
+    subs x10, x10, #4           //decrement block counter and go to end if 8x8
+
+ st1 {v18.8b},[x2], x3 //st (row 7)
+
+ beq end_func
+
+ subs x11, x11, #8 //decrement the processed col
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ add x20, x2, x7
+ csel x2, x20, x2,gt
+ csel x8, x12, x8,le
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+ add x20, x2, #8
+ csel x2, x20, x2,le
+ lsl x20, x4, #1
+ csel x11,x20,x11,le
+ bgt lbl284
+ adrp x14, :got:col_for_intra_chroma
+ ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
+lbl284:
+ add x20, x0, #8
+ csel x0, x20, x0,le
+
+ ld1 {v31.8b},[x14],#8
+ smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ xtn v10.8b, v12.8h
+ sshr v12.8h, v12.8h,#5
+ sqxtn v11.8b, v12.8h
+ shl v11.8b, v11.8b,#1
+ mov x5, #0x302 //idx value for v is +1 of u
+ dup v27.4h,w5 //row value inc or reset accordingly
+ ldr w9, [x8] //loads index value
+ sxtw x9,w9
+ lsl x9, x9, #1
+ mov x5, #22
+ sub x5, x5, x0, lsl #1
+ dup v16.8b,w5
+ dup v26.8b,w9
+
+ mov x5,x2
+ sub v11.8b, v11.8b , v27.8b //ref_main_idx (sub row)
+
+kernel_8_16_32:
+ movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
+ sub v8.8b, v26.8b , v11.8b //ref_main_idx
+ mov v26.8b, v10.8b
+
+ subs x11, x11, #8
+ sub x6, x1, x9
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx - 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ add x20, x0, #8
+ csel x0, x20, x0,le
+ sub v9.8b, v8.8b , v29.8b //ref_main_idx - 2
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+
+ ld1 {v0.16b, v1.16b}, [x6] //stores the 32 values reqd based on indices values (from most idx)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 5)
+
+ bgt lbl326
+ adrp x14, :got:col_for_intra_chroma
+ ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
+lbl326:
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ csel x8, x12, x8,le
+
+ mov x9,#0x302
+ dup v27.4h,w9 //row value inc or reset accordingly
+ sub v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
+
+ sub v5.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 1)
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+    movi v29.8b, #31            //contains #31 for vand operation
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ ld1 {v31.8b},[x14],#8
+ and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
+
+ lsl x20, x4, #1
+ csel x11,x20,x11,le
+    movi v29.8b, #4             //contains #4 for stepping ref_main_idx across rows
+ ldr w9, [x8]
+ sxtw x9,w9
+
+ st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+ sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 2)
+
+ lsl x9, x9, #1
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 3)
+
+ umull v22.8h, v10.8b, v7.8b //mul (row 1)
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+ st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
+
+ sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 4)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ add x5,x2,x3,lsl#2
+ smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ add x9, x9, x0, lsl #1
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+ sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ xtn v10.8b, v14.8h
+ sshr v14.8h, v14.8h,#5
+
+ sub v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
+ tbl v21.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+ sub v9.8b, v9.8b , v29.8b //ref_main_idx - 1 (row 6)
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+ sqxtn v11.8b, v14.8h
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+ dup v26.8b,w9
+
+ sub v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ sub v5.8b, v5.8b , v29.8b //ref_main_idx - 1 (row 7)
+
+ mov x6, #22 //to compensate the 2*row value
+ shl v11.8b, v11.8b,#1
+ sub x6, x6, x0, lsl #1
+
+ umull v22.8h, v21.8b, v7.8b //mul (row 5)
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+ add x2,x2,x3, lsl #2
+ dup v16.8b,w6
+ add x20, x7, x2
+ csel x2, x20, x2,gt
+
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+ sub v11.8b, v11.8b , v27.8b //ref_main_idx (add row)
+ sub x20,x2,#8
+ csel x2, x20, x2,le
+
+    subs x10, x10, #4           //decrement block counter and go to end if 8x8
+
+ bne kernel_8_16_32
+
+epil_8_16_32:
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v24.8b, v22.8h,#5 //round shft (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ st1 {v18.8b},[x5], x3 //st (row 7)
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_planar.s b/common/arm64/ihevc_intra_pred_chroma_planar.s
new file mode 100644
index 0000000..ac6b362
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_planar.s
@@ -0,0 +1,377 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_planar.s
+//*
+//* @brief
+//* contains function definitions for intra prediction planar filtering.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for planar input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode,
+// word32 pi1_coeff)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+// pi1_coeff
+
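+//A rough scalar sketch (illustrative only, not part of the build) of the
+//planar blend the loops below vectorize. The reference offsets match the
+//comments in the code, doubled because u and v are interleaved, and shr
+//is log2(nt) + 1, which the clz/sshl sequence in the code computes:
+//
+//for (word32 row = 0; row < nt; row++)
+//    for (word32 col = 0; col < nt; col++)
+//        for (word32 c = 0; c < 2; c++)          //u then v
+//            pu1_dst[row * dst_strd + 2 * col + c] = (uword8)(
+//                ((row + 1)      * pu1_ref[2 * (nt - 1) + c]           +
+//                 (nt - 1 - row) * pu1_ref[2 * (2 * nt + 1 + col) + c] +
+//                 (col + 1)      * pu1_ref[2 * (3 * nt + 1) + c]       +
+//                 (nt - 1 - col) * pu1_ref[2 * (2 * nt - 1 - row) + c] +
+//                 nt) >> shr);
+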
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_planar_av8
+.extern gau1_ihevc_planar_factor
+
+
+.type ihevc_intra_pred_chroma_planar_av8, %function
+
+ihevc_intra_pred_chroma_planar_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
+ ldr x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
+
+ clz w5,w4
+ sub x20, x5, #32
+ neg x5, x20
+ dup v14.8h,w5
+ neg v14.8h, v14.8h //shr value (so vneg)
+ dup v2.8b,w4 //nt
+ dup v16.8h,w4 //nt
+
+ sub x6, x4, #1 //nt-1
+ add x6, x0,x6,lsl #1 //2*(nt-1)
+ ldr w7, [x6]
+ sxtw x7,w7
+ dup v0.4h,w7 //src[nt-1]
+
+ add x6, x4, x4,lsl #1 //3nt
+ add x6, x6, #1 //3nt + 1
+ lsl x6,x6,#1 //2*(3nt + 1)
+
+ add x6, x6, x0
+ ldr w7, [x6]
+ sxtw x7,w7
+ dup v1.4h,w7 //src[3nt+1]
+
+
+ add x6, x4, x4 //2nt
+ add x14, x6, #1 //2nt+1
+ lsl x14,x14,#1 //2*(2nt+1)
+ sub x6, x6, #1 //2nt-1
+ lsl x6,x6,#1 //2*(2nt-1)
+ add x6, x6, x0 //&src[2nt-1]
+ add x14, x14, x0 //&src[2nt+1]
+
+ mov x8, #1 //row+1 (row is first 0)
+ sub x9, x4, x8 //nt-1-row (row is first 0)
+
+ dup v5.8b,w8 //row + 1
+ dup v6.8b,w9 //nt - 1 - row
+ mov v7.8b, v5.8b //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+ add x12, x11, #1 //coeffs (to be reloaded after every row)
+ mov x1, x4 //nt (row counter) (dec after every row)
+ mov x5, x2 //dst (to be reloaded after every row and inc by dst_strd)
+ mov x10, #8 //increment for the coeffs
+ mov x0, x14 //&src[2nt+1] (to be reloaded after every row)
+
+ cmp x4, #4
+ beq tf_sz_4
+
+
+
+ mov x10,x6
+tf_sz_8_16:
+ ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
+ ld1 {v8.8b},[x12],#8
+ mov v9.8b, v8.8b
+ zip1 v29.8b, v8.8b, v9.8b
+ zip2 v9.8b, v8.8b, v9.8b
+ mov v8.d[0], v29.d[0]
+ sub v30.8b, v2.8b , v8.8b //[nt-1-col]
+ sub v31.8b, v2.8b , v9.8b
+
+
+
+
+loop_sz_8_16:
+
+ ldr w7, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
+ sxtw x7,w7
+ umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
+ ldr w11, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
+ sxtw x11,w11
+ umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
+ dup v4.4h,w7 //src[2nt-1-row]
+ umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ dup v3.4h,w11 //src[2nt-1-row]
+ umlal v12.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
+
+
+
+ umull v28.8h, v5.8b, v0.8b
+ ldr w7, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
+ sxtw x7,w7
+ umlal v28.8h, v6.8b, v11.8b
+ add v18.8b, v5.8b , v7.8b //row++ [(row+1)++]c
+
+
+ umlal v28.8h, v31.8b, v4.8b
+ sub v19.8b, v6.8b , v7.8b //[nt-1-row]--
+ umlal v28.8h, v9.8b, v1.8b
+ dup v4.4h,w7 //src[2nt-1-row]
+
+ umull v26.8h, v18.8b, v0.8b //(row+1) * src[nt-1]
+ add v12.8h, v12.8h , v16.8h //add (nt)
+ umlal v26.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
+ sshl v12.8h, v12.8h, v14.8h //shr
+ umlal v26.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ add v28.8h, v28.8h , v16.8h
+ umlal v26.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row]
+ sshl v28.8h, v28.8h, v14.8h
+
+
+
+
+
+ umull v24.8h, v18.8b, v0.8b
+ add v5.8b, v18.8b , v7.8b //row++ [(row+1)++]
+ umlal v24.8h, v19.8b, v11.8b
+ sub v6.8b, v19.8b , v7.8b //[nt-1-row]--
+ umlal v24.8h, v9.8b, v1.8b
+ xtn v12.8b, v12.8h
+ umlal v24.8h, v31.8b, v3.8b
+ xtn v13.8b, v28.8h
+
+
+
+
+ add v26.8h, v26.8h , v16.8h //add (nt)
+ umull v22.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
+ sshl v26.8h, v26.8h, v14.8h //shr
+ umlal v22.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
+ st1 {v12.2s, v13.2s}, [x2], x3
+ umlal v22.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ add v24.8h, v24.8h , v16.8h
+ umlal v22.8h, v30.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
+ sshl v24.8h, v24.8h, v14.8h
+
+ umull v20.8h, v5.8b, v0.8b
+ add v18.8b, v5.8b , v7.8b //row++ [(row+1)++]c
+ umlal v20.8h, v6.8b, v11.8b
+ sub v19.8b, v6.8b , v7.8b //[nt-1-row]--
+ umlal v20.8h, v31.8b, v4.8b
+
+ ldr w11, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
+ sxtw x11,w11
+ umlal v20.8h, v9.8b, v1.8b
+ dup v3.4h,w11 //src[2nt-1-row]
+ add v22.8h, v22.8h , v16.8h //add (nt)
+
+ umull v12.8h, v18.8b, v0.8b //(row+1) * src[nt-1]
+ xtn v26.8b, v26.8h
+ umlal v12.8h, v19.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
+ xtn v27.8b, v24.8h
+
+ umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ sshl v22.8h, v22.8h, v14.8h //shr
+
+ umlal v12.8h, v30.8b, v3.8b //(nt-1-col) * src[2nt-1-row]
+ add v20.8h, v20.8h , v16.8h
+
+ umull v28.8h, v18.8b, v0.8b
+ st1 {v26.2s, v27.2s}, [x2], x3
+
+ umlal v28.8h, v19.8b, v11.8b
+ add v5.8b, v18.8b , v7.8b //row++ [(row+1)++]
+
+ sub v6.8b, v19.8b , v7.8b //[nt-1-row]--
+ umlal v28.8h, v9.8b, v1.8b
+
+ umlal v28.8h, v31.8b, v3.8b
+ sshl v20.8h, v20.8h, v14.8h
+
+
+ add v12.8h, v12.8h , v16.8h //add (nt)
+ xtn v22.8b, v22.8h
+
+
+ add v28.8h, v28.8h , v16.8h
+ xtn v23.8b, v20.8h
+
+
+ sshl v12.8h, v12.8h, v14.8h //shr
+ st1 {v22.2s, v23.2s}, [x2], x3
+ sshl v28.8h, v28.8h, v14.8h
+
+
+
+
+
+ xtn v20.8b, v12.8h
+ xtn v21.8b, v28.8h
+
+ st1 {v20.2s, v21.2s}, [x2], x3
+
+
+ subs x1, x1, #4
+
+ bne loop_sz_8_16
+
+
+
+
+ cmp x4,#16
+
+ bne end_loop
+
+
+ sub x4, x4,#16
+ dup v5.8b,w8 //row + 1
+ dup v6.8b,w9 //nt - 1 - row
+ mov v7.8b, v5.8b //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+ mov x6,x10
+ mov x1,#16
+ sub x2,x2,x3,lsl #4
+ add x2,x2,#16
+
+ ld1 {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
+ ld1 {v8.8b},[x12],#8
+ mov v9.8b, v8.8b
+ zip1 v29.8b, v8.8b, v9.8b
+ zip2 v9.8b, v8.8b, v9.8b
+ mov v8.d[0], v29.d[0]
+ sub v30.8b, v2.8b , v8.8b //[nt-1-col]
+ sub v31.8b, v2.8b , v9.8b
+
+ beq loop_sz_8_16
+
+
+
+tf_sz_4:
+ ld1 {v10.8b},[x14] //load src[2nt+1+col]
+ ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1]
+ mov v9.8b, v8.8b
+ zip1 v29.8b, v8.8b, v9.8b
+ zip2 v9.8b, v8.8b, v9.8b
+ mov v8.d[0], v29.d[0]
+loop_sz_4:
+ //mov x10, #4 @reduce inc to #4 for 4x4
+ ldr w7, [x6], #-2 //src[2nt-1-row] (dec to take into account row)
+ sxtw x7,w7
+ dup v4.4h,w7 //src[2nt-1-row]
+
+ sub v9.8b, v2.8b , v8.8b //[nt-1-col]
+
+ umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
+ umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
+ umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
+// vadd.i16 q6, q6, q8 @add (nt)
+// vshl.s16 q6, q6, q7 @shr
+// vmovn.i16 d12, q6
+ rshrn v12.8b, v12.8h,#3
+
+ st1 {v12.2s},[x2], x3
+
+ add v5.8b, v5.8b , v7.8b //row++ [(row+1)++]
+ sub v6.8b, v6.8b , v7.8b //[nt-1-row]--
+ subs x1, x1, #1
+
+ bne loop_sz_4
+
+end_loop:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_chroma_ver.s b/common/arm64/ihevc_intra_pred_chroma_ver.s
new file mode 100644
index 0000000..8d1daf7
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_chroma_ver.s
@@ -0,0 +1,232 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_chroma_ver.s
+//*
+//* @brief
+//* contains function definitions for intra prediction vertical filtering.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for vertical input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+
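+//In scalar terms (illustrative only, not part of the build) every output
+//row is a copy of the same interleaved reference row, starting at byte
+//offset 2 * (2 * nt + 1) into pu1_ref, which is what the load-once,
+//store-many loops below implement:
+//
+//for (word32 row = 0; row < nt; row++)
+//    for (word32 i = 0; i < 2 * nt; i++)
+//        pu1_dst[row * dst_strd + i] = pu1_ref[2 * (2 * nt + 1) + i];
+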
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_ver_av8
+
+.type ihevc_intra_pred_chroma_ver_av8, %function
+
+ihevc_intra_pred_chroma_ver_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ lsl x5, x4, #2 //4nt
+
+
+ cmp x4, #8
+ beq blk_8
+ blt blk_4
+
+copy_16:
+ add x5, x5, #2 //2nt+2
+ add x6, x0, x5 //&src[2nt+1]
+
+ add x5, x2, x3 //pu1_dst + dst_strd
+ ld2 {v20.8b, v21.8b}, [x6],#16 //16 loads (col 0:15)
+ add x8, x5, x3
+
+ add x10, x8, x3
+ ld2 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31)
+ lsl x11, x3, #2
+
+ add x11, x11, #-16
+
+
+ st2 {v20.8b, v21.8b}, [x2],#16
+ st2 {v20.8b, v21.8b}, [x5],#16
+ st2 {v20.8b, v21.8b}, [x8],#16
+ st2 {v20.8b, v21.8b}, [x10],#16
+
+ st2 {v22.8b, v23.8b}, [x2], x11
+ st2 {v22.8b, v23.8b}, [x5], x11
+ st2 {v22.8b, v23.8b}, [x8], x11
+ st2 {v22.8b, v23.8b}, [x10], x11
+
+ subs x4, x4, #4
+
+kernel_copy_16:
+ st2 {v20.8b, v21.8b}, [x2],#16
+ st2 {v20.8b, v21.8b}, [x5],#16
+ st2 {v20.8b, v21.8b}, [x8],#16
+ st2 {v20.8b, v21.8b}, [x10],#16
+
+ st2 {v22.8b, v23.8b}, [x2], x11
+ st2 {v22.8b, v23.8b}, [x5], x11
+ st2 {v22.8b, v23.8b}, [x8], x11
+ st2 {v22.8b, v23.8b}, [x10], x11
+
+ subs x4, x4, #4
+
+
+ st2 {v20.8b, v21.8b}, [x2],#16
+ st2 {v20.8b, v21.8b}, [x5],#16
+ st2 {v20.8b, v21.8b}, [x8],#16
+ st2 {v20.8b, v21.8b}, [x10],#16
+
+ st2 {v22.8b, v23.8b}, [x2], x11
+ st2 {v22.8b, v23.8b}, [x5], x11
+ st2 {v22.8b, v23.8b}, [x8], x11
+ st2 {v22.8b, v23.8b}, [x10], x11
+
+ subs x4, x4, #4
+
+ st2 {v20.8b, v21.8b}, [x2],#16
+ st2 {v20.8b, v21.8b}, [x5],#16
+ st2 {v20.8b, v21.8b}, [x8],#16
+ st2 {v20.8b, v21.8b}, [x10],#16
+
+ st2 {v22.8b, v23.8b}, [x2], x11
+ st2 {v22.8b, v23.8b}, [x5], x11
+ st2 {v22.8b, v23.8b}, [x8], x11
+ st2 {v22.8b, v23.8b}, [x10], x11
+
+ subs x4, x4, #4
+ bne kernel_copy_16
+
+ b end_func
+
+blk_8:
+
+ add x5, x5, #2 //2nt+2
+ add x6, x0, x5 //&src[2nt+1]
+
+ add x5, x2, x3 //pu1_dst + dst_strd
+ ld2 {v20.8b, v21.8b}, [x6],#16 //16 loads (col 0:15)
+ add x8, x5, x3
+
+ add x10, x8, x3
+ ld2 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31)
+
+ lsl x11,x3,#2
+
+ st2 {v20.8b, v21.8b}, [x2],x11
+ st2 {v20.8b, v21.8b}, [x5],x11
+ st2 {v20.8b, v21.8b}, [x8],x11
+ st2 {v20.8b, v21.8b}, [x10],x11
+
+ st2 {v20.8b, v21.8b}, [x2]
+ st2 {v20.8b, v21.8b}, [x5]
+ st2 {v20.8b, v21.8b}, [x8]
+ st2 {v20.8b, v21.8b}, [x10]
+
+ subs x4, x4, #8
+ beq end_func
+
+blk_4:
+
+ //lsl x5, x4, #2 @4nt
+ add x5, x5, #2 //2nt+2
+ add x6, x0, x5 //&src[2nt+1]
+
+ ld1 {v0.8b},[x6]
+ add x5, x2, x3 //pu1_dst + dst_strd
+
+ st1 {v0.8b},[x2]
+ add x8, x5, x3
+ st1 {v0.8b},[x5]
+ add x10, x8, x3
+ st1 {v0.8b},[x8]
+ st1 {v0.8b},[x10]
+
+
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
new file mode 100644
index 0000000..e9f83ff
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@@ -0,0 +1,623 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+//*
+//* @brief
+//* contains function definitions for intra prediction chroma mode 11 to 17
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intraprediction filter for modes 11 to 17
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+
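+//A rough scalar sketch (illustrative only, not part of the build) of the
+//reference extension this function performs before filtering: when the
+//projected index (nt * intra_pred_ang) >> 5 reaches below the start of
+//the prepared array, the missing entries are filled from the other
+//reference edge using the inverse angle. ref_main and pu1_top are assumed
+//names (the code uses ref_temp + 2 * nt and pu1_ref + 4 * nt), and
+//inv_ang is gai4_ihevc_inv_ang_table[mode - 11]:
+//
+//word32 inv_ang_sum = 128;                       //Q8 accumulator
+//for (word32 i = -1; i >= (nt * intra_pred_ang) >> 5; i--) {
+//    inv_ang_sum += inv_ang;
+//    ref_main[2 * i]     = pu1_top[2 * (inv_ang_sum >> 8)];     //u
+//    ref_main[2 * i + 1] = pu1_top[2 * (inv_ang_sum >> 8) + 1]; //v
+//}
+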
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_chroma_mode_11_to_17_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_chroma
+.extern idx_neg_idx_chroma_11_17
+
+.type ihevc_intra_pred_chroma_mode_11_to_17_av8, %function
+
+ihevc_intra_pred_chroma_mode_11_to_17_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x7, :got:gai4_ihevc_ang_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+ adrp x8, :got:gai4_ihevc_inv_ang_table
+ ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+ add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
+ add x8, x8, x5, lsl #2 //gai4_ihevc_inv_ang_table[mode - 11]
+ sub x8, x8, #44
+
+ ldr w7, [x7] //intra_pred_ang
+ sxtw x7,w7
+ sub sp, sp, #132 //ref_temp[2 * max_cu_size + 2]
+
+ ldr w8, [x8] //inv_ang
+ sxtw x8,w8
+ add x6, sp, x4, lsl #1 //ref_temp + 2 * nt
+
+ mul x9, x4, x7 //nt*intra_pred_ang
+
+ sub x6, x6, #2 //ref_temp + 2*nt - 2
+
+ add x1, x0, x4, lsl #2 //x1 = &src[4nt]
+ dup v30.8b,w7 //intra_pred_ang
+
+ mov x7, x4
+
+ sub x1,x1,#6 //address calculation for copying 4 halfwords
+
+ asr x9, x9, #5
+
+ ld1 {v0.8b},[x1]
+ rev64 v0.4h, v0.4h
+ st1 {v0.8b},[x6],#8
+
+ sub x1, x1,#8
+
+ subs x7, x7, #4
+ add x20, x1,#8
+ csel x1, x20, x1,eq
+ beq end_loop_copy
+ subs x7,x7,#4
+ beq loop_copy_8
+ subs x7,x7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ sub x1, x1,#24
+ ld1 {v0.16b, v1.16b},[x1]
+
+ sub x1, x1,#24
+ ld1 {v0.16b, v1.16b},[x1],#32
+
+ rev64 v6.4h, v6.4h
+ rev64 v5.4h, v5.4h
+ rev64 v4.4h, v4.4h
+ rev64 v3.4h, v3.4h
+ rev64 v2.4h, v2.4h
+ rev64 v1.4h, v1.4h
+ rev64 v0.4h, v0.4h
+
+ st1 {v6.8b},[x6],#8
+ st1 {v5.8b},[x6],#8
+ st1 {v4.8b},[x6],#8
+ st1 {v3.8b},[x6],#8
+ st1 {v2.8b},[x6],#8
+ st1 {v1.8b},[x6],#8
+ st1 {v0.8b},[x6],#8
+
+ ld1 {v4.8b, v5.8b, v6.8b},[x1],#24
+ b end_loop_copy
+
+loop_copy_16:
+ sub x1, x1,#16
+ ld1 {v0.8b, v1.8b, v2.8b},[x1]
+
+ rev64 v2.4h, v2.4h
+ rev64 v1.4h, v1.4h
+ rev64 v0.4h, v0.4h
+
+ st1 {v2.8b},[x6],#8
+ st1 {v1.8b},[x6],#8
+ st1 {v0.8b},[x6],#8
+
+ b end_loop_copy
+loop_copy_8:
+ ld1 {v0.8b},[x1]
+ rev64 v0.4h, v0.4h
+ st1 {v0.8b},[x6],#8
+end_loop_copy:
+ sub x1, x1,#2
+
+ ldrh w11, [x1], #-2
+ sxtw x11,w11
+ strh w11, [x6], #2
+ sxtw x11,w11
+
+ cmp x9, #-1
+ bge prologue_8_16_32
+
+ add x6, sp, x4, lsl #1 //ref_temp + 2 * nt
+ sub x6, x6, #4 //ref_temp + 2 * nt - 2 - 2
+
+ mov x12, #-1
+
+ sub x20, x9, x12 //count to take care off ref_idx
+ neg x9, x20
+
+ add x1, x0, x4, lsl #2 //x1 = &src[4nt]
+
+ mov x7, #128 //inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add x7, x7, x8 //inv_ang_sum += inv_ang
+
+ lsr x0, x7, #8
+ lsl x0, x0, #1
+
+ ldrh w11, [x1, x0]
+ sxtw x11,w11
+ strh w11, [x6], #-2
+ sxtw x11,w11
+
+ subs x9, x9, #1
+
+ bne loop_copy_ref_idx
+
+prologue_8_16_32:
+
+ adrp x14, :got:col_for_intra_chroma
+ ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
+
+ lsr x10, x4, #3
+ ld1 {v31.8b},[x14],#8
+ mul x10, x4, x10 //block counter (dec by #8)
+
+ lsl x11, x4, #1 //col counter to be inc/dec by #8
+ smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+
+ sub x7, x5, #11
+
+ adrp x12, :got:idx_neg_idx_chroma_11_17 //load least idx table
+ ldr x12, [x12, #:got_lo12:idx_neg_idx_chroma_11_17]
+
+ add x12, x12, x7, lsl #4
+ mov x8, x12
+
+ mov x7, #8
+ sub x7, x7, x3, lsl #3 //x7 = 8-8x3
+
+ ldr w9, [x8]
+ sxtw x9,w9
+ lsl x9, x9, #1
+ add x1, sp, x4, lsl #1 //ref_temp + 2nt
+
+ xtn v6.8b, v22.8h
+ dup v26.8b,w9 //least idx added to final idx values
+ sub x1, x1, #2 //ref_temp + 2nt - 2
+
+ add x6, x1, x9
+
+ ld1 {v0.16b, v1.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+ sshr v22.8h, v22.8h,#5
+
+// mov x0, #31
+ movi v29.8b, #31 //contains #31 for vand operation
+
+// mov x0, #32
+ movi v28.8b, #32
+
+ sqxtn v8.8b, v22.8h
+ shl v8.8b, v8.8b,#1 // 2 * idx
+
+ and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
+
+// mov x0, #2
+ movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
+
+ mov x0,#0x100 // idx value for v is +1 of u
+ dup v27.4h,w0
+ add v27.8b, v27.8b , v29.8b
+ mov x0,#0
+
+ add v8.8b, v8.8b , v27.8b //ref_main_idx (add row)
+ sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0)
+ add v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+ add v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
+ add v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1)
+
+// mov x0, #4 @ 2 *(row * 2 )
+ movi v29.8b, #4
+
+ tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+ add v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
+ add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
+ add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
+ add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+ add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
+ add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+ tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+ add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
+ add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ cmp x4,#4
+ beq end_func
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ umull v22.8h, v16.8b, v7.8b //mul (row 5)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
+ add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)
+
+ st1 {v24.8b},[x2], x3 //st (row 4)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 5)
+
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v22.8b},[x2], x3 //st (row 5)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 6)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 7)
+
+ st1 {v20.8b},[x2], x3 //st (row 6)
+
+    subs x10, x10, #4           //decrement block counter and go to end if 8x8
+
+ st1 {v18.8b},[x2], x3 //st (row 7)
+
+ beq end_func
+
+ subs x11, x11, #8
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ add x20, x2, x7
+ csel x2, x20, x2,gt
+ csel x8, x12, x8,le
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+ add x20, x2, #8
+ csel x2, x20, x2,le
+ lsl x20, x4, #1
+ csel x11,x20,x11,le
+ bgt lbl400
+ adrp x14, :got:col_for_intra_chroma
+ ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
+lbl400:
+ add x20, x0, #8
+ csel x0, x20, x0,le
+
+ ld1 {v31.8b},[x14],#8
+ smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ xtn v10.8b, v12.8h
+ sshr v12.8h, v12.8h,#5
+ sqxtn v11.8b, v12.8h
+ shl v11.8b, v11.8b,#1
+ orr x5,x0,x0, lsl#8
+ add x5, x5,#0x002
+ add x5, x5,#0x300
+ dup v27.4h,w5 //row value inc or reset accordingly
+ ldr w9, [x8]
+ sxtw x9,w9
+ lsl x9, x9, #1
+ add x9, x9, x0, lsl #1
+// sub x9, x9, #1
+ dup v26.8b,w9
+ add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ mov x5,x2
+
+// sub x4,x4,#8
+
+kernel_8_16_32:
+ movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1
+
+ sub v8.8b, v8.8b , v26.8b //ref_main_idx
+ mov v26.8b, v10.8b
+
+ subs x11, x11, #8
+ add x6, x1, x9
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+ add v9.8b, v29.8b , v8.8b //ref_main_idx + 1
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ add x20, x0, #8
+ csel x0, x20, x0,le
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ ld1 {v0.16b, v1.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v24.8b, v22.8h,#5 //round shft (row 5)
+
+ csel x8, x12, x8,le
+ orr x9,x0,x0, lsl#8
+ lsl x9, x9, #1
+ add x9, x9,#0x002
+ add x9, x9,#0x300
+ dup v27.4h,w9 //row value inc or reset accordingly
+
+ bgt lbl452
+ adrp x14, :got:col_for_intra_chroma
+ ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
+lbl452:
+
+ add v4.8b, v29.8b , v8.8b //ref_main_idx (row 1)
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
+ add v5.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 1)
+
+    movi v29.8b, #31            //contains #31 for vand to get the fract values
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ ld1 {v31.8b},[x14],#8
+ and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
+
+    movi v29.8b, #4             //contains #4 to step ref_main_idx two rows ahead
+
+ st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ add v8.8b, v29.8b , v8.8b //ref_main_idx (row 2)
+ tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
+ add v9.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 2)
+
+ lsl x20, x4, #1
+ csel x11,x20,x11,le
+ ldr w9, [x8]
+ sxtw x9,w9
+ lsl x9, x9, #1
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
+ add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)
+
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+ st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
+
+ add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
+ add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ add x5,x2,x3,lsl#2
+ add x9, x9, x0, lsl #1
+
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
+ tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
+ add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ xtn v10.8b, v14.8h
+ sshr v14.8h, v14.8h,#5
+
+ add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
+ tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
+ add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+// sub x9, x9, #1
+ sqxtn v11.8b, v14.8h
+
+ add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
+ tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
+ add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)
+
+ shl v11.8b, v11.8b,#1
+
+ umull v22.8h, v16.8b, v7.8b //mul (row 5)
+ tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ dup v26.8b,w9
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+
+ add x2, x2, x3, lsl #2
+ add x20, x7, x2
+ csel x2, x20, x2,gt
+ sub x20, x2, x4, lsl #1
+ csel x2, x20, x2,le
+ add x20,x2,#8
+ csel x2, x20, x2,le
+
+    subs x10, x10, #4           //subtract 4 and go to end if 8x8
+
+ bne kernel_8_16_32
+epil_8_16_32:
+
+ tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v24.8b, v22.8h,#5 //round shft (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ st1 {v18.8b},[x5], x3 //st (row 7)
+
+end_func:
+ add sp, sp, #132
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
new file mode 100644
index 0000000..3af2da7
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@@ -0,0 +1,575 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+//*
+//* @brief
+//* contains function definitions for intra prediction chroma modes 19 to 25.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* naveen sr
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* chroma intra prediction interpolation filter for modes 19 to 25
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_chroma_mode_19_to_25(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+
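+//**************rough reference in c*******************************************
+// a hedged c sketch (illustrative only, not built) of what the assembly
+// below computes, assuming the usual hm-style angular formulation. ref_main
+// points into ref_temp after the copy/projection step and holds interleaved
+// cb/cr samples, so reference indices advance in steps of 2:
+//
+// for(row = 0; row < nt; row++)
+// {
+//     pos   = (row + 1) * intra_pred_ang;
+//     pos   = (row + 1) * intra_pred_ang;
+//     idx   = pos >> 5;                        // integer reference step
+//     fract = pos & 31;                        // 1/32-pel fractional part
+//
+//     for(col = 0; col < 2 * nt; col += 2)     // cb,cr pairs
+//     {
+//         ref_main_idx = 2 * idx + col + 2;
+//         pu1_dst[row * dst_strd + col] =
+//             (uword8)(((32 - fract) * ref_main[ref_main_idx]
+//                      + fract * ref_main[ref_main_idx + 2] + 16) >> 5);
+//         pu1_dst[row * dst_strd + col + 1] =
+//             (uword8)(((32 - fract) * ref_main[ref_main_idx + 1]
+//                      + fract * ref_main[ref_main_idx + 3] + 16) >> 5);
+//     }
+// }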
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_chroma_mode_19_to_25_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_chroma_mode_19_to_25_av8, %function
+
+ihevc_intra_pred_chroma_mode_19_to_25_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x7, :got:gai4_ihevc_ang_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+ adrp x8, :got:gai4_ihevc_inv_ang_table
+ ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+ add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
+ add x8, x8, x5, lsl #2 //gai4_ihevc_inv_ang_table
+ sub x8, x8, #48 //gai4_ihevc_inv_ang_table[mode - 12]
+
+ ldr w7, [x7] //intra_pred_ang
+ sxtw x7,w7
+ sub sp, sp, #132 //ref_temp[2 * max_cu_size + 2]
+
+ ldr w8, [x8] //inv_ang
+ sxtw x8,w8
+    add x6, sp, x4, lsl #1      //ref_temp + 2 * nt
+
+ mul x9, x4, x7 //nt*intra_pred_ang
+
+ sub x6, x6, #2 //ref_temp + 2*nt - 2
+
+ add x1, x0, x4, lsl #2 //x1 = &src[4nt]
+ dup v30.8b,w7 //intra_pred_ang
+
+ mov x7, x4
+
+ asr x9, x9, #5
+
+ ld1 {v0.2s},[x1],#8 // pu1_ref[two_nt + k]
+
+ st1 {v0.2s},[x6],#8 //ref_temp[k + nt - 1] = pu1_ref[two_nt + k]//
+
+ subs x7, x7, #4
+ beq end_loop_copy
+ subs x7,x7,#4
+ beq loop_copy_8
+ subs x7,x7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ ld1 {v0.8b, v1.8b, v2.8b, v3.8b},[x1],#32
+ ld1 {v4.8b, v5.8b, v6.8b},[x1],#24
+
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b},[x6],#32
+
+
+ st1 {v4.8b, v5.8b, v6.8b},[x6],#24
+ b end_loop_copy
+
+loop_copy_16:
+ ld1 {v0.8b, v1.8b, v2.8b},[x1],#24
+ st1 {v0.8b, v1.8b, v2.8b},[x6],#24
+
+ b end_loop_copy
+
+loop_copy_8:
+ ld1 {v0.8b},[x1],#8
+ st1 {v0.8b},[x6],#8
+
+end_loop_copy:
+
+ ldrh w11, [x1]
+ sxtw x11,w11
+ strh w11, [x6]
+ sxtw x11,w11
+
+ cmp x9, #-1
+ bge linear_filtering
+
+    add x6, sp, x4, lsl #1      //ref_temp + 2 * nt
+ sub x6, x6, #4 //ref_temp + 2 * nt - 2 - 2
+
+ mov x12, #-1
+
+    sub x20, x9, x12            //count to take care of ref_idx
+ neg x9, x20
+
+ add x1, x0, x4, lsl #2 //x1 = &src[2nt]
+
+ mov x7, #128 //inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add x7, x7, x8 //inv_ang_sum += inv_ang
+ lsr x0, x7, #8
+ lsl x0, x0, #1
+ neg x20,x0
+ ldrh w11, [x1, x20]
+ sxtw x11,w11
+ strh w11, [x6], #-2
+ sxtw x11,w11
+
+ subs x9, x9, #1
+
+ bne loop_copy_ref_idx
+
+
+linear_filtering:
+// after copy
+// below code is taken from mode 27 to 33 and modified
+
+
+ adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+ ldr x6, [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+ lsl x7,x4,#2 //four_nt
+
+ add x8,x6,x5,lsl #2 //*gai4_ihevc_ang_table[mode]
+ ldr w9, [x8] //intra_pred_ang = gai4_ihevc_ang_table[mode]
+ sxtw x9,w9
+ adrp x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+ ldr x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+
+ add x6,x1,#1
+
+ add x8, sp, x4, lsl #1 //ref_temp + 2 * nt
+ sub x8, x8,#2 //ref_temp + 2*nt -2
+
+ mov x14,#0 //row
+ mov x12,x4
+ lsl x4,x4,#1
+
+core_loop_8:
+ add x8,x8,#2 //pu1_ref_main_idx += (four_nt + 1)
+ dup v0.8b,w9 //intra_pred_ang
+ lsr x12, x4, #4 //divide by 8
+
+ movi v1.8b, #32
+ mul x7, x4, x12
+
+ movi v6.8h, #31
+
+
+ mov x1,x8
+
+ mov x5,x4
+ mov x11,#2
+
+prologue:
+ ld1 {v3.8b},[x6] //loads the row value
+ smull v2.8h, v3.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v5.8b, v2.8h,#5 //idx = pos >> 5
+ shl v5.8b, v5.8b,#1
+
+ dup v31.8b, v4.8b[0]
+ add x0,x2,x3
+
+ smov x14, v5.2s[0] //(i row)extract idx to the r register
+// lsl x14,x14,#1
+
+ dup v29.8b, v4.8b[1] //(ii)
+ sbfx x9,x14,#0,#8
+
+ add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ sbfx x9,x14,#8,#8
+
+ ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ sbfx x9,x14,#16,#8
+ sub v30.8b, v1.8b , v31.8b //32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ sbfx x9,x14,#24,#8
+
+ dup v25.8b, v4.8b[3] //(iv)
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+
+ dup v31.8b, v4.8b[4] //(v)
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ smov x14, v5.2s[1] //extract idx to the r register
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+// lsl x14,x14,#1
+
+ st1 {v10.8b},[x2],#8 //(i row)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx x9,x14,#0,#8
+ dup v29.8b, v4.8b[5] //(vi)
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ sbfx x9,x14,#8,#8
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+ dup v27.8b, v4.8b[6] //(vii)
+
+ sbfx x9,x14,#16,#8
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v25.8b, v4.8b[7] //(viii)
+ sbfx x9,x14,#24,#8
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs x7,x7,#8
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ cmp x4,#8 // go to end if 4x4
+ beq end_loops
+
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x20,x8,#8
+ csel x8, x20, x8,gt
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ sub x20,x4,#8
+ csel x4, x20, x4,gt
+
+ st1 {v10.8b},[x0],x3 //(v)
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ ld1 {v5.8b},[x6] //loads the row value
+ smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+ shl v3.8b, v3.8b,#1
+ smov x14, v3.2s[0] //(i)extract idx to the r register
+// lsl x14,x14,#1
+ sbfx x9,x14,#0,#8
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
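+// the kernel below is software-pipelined across iterations: stages
+// (vi)-(viii) of the previous set of 8 rows complete while stages (i)-(v)
+// of the current set start, as the stage tags in the comments indicate.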
+kernel_8_rows:
+ dup v31.8b, v4.8b[0]
+ subs x4,x4,#8
+ sbfx x9,x14,#8,#8
+
+ ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+
+ add x20,x6,#8 //increment the row value
+ csel x6, x20, x6,le
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ ld1 {v9.8b},[x10] //(i)ref_main_idx_1
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v5.8b},[x6] //loads the row value
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ dup v29.8b, v4.8b[1] //(ii)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx x9,x14,#16,#8
+
+ st1 {v14.8b},[x0],x3 //(vi)
+ sub v30.8b, v1.8b , v31.8b //(i)32-fract(dup_const_32_fract)
+
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx x9,x14,#24,#8
+ csel x4, x5, x4,le //reload nt
+
+ smov x14, v3.2s[1] //extract idx to the r register
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v25.8b, v4.8b[3] //(iv)
+ smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+
+ st1 {v22.8b},[x0] //(viii)
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+// lsl x14,x14,#1
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx x9,x14,#0,#8
+ add x0,x2,x3
+
+ dup v31.8b, v4.8b[4] //(v)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+ sbfx x9,x14,#8,#8
+
+ st1 {v10.8b},[x2],#8 //(i)
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+
+ dup v29.8b, v4.8b[5] //(vi)
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ dup v27.8b, v4.8b[6] //(vii)
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+ sbfx x9,x14,#16,#8
+
+ dup v25.8b, v4.8b[7] //(viii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+ sbfx x9,x14,#24,#8
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+
+ shl v3.8b, v3.8b,#1
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ smov x14, v3.2s[0] //(i)extract idx to the r register
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+ csel x8, x1, x8,le //reload the source to pu1_src+2nt
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+
+ add x20,x8,#8 //increment the source next set 8 columns in same row
+ csel x8, x20, x8,gt
+ lsl x20, x3,#3
+ csel x12,x20,x12,le
+ sub x20,x12,x5
+ csel x12, x20, x12,le
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ st1 {v10.8b},[x0],x3 //(v)
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt
+ csel x2, x20, x2,le
+ sbfx x9,x14,#0,#8
+
+ xtn v4.8b, v4.8h
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+// lsl x14,x14,#1
+
+ subs x7,x7,#8
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ st1 {v14.8b},[x0],x3 //(vi)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.8b},[x0],x3 //(viii)
+ b end_loops
+
+core_loop_4:
+
+end_loops:
+ add sp, sp, #132
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
new file mode 100644
index 0000000..1502ad6
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@@ -0,0 +1,697 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_filters_luma_mode_11_to_17.s
+//*
+//* @brief
+//* contains function definitions for intra prediction luma modes 11 to 17.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intra prediction interpolation filter for modes 11 to 17
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+
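+//**************rough reference in c*******************************************
+// a hedged c sketch (illustrative only, not built) of the horizontal
+// angular modes 11 to 17 this file implements, assuming the usual hm-style
+// formulation: pos/idx/fract are computed per column, and each output
+// sample is a 1/32-pel interpolation between two neighbouring samples of
+// the left reference:
+//
+// for(col = 0; col < nt; col++)
+// {
+//     pos   = (col + 1) * intra_pred_ang;
+//     idx   = pos >> 5;
+//     fract = pos & 31;
+//
+//     for(row = 0; row < nt; row++)
+//         pu1_dst[row * dst_strd + col] =
+//             (uword8)(((32 - fract) * ref_main[row + idx + 1]
+//                      + fract * ref_main[row + idx + 2] + 16) >> 5);
+// }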
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_11_to_17_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_11_17
+
+.type ihevc_intra_pred_luma_mode_11_to_17_av8, %function
+
+ihevc_intra_pred_luma_mode_11_to_17_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x7, :got:gai4_ihevc_ang_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+ adrp x8, :got:gai4_ihevc_inv_ang_table
+ ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+ add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
+    add x8, x8, x5, lsl #2      //gai4_ihevc_inv_ang_table
+    sub x8, x8, #44             //gai4_ihevc_inv_ang_table[mode - 11]
+
+ ldr w7, [x7] //intra_pred_ang
+ sxtw x7,w7
+ sub sp, sp, #132 //ref_temp[2 * max_cu_size + 1]
+
+ ldr w8, [x8] //inv_ang
+ sxtw x8,w8
+ add x6, sp, x4 //ref_temp + nt
+
+ mul x9, x4, x7 //nt*intra_pred_ang
+
+ sub x6, x6, #1 //ref_temp + nt - 1
+
+ add x1, x0, x4, lsl #1 //x1 = &src[2nt]
+ dup v30.8b,w7 //intra_pred_ang
+
+ mov x7, x4
+
+ ldrb w11, [x1], #-1
+ sxtw x11,w11
+
+ asr x9, x9, #5
+
+ ldrb w12, [x1], #-1
+ sxtw x12,w12
+ ldrb w10, [x1], #-1
+ sxtw x10,w10
+ ldrb w14, [x1], #-1
+ sxtw x14,w14
+
+ strb w11, [x6], #1
+ sxtw x11,w11
+ strb w12, [x6], #1
+ sxtw x12,w12
+ strb w10, [x6], #1
+ sxtw x10,w10
+ strb w14, [x6], #1
+ sxtw x14,w14
+
+ subs x7, x7, #4
+ beq end_loop_copy
+
+ sub x6, x6,#4
+ sub x1, x1,#3
+
+ subs x7,x7,#4
+ beq loop_copy_8
+ subs x7,x7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ ld1 {v0.8b},[x1]
+ sub x1, x1,#8
+ ld1 {v1.8b},[x1]
+ sub x1, x1,#8
+ ld1 {v2.8b},[x1]
+ sub x1, x1,#8
+ ld1 {v3.8b},[x1]
+
+ rev64 v0.8b, v0.8b
+ rev64 v1.8b, v1.8b
+ st1 {v0.8b},[x6],#8
+ rev64 v2.8b, v2.8b
+ st1 {v1.8b},[x6],#8
+ rev64 v3.8b, v3.8b
+ st1 {v2.8b},[x6],#8
+ st1 {v3.8b},[x6],#8
+ sub x1, x1,#1
+ b end_loop_copy
+
+loop_copy_16:
+ ld1 {v0.8b},[x1]
+ sub x1, x1,#8
+ ld1 {v1.8b},[x1]
+
+ rev64 v0.8b, v0.8b
+ rev64 v1.8b, v1.8b
+
+ st1 {v0.8b},[x6],#8
+ st1 {v1.8b},[x6],#8
+ sub x1, x1,#1
+ b end_loop_copy
+
+loop_copy_8:
+ ld1 {v0.8b},[x1]
+ rev64 v0.8b, v0.8b
+ st1 {v0.8b},[x6],#8
+ sub x1, x1,#1
+end_loop_copy:
+
+ ldrb w11, [x1], #-1
+ sxtw x11,w11
+ strb w11, [x6], #1
+ sxtw x11,w11
+
+ cmp x9, #-1
+ bge prologue_8_16_32
+
+ add x6, sp, x4 //ref_temp + nt
+ sub x6, x6, #2 //ref_temp + nt - 2
+
+ mov x12, #-1
+
+    sub x20, x9, x12            //count to take care of ref_idx
+ neg x9, x20
+
+ add x1, x0, x4, lsl #1 //x1 = &src[2nt]
+
+ mov x7, #128 //inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add x7, x7, x8 //inv_ang_sum += inv_ang
+
+ lsr x20, x7, #8
+ ldrb w11, [x1, x20]
+ strb w11, [x6], #-1
+
+ subs x9, x9, #1
+
+ bne loop_copy_ref_idx
+
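+// roughly, in c, the projection loop above maps to (names assumed from the
+// comments; ref_main points at ref_temp + nt - 1):
+//
+// inv_ang_sum = 128;
+// for(k = -1; k > ref_idx; k--)
+// {
+//     inv_ang_sum += inv_ang;
+//     ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+// }
+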
+prologue_8_16_32:
+ cmp x4, #4
+ beq sz_4_proc
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+
+ lsr x10, x4, #3
+ ld1 {v31.8b},[x14],#8
+ mul x10, x4, x10 //block counter (dec by #8)
+
+ mov x11, x4 //col counter to be inc/dec by #8
+ smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ mov x0, #1
+
+ sub x7, x5, #11
+ dup v2.8b,w0 //contains #1 for adding to get ref_main_idx + 1
+
+ adrp x12, :got:idx_neg_idx_11_17 //load least idx table
+ ldr x12, [x12, #:got_lo12:idx_neg_idx_11_17]
+
+ mov x0, #2
+ dup v3.8b,w0
+
+ add x12, x12, x7, lsl #4
+ mov x8, x12
+
+ mov x7, #8
+ sub x7, x7, x3, lsl #3 //x7 = 8-8x3
+
+ ldr w9, [x8]
+ sxtw x9,w9
+ add x1, sp, x4 //ref_temp + nt
+
+ xtn v6.8b, v22.8h
+ dup v26.8b,w9 //least idx added to final idx values
+ sub x1, x1, #1 //ref_temp + nt - 1
+
+ add x6, x1, x9
+
+ ld1 {v0.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+ sshr v22.8h, v22.8h,#5
+
+ mov x0, #31
+ dup v29.8b,w0 //contains #31 for vand operation
+
+ mov x0, #32
+ dup v28.8b,w0
+
+ sqxtn v8.8b, v22.8h
+
+ and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
+
+ mov x0, #1
+ dup v27.8b,w0 //row value inc or reset accordingly
+
+ add v8.8b, v8.8b , v27.8b //ref_main_idx (add row)
+ sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0)
+ add v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+ add v4.8b, v8.8b , v2.8b //ref_main_idx (row 1)
+ add v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1)
+
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
+ add v8.8b, v8.8b , v3.8b //ref_main_idx (row 2)
+ add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
+ add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3)
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
+ add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
+ add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
+ add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5)
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
+ add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
+ add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6)
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ umull v22.8h, v16.8b, v7.8b //mul (row 5)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
+ add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7)
+
+ st1 {v24.8b},[x2], x3 //st (row 4)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 5)
+
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v22.8b},[x2], x3 //st (row 5)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 6)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 7)
+
+ st1 {v20.8b},[x2], x3 //st (row 6)
+
+ subs x10, x10, #8 //subtract 8 and go to end if 8x8
+
+ st1 {v18.8b},[x2], x3 //st (row 7)
+
+ beq end_func
+
+ subs x11, x11, #8
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ add x20, x2, x7
+ csel x2, x20, x2,gt
+ csel x8, x12, x8,le
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+ add x20, x2, #8
+ csel x2, x20, x2,le
+ csel x11, x4, x11,le
+ bgt lbl390
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+lbl390:
+ add x20, x0, #8
+ csel x0, x20, x0,le
+
+ mov x5,x2
+ ld1 {v31.8b},[x14],#8
+ smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ xtn v10.8b, v12.8h
+ sshr v12.8h, v12.8h,#5
+ sqxtn v11.8b, v12.8h
+ dup v27.8b,w0 //row value inc or reset accordingly
+ ldr w9, [x8]
+ sxtw x9,w9
+ add x9, x0, x9
+ sub x9, x9, #1
+ dup v26.8b,w9
+ add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+
+ sub x4,x4,#8
+
+kernel_8_16_32:
+
+ sub v8.8b, v8.8b , v26.8b //ref_main_idx
+ mov v26.8b, v10.8b
+
+ subs x11, x11, #8
+ add x6, x1, x9
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ add v9.8b, v2.8b , v8.8b //ref_main_idx + 1
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ add x20, x0, #8
+ csel x0, x20, x0,le
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ ld1 {v0.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v24.8b, v22.8h,#5 //round shft (row 5)
+
+ bgt lbl429
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+lbl429:
+ csel x8, x12, x8,le
+ dup v27.8b,w0 //row value inc or reset accordingly
+
+ add v4.8b, v2.8b , v8.8b //ref_main_idx (row 1)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ add v5.8b, v2.8b , v9.8b //ref_main_idx + 1 (row 1)
+
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ ld1 {v31.8b},[x14],#8
+ and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
+
+ st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ add v8.8b, v3.8b , v8.8b //ref_main_idx (row 2)
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ add v9.8b, v3.8b , v9.8b //ref_main_idx + 1 (row 2)
+
+ add x20, x4, #8
+ csel x11, x20, x11,le
+ ldr w9, [x8]
+ sxtw x9,w9
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ add v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3)
+
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+ st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
+
+ add v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ add x5,x2,x3,lsl#2
+ add x9, x0, x9
+
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ add v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
+ add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ xtn v10.8b, v14.8h
+ sshr v14.8h, v14.8h,#5
+
+ add v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5)
+ add v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6)
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+ sub x9, x9, #1
+ sqxtn v11.8b, v14.8h
+
+ add v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ add v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7)
+
+ umull v22.8h, v16.8b, v7.8b //mul (row 5)
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ dup v26.8b,w9
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+
+ add x2, x2, x3, lsl #2
+ add x20, x7, x2
+ csel x2, x20, x2,gt
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+
+ subs x10, x10, #8 //subtract 8 and go to end if 8x8
+
+ bne kernel_8_16_32
+epil_8_16_32:
+
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v24.8b, v22.8h,#5 //round shft (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ st1 {v18.8b},[x5], x3 //st (row 7)
+
+
+ b end_func
+
+sz_4_proc:
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+
+ ld1 {v31.8b},[x14]
+ mov x12, #1
+
+ dup v2.8b,w12 //contains #1 for adding to get ref_main_idx + 1
+ mov x0, #2
+
+ dup v3.8b,w0
+ adrp x12, :got:idx_neg_idx_11_17 //load least idx table
+ ldr x12, [x12, #:got_lo12:idx_neg_idx_11_17]
+
+ smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ sub x7, x5, #11
+
+ add x12, x12, x7, lsl #4
+ mov x8, x12
+
+ ldr w9, [x8]
+ sxtw x9,w9
+
+ dup v26.8b,w9 //least idx added to final idx values
+ add x6, sp, x4 //ref_temp + nt
+
+ sub x6, x6, #1 //ref_temp + nt - 1
+ xtn v6.8b, v22.8h
+ add x6, x6, x9
+
+ ld1 {v0.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+ mov x0, #31
+
+ dup v29.8b,w0 //contains #31 for vand operation
+ mov x1, #32
+
+ dup v28.8b,w1
+
+ sshr v22.8h, v22.8h,#5
+ sqxtn v8.8b, v22.8h
+
+ and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ add v8.8b, v8.8b , v2.8b //ref_main_idx (add 1)
+ sub v8.8b, v8.8b , v26.8b //ref_main_idx
+ add v9.8b, v8.8b , v2.8b //ref_main_idx + 1
+
+ add v4.8b, v8.8b , v2.8b //row 1 ref_main_idx
+ add v5.8b, v9.8b , v2.8b
+
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ add v8.8b, v8.8b , v3.8b //idx (row 2)
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
+ add v9.8b, v9.8b , v3.8b //idx+1 (row 2)
+
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ rshrn v24.8b, v24.8h,#5 //round shift (row 0)
+
+ add v4.8b, v4.8b , v3.8b //idx (row 3)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ add v5.8b, v5.8b , v3.8b //idx+1 (row 3)
+
+ umull v20.8h, v12.8b, v7.8b //mul (row 2)
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ umlal v20.8h, v13.8b, v6.8b //mul (row 2)
+
+ st1 {v24.s}[0],[x2], x3 //st row 0
+ rshrn v22.8b, v22.8h,#5 //round shift (row 1)
+
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+
+ umull v18.8h, v16.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v17.8b, v6.8b //mul (row 3)
+
+ st1 {v22.s}[0],[x2], x3 //st row 1
+ rshrn v20.8b, v20.8h,#5 //round shift (row 2)
+
+ st1 {v20.s}[0],[x2], x3 //st row 2
+
+ rshrn v18.8b, v18.8h,#5 //round shift (row 3)
+
+ st1 {v18.s}[0],[x2], x3 //st (row 3)
+
+end_func:
+ add sp, sp, #132
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
new file mode 100644
index 0000000..fe7ac11
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@@ -0,0 +1,665 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_filters_luma_mode_19_to_25.s
+//*
+//* @brief
+//* contains function definitions for intra prediction luma modes 19 to 25.
+//* functions are coded using neon intrinsics and can be compiled using
+//* rvct
+//*
+//* @author
+//* naveen sr
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intra prediction interpolation filter for modes 19 to 25
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* intra prediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_19_to_25(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+
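+//**************rough reference in c*******************************************
+// a hedged c sketch (illustrative only, not built) of the vertical angular
+// modes 19 to 25 this file implements, assuming the usual hm-style
+// formulation: pos/idx/fract are computed per row (idx can be negative, in
+// which case it indexes the samples projected into ref_temp during the
+// copy step):
+//
+// for(row = 0; row < nt; row++)
+// {
+//     pos   = (row + 1) * intra_pred_ang;
+//     idx   = pos >> 5;
+//     fract = pos & 31;
+//
+//     for(col = 0; col < nt; col++)
+//         pu1_dst[row * dst_strd + col] =
+//             (uword8)(((32 - fract) * ref_main[col + idx + 1]
+//                      + fract * ref_main[col + idx + 2] + 16) >> 5);
+// }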
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_19_to_25_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_luma_mode_19_to_25_av8, %function
+
+ihevc_intra_pred_luma_mode_19_to_25_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x7, :got:gai4_ihevc_ang_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+ adrp x8, :got:gai4_ihevc_inv_ang_table
+ ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+ add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
+ add x8, x8, x5, lsl #2 //gai4_ihevc_inv_ang_table
+ sub x8, x8, #48 //gai4_ihevc_inv_ang_table[mode - 12]
+
+ ldr w7, [x7] //intra_pred_ang
+ sxtw x7,w7
+ sub sp, sp, #132 //ref_temp[2 * max_cu_size + 1]
+
+ ldr w8, [x8] //inv_ang
+ sxtw x8,w8
+ add x6, sp, x4 //ref_temp + nt
+
+ mul x9, x4, x7 //nt*intra_pred_ang
+
+ sub x6, x6, #1 //ref_temp + nt - 1
+
+ add x1, x0, x4, lsl #1 //x1 = &src[2nt]
+ dup v30.8b,w7 //intra_pred_ang
+
+ mov x7, x4
+
+ asr x9, x9, #5
+
+ ld1 {v0.s}[0],[x1],#4 // pu1_ref[two_nt + k]
+
+ st1 {v0.s}[0],[x6],#4 //ref_temp[k + nt - 1] = pu1_ref[two_nt + k]//
+
+ subs x7, x7, #4
+ beq end_loop_copy
+ sub x1, x1,#4
+ sub x6, x6,#4
+ subs x7,x7,#4
+ beq loop_copy_8
+ subs x7,x7,#8
+ beq loop_copy_16
+
+loop_copy_32:
+ ld1 {v0.8b},[x1],#8
+ ld1 {v1.8b},[x1],#8
+ ld1 {v2.8b},[x1],#8
+ ld1 {v3.8b},[x1],#8
+
+ st1 {v0.8b},[x6],#8
+ st1 {v1.8b},[x6],#8
+ st1 {v2.8b},[x6],#8
+ st1 {v3.8b},[x6],#8
+ b end_loop_copy
+
+loop_copy_16:
+ ld1 {v0.8b},[x1],#8
+ ld1 {v1.8b},[x1],#8
+
+ st1 {v0.8b},[x6],#8
+ st1 {v1.8b},[x6],#8
+ b end_loop_copy
+
+loop_copy_8:
+ ld1 {v0.8b},[x1],#8
+ st1 {v0.8b},[x6],#8
+
+end_loop_copy:
+
+ ldrb w11, [x1]
+ sxtw x11,w11
+ strb w11, [x6]
+ sxtw x11,w11
+
+ cmp x9, #-1
+ bge linear_filtering
+
+ add x6, sp, x4 //ref_temp + nt
+ sub x6, x6, #2 //ref_temp + nt - 2
+
+ mov x12, #-1
+
+    sub x20, x9, x12            //count to take care of ref_idx
+ neg x9, x20
+
+ add x1, x0, x4, lsl #1 //x1 = &src[2nt]
+
+ mov x7, #128 //inv_ang_sum
+
+loop_copy_ref_idx:
+
+ add x7, x7, x8 //inv_ang_sum += inv_ang
+ lsr x14, x7, #8
+ neg x20,x14
+ ldrb w11, [x1, x20]
+ sxtw x11,w11
+// ldrb x11, [x1, -x7, lsr #8]
+ strb w11, [x6], #-1
+ sxtw x11,w11
+
+ subs x9, x9, #1
+
+ bne loop_copy_ref_idx
+
+
+linear_filtering:
+// after copy
+// below code is taken from mode 27 to 33 and modified
+
+ adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+ ldr x6, [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+ add x8,x6,x5,lsl #2 //*gai4_ihevc_ang_table[mode]
+ ldr w9, [x8] //intra_pred_ang = gai4_ihevc_ang_table[mode]
+ sxtw x9,w9
+ adrp x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+ ldr x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+ add x6,x1,#1
+
+ add x8, sp, x4 //ref_temp + nt
+ sub x8, x8,#1 //ref_temp + nt -1
+
+ tst x4,#7
+ mov x14,#0 //row
+ mov x12,x4
+ bne core_loop_4
+
+core_loop_8:
+ add x8,x8,#1 //pu1_ref_main_idx += (two_nt + 1)
+ dup v0.8b,w9 //intra_pred_ang
+ lsr x12, x4, #3 //divide by 8
+
+ movi v1.8b, #32
+ mul x7, x4, x12
+
+ movi v6.8h, #31
+ //lsl x12,x3,#3
+
+ mov x1,x8
+ //sub x12,x12,x4
+ mov x5,x4
+ mov x11,#1
+
+prologue:
+ ld1 {v3.8b},[x6] //loads the row value
+ smull v2.8h, v3.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v5.8b, v2.8h,#5 //idx = pos >> 5
+
+ dup v31.8b, v4.8b[0]
+ add x0,x2,x3
+
+ umov w14, v5.2s[0] //(i row)extract idx to the r register
+ sxtw x14,w14
+
+ dup v29.8b, v4.8b[1] //(ii)
+ sbfx x9,x14,#0,#8
+
+ add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ sbfx x9,x14,#8,#8
+
+ ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ sbfx x9,x14,#16,#8
+ sub v30.8b, v1.8b , v31.8b //32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ sbfx x9,x14,#24,#8
+
+ dup v25.8b, v4.8b[3] //(iv)
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+
+ dup v31.8b, v4.8b[4] //(v)
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ umov w14, v5.2s[1] //extract idx to the r register
+ sxtw x14,w14
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v10.8b},[x2],#8 //(i row)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx x9,x14,#0,#8
+ dup v29.8b, v4.8b[5] //(vi)
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ sbfx x9,x14,#8,#8
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+ dup v27.8b, v4.8b[6] //(vii)
+
+ sbfx x9,x14,#16,#8
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v25.8b, v4.8b[7] //(viii)
+ sbfx x9,x14,#24,#8
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs x4,x4,#8
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x20,x8,#8
+ csel x8, x20, x8,gt
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ sub x20,x7,#8
+ csel x7, x20, x7,gt
+
+ st1 {v10.8b},[x0],x3 //(v)
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ ld1 {v5.8b},[x6] //loads the row value
+ smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+ umov w14, v3.2s[0] //(i)extract idx to the r register
+ sxtw x14,w14
+ sbfx x9,x14,#0,#8
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+ dup v31.8b, v4.8b[0]
+ subs x4,x4,#8
+ sbfx x9,x14,#8,#8
+
+ ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+
+ add x20,x6,#8 //increment the row value
+ csel x6, x20, x6,le
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ ld1 {v9.8b},[x10] //(i)ref_main_idx_1
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v5.8b},[x6] //loads the row value
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ dup v29.8b, v4.8b[1] //(ii)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sbfx x9,x14,#16,#8
+
+ st1 {v14.8b},[x0],x3 //(vi)
+ sub v30.8b, v1.8b , v31.8b //(i)32-fract(dup_const_32_fract)
+
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx x9,x14,#24,#8
+ csel x4, x5, x4,le //reload nt
+
+ umov w14, v3.2s[1] //extract idx to the r register
+ sxtw x14,w14
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v25.8b, v4.8b[3] //(iv)
+ smull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+
+ st1 {v22.8b},[x0] //(viii)
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ sbfx x9,x14,#0,#8
+ add x0,x2,x3
+
+ dup v31.8b, v4.8b[4] //(v)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+ sbfx x9,x14,#8,#8
+
+ st1 {v10.8b},[x2],#8 //(i)
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+
+ dup v29.8b, v4.8b[5] //(vi)
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ dup v27.8b, v4.8b[6] //(vii)
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+ sbfx x9,x14,#16,#8
+
+ dup v25.8b, v4.8b[7] //(viii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+ sbfx x9,x14,#24,#8
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ umov w14, v3.2s[0] //(i)extract idx to the r register
+ sxtw x14,w14
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+ csel x8, x1, x8,le //reload the source to pu1_src+2nt
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+
+ add x20,x8,#8 //increment the source next set 8 columns in same row
+ csel x8, x20, x8,gt
+ lsl x20, x3,#3
+ csel x12,x20,x12,le
+ sub x20,x12,x5
+ csel x12, x20, x12,le
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ st1 {v10.8b},[x0],x3 //(v)
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt
+ csel x2, x20, x2,le
+ sbfx x9,x14,#0,#8
+
+ xtn v4.8b, v4.8h
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ subs x7,x7,#8
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ st1 {v14.8b},[x0],x3 //(vi)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.8b},[x0],x3 //(viii)
+ b end_loops
+
+core_loop_4:
+ add x6,x8,#1 //pu1_ref_main_idx += 1
+ mov x8,#0
+
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ asr x14, x5, #5 //if(fract_prev > fract)
+ and x5,x5,#31 //fract = pos & (31)
+ add x10,x6,x14 //pu1_ref_main_idx += 1
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+ dup v0.8b,w5 //dup_const_fract
+ sub x20,x5,#32
+ neg x4, x20
+ dup v1.8b,w4 //dup_const_32_fract
+
+//inner_loop_4
+ ld1 {v2.s}[0],[x10] //ref_main_idx
+ add x8,x8,#1
+// mov x14,x5 @fract_prev = fract
+
+ ld1 {v3.s}[0],[x11] //ref_main_idx_1
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ asr x14, x5, #5 // pos >> 5
+ and x5,x5,#31 //fract = pos & (31)
+    add x10,x6,x14              //ref_main + idx
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+
+ dup v6.8b,w5 //dup_const_fract
+ umull v4.8h, v2.8b, v1.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v7.8b,w4 //dup_const_32_fract
+ umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v8.s}[0],[x10] //ref_main_idx
+ add x8,x8,#1
+
+ ld1 {v9.s}[0],[x11] //ref_main_idx_1
+ rshrn v4.8b, v4.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+// mov x14,x5 @fract_prev = fract
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+    asr x14, x5, #5             //idx = pos >> 5
+ and x5,x5,#31 //fract = pos & (31)
+ add x10,x6,x14 //ref_main + idx
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+
+ dup v12.8b,w5 //dup_const_fract
+ umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v13.8b,w4 //dup_const_32_fract
+ umlal v10.8h, v9.8b, v6.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v14.s}[0],[x10] //ref_main_idx
+ add x8,x8,#1
+
+ st1 {v4.s}[0],[x2],x3
+ rshrn v10.8b, v10.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v15.s}[0],[x11] //ref_main_idx_1
+// mov x14,x5 @fract_prev = fract
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+    asr x14, x5, #5             //idx = pos >> 5
+    and x5,x5,#31               //fract = pos & (31)
+    add x10,x6,x14              //ref_main + idx
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+
+ dup v18.8b,w5 //dup_const_fract
+ umull v16.8h, v14.8b, v13.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v19.8b,w4 //dup_const_32_fract
+ umlal v16.8h, v15.8b, v12.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v20.s}[0],[x10] //ref_main_idx
+
+ st1 {v10.s}[0],[x2],x3
+ rshrn v16.8b, v16.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+ ld1 {v21.s}[0],[x11] //ref_main_idx_1
+
+ umull v22.8h, v20.8b, v19.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v18.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v16.s}[0],[x2],x3
+ rshrn v22.8b, v22.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.s}[0],[x2],x3
+
+end_loops:
+ add sp, sp, #132
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_dc.s b/common/arm64/ihevc_intra_pred_luma_dc.s
new file mode 100644
index 0000000..7683266
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_dc.s
@@ -0,0 +1,519 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_dc.s
+//*
+//* @brief
+//* contains function definitions for intra prediction dc filtering.
+//* functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intraprediction filter for dc input
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
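+
+//a c-level sketch of this dc path, added for readability (an
+//illustration, not part of the original sources). it assumes the
+//reference layout implied by the loads below: left column at
+//pu1_ref[nt..2*nt-1], corner at pu1_ref[2*nt], top row from
+//pu1_ref[2*nt+1]; log2_nt stands for log2(nt), recovered via clz below.
+//
+//    word32 i, acc = nt;                    /* +nt is the rounding term */
+//    for(i = 0; i < nt; i++)
+//        acc += pu1_ref[nt + i] + pu1_ref[2 * nt + 1 + i];
+//    dc_val = acc >> (log2_nt + 1);
+//    if(nt < 32)                            /* boundary smoothing */
+//    {
+//        pu1_dst[0] = (pu1_ref[2 * nt - 1] + 2 * dc_val
+//                      + pu1_ref[2 * nt + 1] + 2) >> 2;
+//        for(i = 1; i < nt; i++)
+//        {
+//            pu1_dst[i]            = (pu1_ref[2 * nt + 1 + i] + 3 * dc_val + 2) >> 2;
+//            pu1_dst[i * dst_strd] = (pu1_ref[2 * nt - 1 - i] + 3 * dc_val + 2) >> 2;
+//        }
+//    }
+//    /* every remaining sample is dc_val */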
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevc_intra_pred_luma_dc_av8
+
+.type ihevc_intra_pred_luma_dc_av8, %function
+
+ihevc_intra_pred_luma_dc_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+
+
+ mov x11, #2 //mov #2 to x11 (to be used to add to 2dc_val & 3dc_val)
+ mov x9, #0
+ mov v17.s[0], w11
+ mov v17.s[1], w9
+
+ clz w5,w4
+
+ add x6, x0, x4 //&src[nt]
+    sub x20, x5, #32            //32 - clz(nt) = log2nt + 1
+ neg x5, x20
+ add x7, x0, x4, lsl #1 //&src[2nt]
+
+ add x8, x7, #1 //&src[2nt+1]
+ mvn x5, x5
+ add x5, x5, #1
+ dup v8.2s,w5
+
+ ldrb w14, [x8]
+ sxtw x14,w14
+ shl d8, d8,#32
+
+ sub x9, x7, #1 //&src[2nt-1]
+ sshr d8, d8,#32
+
+ mov x7, x8 //x7 also stores 2nt+1
+
+ ldrb w12, [x9]
+ sxtw x12,w12
+ add x14, x14, x12 //src[2nt+1] + src[2nt-1]
+ add x14, x14, x11 //src[2nt+1] + src[2nt-1] + 2
+
+ cmp x4, #4
+ beq dc_4
+
+ mov x10, x4 //nt
+
+add_loop:
+ ld1 {v0.8b},[x6],#8 //load from src[nt]
+ mov x5, #0 //
+ ld1 {v1.8b},[x8],#8 //load from src[2nt+1]
+
+ uaddlp v2.4h, v0.8b
+
+ mov v6.s[0], w4
+ mov v6.s[1], w5 //store nt to accumulate
+ uaddlp v3.4h, v1.8b
+
+ ld1 {v0.8b},[x6],#8 //load from src[nt] (extra load for 8)
+
+ ld1 {v1.8b},[x8],#8 //load from src[2nt+1] (extra load for 8)
+ add v4.4h, v2.4h , v3.4h
+
+
+ uaddlp v5.2s, v4.4h
+
+
+ uadalp v6.1d, v5.2s //accumulate all inp into d6 (end for nt==8)
+
+ subs x10, x10,#8
+ beq epil_add_loop
+
+core_loop_add:
+ uaddlp v2.4h, v0.8b
+ subs x10, x10,#8
+ uaddlp v3.4h, v1.8b
+
+
+
+ add v4.4h, v2.4h , v3.4h
+ ld1 {v0.8b},[x6],#8 //load from src[nt] (extra load for 16)
+
+ uaddlp v5.2s, v4.4h
+ ld1 {v1.8b},[x8],#8 //load from src[2nt+1] (extra load for 16)
+
+ uadalp v6.1d, v5.2s //accumulate all inp into d6
+ bne core_loop_add
+
+epil_add_loop:
+
+ sshl d9, d6, d8 //(dc_val) shr by log2nt+1
+ cmp x4, #32
+
+ mov v28.s[0], w14
+ mov v28.s[1], w5 //src[2nt+1]+2+src[2nt-1] moved to d28
+ mov x20,#128
+ csel x6, x20, x6,eq
+
+ dup v16.8b, v9.8b[0] //dc_val
+ shl d13, d9,#1 //2*dc
+
+ beq prologue_cpy_32
+
+ add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
+ mov x20,#0
+ csel x6, x20, x6,ne //nt
+
+ ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0]
+ csel x10, x4, x10,ne
+
+ add d11, d13 , d9 //3*dc
+ sub x12, x3, x3, lsl #3 //-7*strd
+
+ add d11, d11 , d17 //3*dc + 2
+ add x12, x12, #8 //offset after one 8x8 block (-7*strd + 8)
+
+ dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes)
+ sub x0, x3, x4 //strd - nt
+
+prologue_col:
+ //0th column and 0-7 rows done here
+ //x8 and x9 (2nt+1+col 2nt-1-row)
+
+ mov x8, x7 //&src[2nt+1]
+
+ add x0, x0, #8 //strd - nt + 8
+ ld1 {v0.8b},[x8],#8 //col 1::7 load (prol)
+ sub x9, x9, #7 //&src[2nt-1-row]
+
+ ld1 {v1.8b},[x9] //row 7::1 (0 also) load (prol)
+ sub x9, x9, #8
+
+ uxtl v20.8h, v0.8b
+
+ ld1 {v6.8b},[x8] //col 8::15 load (prol extra)
+ add v20.8h, v20.8h , v24.8h //col 1::7 add 3dc+2 (prol)
+
+ uxtl v22.8h, v1.8b
+ sqshrun v2.8b, v20.8h,#2 //columns shx2 movn (prol)
+
+ uxtl v26.8h, v6.8b
+ add v22.8h, v22.8h , v24.8h //row 1::7 add 3dc+2 (prol)
+
+    movi d19, #0x00000000000000ff //byte mask row 0 (prol)
+ sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol)
+
+ bsl v19.8b, v15.8b , v2.8b //first row with dst[0]
+ add v26.8h, v26.8h , v24.8h //col 8::15 add 3dc+2 (prol extra)
+
+ rev64 v3.8b, v3.8b
+
+ st1 {v19.8b},[x2], x3 //store row 0 (prol)
+ sshr d3, d3,#8 //row 0 shift (prol) (first value to be ignored)
+
+ movi d20, #0x00000000000000ff //byte mask row 1 (prol)
+
+loop_again_col_row:
+
+ bsl v20.8b, v3.8b , v16.8b //row 1 (prol)
+
+ movi d21, #0x00000000000000ff //byte mask row 2 (prol)
+ sshr d3, d3,#8 //row 1 shift (prol)
+
+ st1 {v20.8b},[x2], x3 //store row 1 (prol)
+ sqshrun v4.8b, v26.8h,#2 //columns shx2 movn (prol extra)
+
+
+ bsl v21.8b, v3.8b , v16.8b //row 2 (prol)
+
+ movi d20, #0x00000000000000ff //byte mask row 3 (prol)
+ sshr d3, d3,#8 //row 2 shift (prol)
+
+ st1 {v21.8b},[x2], x3 //store row 2 (prol)
+
+
+ bsl v20.8b, v3.8b , v16.8b //row 3 (prol)
+
+ movi d21, #0x00000000000000ff //byte mask row 4 (prol)
+ sshr d3, d3,#8 //row 3 shift (prol)
+
+ st1 {v20.8b},[x2], x3 //store row 3 (prol)
+
+
+ bsl v21.8b, v3.8b , v16.8b //row 4 (prol)
+
+ movi d20, #0x00000000000000ff //byte mask row 5 (prol)
+ sshr d3, d3,#8 //row 4 shift (prol)
+
+ st1 {v21.8b},[x2], x3 //store row 4 (prol)
+
+
+ bsl v20.8b, v3.8b , v16.8b //row 5 (prol)
+
+ movi d21, #0x00000000000000ff //byte mask row 6 (prol)
+ sshr d3, d3,#8 //row 5 shift (prol)
+
+ st1 {v20.8b},[x2], x3 //store row 5 (prol)
+
+ ld1 {v1.8b},[x9] //row 8::15 load (prol extra)
+
+ bsl v21.8b, v3.8b , v16.8b //row 6 (prol)
+
+ uxtl v22.8h, v1.8b
+
+ movi d20, #0x00000000000000ff //byte mask row 7 (prol)
+ sshr d3, d3,#8 //row 6 shift (prol)
+
+ st1 {v21.8b},[x2], x3 //store row 6 (prol)
+
+ bsl v20.8b, v3.8b , v16.8b //row 7 (prol)
+ add v22.8h, v22.8h , v24.8h //row 8::15 add 3dc+2 (prol extra)
+
+ sshr d3, d3,#8 //row 7 shift (prol)
+ st1 {v20.8b},[x2], x12 //store row 7 (prol)
+
+ subs x10, x10, #8 //counter for cols
+
+ beq end_func
+ blt copy_16
+
+
+ movi d20, #0x00000000000000ff //byte mask row 9 (prol)
+ sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol)
+
+ rev64 v3.8b, v3.8b
+
+ st1 {v4.8b},[x2], x3 //store 2nd col (for 16x16)
+
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x0 //go to next row for 16
+
+
+ bsl v20.8b, v3.8b , v16.8b //row 9 (prol)
+ subs x10, x10, #8
+
+ st1 {v20.8b},[x2], x3 //store row 9 (prol)
+ sshr d3, d3,#8 //row 9 shift (prol)
+
+ movi d20, #0x00000000000000ff //byte mask row 9 (prol)
+
+ b loop_again_col_row
+
+
+copy_16:
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2], x3
+ st1 {v16.8b},[x2]
+
+ b end_func
+
+prologue_cpy_32:
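+    //nt == 32 path: the standard applies no dc boundary smoothing at this
+    //size, so the block is simply flooded with dc_val through four row
+    //pointers (x2/x5/x8/x10) and 16-byte stores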
+ mov x9, #128
+ //sub x7, x3, #-24
+ add x5, x2, x3
+ add x8, x5, x3
+ add x10, x8, x3
+ dup v20.16b, v16.8b[0]
+ lsl x6, x3, #2
+ add x6, x6, #-16
+
+ st1 {v20.16b}, [x2],#16
+ st1 {v20.16b}, [x5],#16
+ st1 {v20.16b}, [x8],#16
+ st1 {v20.16b}, [x10],#16
+
+ st1 {v20.16b}, [x2], x6
+ st1 {v20.16b}, [x5], x6
+ st1 {v20.16b}, [x8], x6
+ st1 {v20.16b}, [x10], x6
+
+ sub x9, x9, #32 //32x32 prol/epil counter dec
+
+kernel_copy:
+ st1 {v20.16b}, [x2],#16
+ st1 {v20.16b}, [x5],#16
+ st1 {v20.16b}, [x8],#16
+ st1 {v20.16b}, [x10],#16
+
+ st1 {v20.16b}, [x2], x6
+ st1 {v20.16b}, [x5], x6
+ st1 {v20.16b}, [x8], x6
+ st1 {v20.16b}, [x10], x6
+
+ subs x9, x9, #32
+
+ st1 {v20.16b}, [x2],#16
+ st1 {v20.16b}, [x5],#16
+ st1 {v20.16b}, [x8],#16
+ st1 {v20.16b}, [x10],#16
+
+ st1 {v20.16b}, [x2], x6
+ st1 {v20.16b}, [x5], x6
+ st1 {v20.16b}, [x8], x6
+ st1 {v20.16b}, [x10], x6
+
+ bne kernel_copy
+
+epilogue_copy:
+ st1 {v20.16b}, [x2],#16
+ st1 {v20.16b}, [x5],#16
+ st1 {v20.16b}, [x8],#16
+ st1 {v20.16b}, [x10],#16
+
+ st1 {v20.16b}, [x2]
+ st1 {v20.16b}, [x5]
+ st1 {v20.16b}, [x8]
+ st1 {v20.16b}, [x10]
+
+ b end_func
+
+
+dc_4:
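+    //nt == 4 path: same accumulate-and-shift as above on one 8-byte load
+    //per side; the lanes loaded past nt are masked off (d30) before the
+    //accumulate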
+ ld1 {v0.8b},[x6],#8 //load from src[nt]
+ ld1 {v1.8b},[x8],#8 //load from src[2nt+1]
+
+ uaddlp v2.4h, v0.8b
+ mov x5, #0 //
+ mov v6.s[0], w4
+ mov v6.s[1], w5 //store nt to accumulate
+ uaddlp v3.4h, v1.8b
+
+ add v4.4h, v2.4h , v3.4h
+
+
+ uaddlp v5.2s, v4.4h
+ movi d30, #0x00000000ffffffff
+
+ and v5.8b, v5.8b , v30.8b
+
+ mov v28.s[0], w14
+ mov v28.s[1], w5 //src[2nt+1]+2+src[2nt-1] moved to d28
+ add d6, d6 , d5 //accumulate all inp into d6 (end for nt==8)
+
+ sshl d9, d6, d8 //(dc_val) shr by log2nt+1
+ mov x8, x7 //&src[2nt+1]
+
+ shl d13, d9,#1 //2*dc
+ sub x9, x9, #3 //&src[2nt-1-row]
+
+ dup v16.8b, v9.8b[0] //dc_val
+ add d14, d13 , d28 //src[2nt+1]+2+src[2nt-1]+2dc_val
+
+ ushr v15.4h, v14.4h,#2 //final dst[0]'s value in d15[0]
+ sub x12, x3, x3, lsl #2 //-3*strd
+ add d11, d13 , d9 //3*dc
+
+ add d11, d11 , d17 //3*dc + 2
+ add x12, x12, #4 //offset after one 4x4 block (-3*strd + 4)
+
+ dup v24.8h, v11.4h[0] //3*dc + 2 (moved to all lanes)
+ sub x0, x3, x4 //strd - nt
+
+
+ ld1 {v0.8b},[x8] //col 1::3 load (prol)
+ ld1 {v1.8b},[x9] //row 3::1 (0 also) load (prol)
+
+ uxtl v20.8h, v0.8b
+
+ uxtl v22.8h, v1.8b
+ add v20.8h, v20.8h , v24.8h //col 1::7 add 3dc+2 (prol)
+
+ add v22.8h, v22.8h , v24.8h //row 1::7 add 3dc+2 (prol)
+
+    movi d19, #0x00000000000000ff //byte mask row 0 (prol)
+ sqshrun v2.8b, v20.8h,#2 //columns shx2 movn (prol)
+
+ movi d20, #0x00000000000000ff //byte mask row 1 (prol)
+ sqshrun v3.8b, v22.8h,#2 //rows shx2 movn (prol)
+
+
+ bsl v19.8b, v15.8b , v2.8b //first row with dst[0]
+
+ rev64 v3.8b, v3.8b
+
+ st1 {v19.s}[0],[x2], x3 //store row 0 (prol)
+ sshr d3, d3,#40 //row 0 shift (prol) (first value to be ignored)
+
+ movi d21, #0x00000000000000ff //byte mask row 2 (prol)
+
+ bsl v20.8b, v3.8b , v16.8b //row 1 (prol)
+ sshr d3, d3,#8 //row 1 shift (prol)
+
+ st1 {v20.s}[0],[x2], x3 //store row 1 (prol)
+
+ bsl v21.8b, v3.8b , v16.8b //row 2 (prol)
+
+ movi d20, #0x00000000000000ff //byte mask row 3 (prol)
+
+ sshr d3, d3,#8 //row 2 shift (prol)
+ st1 {v21.s}[0],[x2], x3 //store row 2 (prol)
+
+ bsl v20.8b, v3.8b , v16.8b //row 3 (prol)
+ st1 {v20.s}[0],[x2] //store row 3 (prol)
+
+epilogue_end:
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_horz.s b/common/arm64/ihevc_intra_pred_luma_horz.s
new file mode 100644
index 0000000..551fd77
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_horz.s
@@ -0,0 +1,357 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_horz.s
+//*
+//* @brief
+//* contains function definition for intra prediction interpolation filters
+//*
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//* - ihevc_intra_pred_luma_horz()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* intra prediction interpolation filter for horizontal luma variable.
+//*
+//* @par description:
+//* horizontal intraprediction(mode 10) with reference samples location
+//* pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst' refer
+//* to section 8.4.4.2.6 in the standard (special case)
+//*
+//* @param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* integer transform block size
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
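+//x4 => nt (w4), x5 => mode (w5) per the aarch64 calling convention
+
+//a c-level sketch of the horizontal path, added for readability (an
+//illustration, not part of the original sources); CLIP_U8 stands for a
+//clamp to [0,255], done below with sqadd/sqxtun:
+//
+//    for(row = 0; row < nt; row++)
+//        for(col = 0; col < nt; col++)
+//            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
+//    if(nt < 32)    /* smooth the first row with the top-row gradient */
+//        for(col = 0; col < nt; col++)
+//            pu1_dst[col] = CLIP_U8(pu1_ref[two_nt - 1]
+//                + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1));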
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_horz_av8
+
+.type ihevc_intra_pred_luma_horz_av8, %function
+
+ihevc_intra_pred_luma_horz_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ //ldr x5,[sp,#44] @loads mode
+
+ lsl x6,x4,#1 //two_nt
+
+ add x12,x0,x6 //*pu1_ref[two_nt]
+ cmp x4,#4 //if nt == 4
+ beq core_loop_4
+
+ cmp x4,#8 //if nt == 8
+ beq core_loop_8
+
+ cmp x4,#16 //if nt == 16
+ beq core_loop_16
+ sub x12,x12,#16 //move to 16th value pointer
+ add x9,x2,#16
+
+core_loop_32:
+    ld1 { v0.16b},[x12]          //load 16 values. v0.16b[15] will have the 1st value.
+
+ dup v2.16b, v0.16b[15] //duplicate the i value.
+
+ dup v4.16b, v0.16b[14] //duplicate the ii value.
+ dup v6.16b, v0.16b[13] //duplicate the iii value.
+ st1 { v2.16b},[x2],x3 //store in 1st row 0-16 columns
+ st1 { v2.16b},[x9],x3 //store in 1st row 16-32 columns
+
+ dup v8.16b, v0.16b[12]
+ st1 { v4.16b},[x2],x3
+ st1 { v4.16b},[x9],x3
+
+ dup v2.16b, v0.16b[11]
+ st1 { v6.16b},[x2],x3
+ st1 { v6.16b},[x9],x3
+
+ dup v4.16b, v0.16b[10]
+ st1 { v8.16b},[x2],x3
+ st1 { v8.16b},[x9],x3
+
+ dup v6.16b, v0.16b[9]
+ st1 { v2.16b},[x2],x3
+ st1 { v2.16b},[x9],x3
+
+ dup v8.16b, v0.16b[8]
+ st1 { v4.16b},[x2],x3
+ st1 { v4.16b},[x9],x3
+
+ dup v2.16b, v0.8b[7]
+ st1 { v6.16b},[x2],x3
+ st1 { v6.16b},[x9],x3
+
+ dup v4.16b, v0.8b[6]
+ st1 { v8.16b},[x2],x3
+ st1 { v8.16b},[x9],x3
+
+ dup v6.16b, v0.8b[5]
+ st1 { v2.16b},[x2],x3
+ st1 { v2.16b},[x9],x3
+
+ dup v8.16b, v0.8b[4]
+ st1 { v4.16b},[x2],x3
+ st1 { v4.16b},[x9],x3
+
+ dup v2.16b, v0.8b[3]
+ st1 { v6.16b},[x2],x3
+ st1 { v6.16b},[x9],x3
+
+ dup v4.16b, v0.8b[2]
+ st1 { v8.16b},[x2],x3
+ st1 { v8.16b},[x9],x3
+
+ dup v6.16b, v0.8b[1]
+ st1 { v2.16b},[x2],x3
+ st1 { v2.16b},[x9],x3
+ sub x12,x12,#16 //move to 16th value pointer
+
+ dup v8.16b, v0.8b[0]
+ st1 { v4.16b},[x2],x3
+ st1 { v4.16b},[x9],x3
+
+ subs x4,x4,#16 //decrement the loop count by 16
+ st1 { v6.16b},[x2],x3
+ st1 { v6.16b},[x9],x3
+
+ st1 { v8.16b},[x2],x3
+ st1 { v8.16b},[x9],x3
+ bgt core_loop_32
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+ b end_func
+
+core_loop_16:
+ ldrb w14,[x12],#1 //pu1_ref[two_nt]
+ sxtw x14,w14
+ ld1 { v30.8b},[x12],#8 //pu1_ref[two_nt + 1 + col]
+ ld1 { v31.8b},[x12] //pu1_ref[two_nt + 1 + col]
+ sub x12,x12,#8
+
+ dup v28.8b,w14
+ sub x12,x12,#17
+ ld1 { v0.16b},[x12]
+ dup v26.8b, v0.16b[15]
+ uxtl v26.8h, v26.8b
+
+ dup v2.16b, v0.16b[14]
+ usubl v24.8h, v30.8b, v28.8b
+
+ dup v4.16b, v0.16b[13]
+ sshr v24.8h, v24.8h,#1
+
+ dup v6.16b, v0.16b[12]
+ sqadd v22.8h, v26.8h , v24.8h
+
+ dup v8.16b, v0.16b[11]
+ sqxtun v22.8b, v22.8h
+
+ st1 {v22.8b},[x2],#8
+
+ dup v10.16b, v0.16b[10]
+ usubl v24.8h, v31.8b, v28.8b
+
+ dup v12.16b, v0.16b[9]
+ sshr v24.8h, v24.8h,#1
+
+ dup v14.16b, v0.16b[8]
+ sqadd v22.8h, v26.8h , v24.8h
+
+ dup v16.16b, v0.8b[7]
+ sqxtun v22.8b, v22.8h
+
+ st1 {v22.8b},[x2],x3
+ sub x2,x2,#8
+
+ st1 { v2.16b},[x2],x3
+
+ st1 { v4.16b},[x2],x3
+ st1 { v6.16b},[x2],x3
+ st1 { v8.16b},[x2],x3
+
+ dup v2.16b, v0.8b[6]
+ st1 { v10.16b},[x2],x3
+
+ dup v4.16b, v0.8b[5]
+ st1 { v12.16b},[x2],x3
+
+ dup v6.16b, v0.8b[4]
+ st1 { v14.16b},[x2],x3
+
+ dup v8.16b, v0.8b[3]
+ st1 { v16.16b},[x2],x3
+
+ dup v10.16b, v0.8b[2]
+ st1 { v2.16b},[x2],x3
+
+ dup v12.16b, v0.8b[1]
+ st1 { v4.16b},[x2],x3
+
+ dup v14.16b, v0.8b[0]
+ st1 { v6.16b},[x2],x3
+
+ st1 { v8.16b},[x2],x3
+ st1 { v10.16b},[x2],x3
+ st1 { v12.16b},[x2],x3
+ st1 { v14.16b},[x2],x3
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+ b end_func
+
+
+core_loop_8:
+ ldrb w14,[x12] //pu1_ref[two_nt]
+ sxtw x14,w14
+ add x12,x12,#1 //pu1_ref[two_nt + 1]
+ ld1 {v30.8b},[x12] //pu1_ref[two_nt + 1 + col]
+
+ sub x12,x12,#9
+ ld1 {v0.8b},[x12]
+ dup v26.8b, v0.8b[7]
+ dup v28.8b,w14
+
+ dup v3.8b, v0.8b[6]
+ uxtl v26.8h, v26.8b
+
+ dup v4.8b, v0.8b[5]
+ usubl v24.8h, v30.8b, v28.8b
+
+ dup v5.8b, v0.8b[4]
+ sshr v24.8h, v24.8h,#1
+
+ dup v6.8b, v0.8b[3]
+ sqadd v22.8h, v26.8h , v24.8h
+
+ dup v7.8b, v0.8b[2]
+ sqxtun v22.8b, v22.8h
+
+ st1 {v22.8b},[x2],x3
+ st1 {v3.8b},[x2],x3
+
+ dup v8.8b, v0.8b[1]
+ st1 {v4.8b},[x2],x3
+ st1 {v5.8b},[x2],x3
+
+ dup v9.8b, v0.8b[0]
+ st1 {v6.8b},[x2],x3
+ st1 {v7.8b},[x2],x3
+
+ st1 {v8.8b},[x2],x3
+ st1 {v9.8b},[x2],x3
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+ b end_func
+
+
+core_loop_4:
+ ldrb w14,[x12] //pu1_ref[two_nt]
+ sxtw x14,w14
+ add x12,x12,#1 //pu1_ref[two_nt + 1]
+ ld1 {v30.8b},[x12] //pu1_ref[two_nt + 1 + col]
+
+ sub x12,x12,#5
+ ld1 {v0.8b},[x12]
+ dup v28.8b,w14
+ dup v26.8b, v0.8b[3]
+ uxtl v26.8h, v26.8b
+
+ dup v3.8b, v0.8b[2]
+ usubl v24.8h, v30.8b, v28.8b
+
+ dup v4.8b, v0.8b[1]
+ sshr v24.8h, v24.8h,#1
+
+ dup v5.8b, v0.8b[0]
+ sqadd v22.8h, v26.8h , v24.8h
+
+ sqxtun v22.8b, v22.8h
+
+ st1 {v22.s}[0],[x2],x3
+ st1 {v3.s}[0],[x2],x3
+ st1 {v4.s}[0],[x2],x3
+ st1 {v5.s}[0],[x2],x3
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+end_func:
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode2.s b/common/arm64/ihevc_intra_pred_luma_mode2.s
new file mode 100644
index 0000000..5d7a3c5
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode2.s
@@ -0,0 +1,280 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_mode2.s
+//*
+//* @brief
+//* contains function definitions for intra prediction mode 2.
+//* functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intraprediction filter for mode 2
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
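+
+//a c-level sketch of mode 2, added for readability (an illustration, not
+//part of the original sources): with angle 32 the fractional part is
+//always zero, so prediction degenerates to a copy along the diagonal
+//from the left reference:
+//
+//    for(row = 0; row < nt; row++)
+//        for(col = 0; col < nt; col++)
+//            pu1_dst[row * dst_strd + col] =
+//                pu1_ref[2 * nt - 2 - row - col];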
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode2_av8
+
+.type ihevc_intra_pred_luma_mode2_av8, %function
+
+ihevc_intra_pred_luma_mode2_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x8,#-2
+
+ cmp x4,#4
+ beq mode2_4
+
+ add x0,x0,x4,lsl #1
+
+ sub x0,x0,#9 //src[1]
+ add x10,x0,#-1
+
+prologue_cpy_32:
+
+ ld1 {v0.8b},[x0],x8
+ mov x11,x4
+
+ ld1 {v1.8b},[x10],x8
+ mov x6, x2
+
+ ld1 {v2.8b},[x0],x8
+ ld1 {v3.8b},[x10],x8
+ lsr x1, x4, #3
+
+ ld1 {v4.8b},[x0],x8
+ ld1 {v5.8b},[x10],x8
+ ld1 {v6.8b},[x0],x8
+ mul x1, x4, x1
+
+ ld1 {v7.8b},[x10],x8
+ add x7,x6,x3
+
+ rev64 v8.8b, v0.8b
+ rev64 v9.8b, v1.8b
+ lsl x5, x3, #2
+
+ rev64 v10.8b, v2.8b
+ rev64 v11.8b, v3.8b
+ add x9,x7,x3
+
+ rev64 v12.8b, v4.8b
+ subs x1,x1,#8
+
+ rev64 v13.8b, v5.8b
+ rev64 v14.8b, v6.8b
+ rev64 v15.8b, v7.8b
+ add x14,x9,x3
+
+ beq epilogue_mode2
+
+ sub x12,x4,#8
+
+kernel_mode2:
+
+ st1 {v8.8b},[x6],x5
+ st1 {v9.8b},[x7],x5
+ subs x11,x11,#8
+
+ st1 {v10.8b},[x9],x5
+ add x20,x2,#8
+ csel x2, x20, x2,gt
+
+ st1 {v11.8b},[x14],x5
+ st1 {v12.8b},[x6],x5
+ csel x11, x4, x11,le
+
+ st1 {v13.8b},[x7],x5
+ st1 {v14.8b},[x9],x5
+ add x20, x2, x3, lsl #2
+ csel x2, x20, x2,le
+
+ st1 {v15.8b},[x14],x5
+ ld1 {v0.8b},[x0],x8
+ sub x14,x4,#8
+
+ ld1 {v1.8b},[x10],x8
+ ld1 {v2.8b},[x0],x8
+ add x20, x2, #8
+ csel x2, x20, x2,le
+
+ ld1 {v3.8b},[x10],x8
+ ld1 {v4.8b},[x0],x8
+ sub x20, x6, x14
+ csel x2, x20, x2,le
+
+ ld1 {v5.8b},[x10],x8
+ subs x12,x12,#8
+
+ ld1 {v6.8b},[x0],x8
+ mov x6, x2
+
+ ld1 {v7.8b},[x10],x8
+ add x20, x0, x4
+ csel x0, x20, x0,le
+
+ rev64 v8.8b, v0.8b
+ add x7, x6, x3
+
+ rev64 v9.8b, v1.8b
+ sub x20, x0, #8
+ csel x0, x20, x0,le
+
+ rev64 v10.8b, v2.8b
+ csel x12, x4, x12,le
+
+ rev64 v11.8b, v3.8b
+ add x9, x7, x3
+
+ rev64 v12.8b, v4.8b
+ add x10,x0,#-1
+
+ rev64 v13.8b, v5.8b
+ subs x1, x1, #8
+
+ rev64 v14.8b, v6.8b
+ add x14, x9, x3
+
+ rev64 v15.8b, v7.8b
+
+ bne kernel_mode2
+
+epilogue_mode2:
+
+ st1 {v8.8b},[x6],x5
+ st1 {v9.8b},[x7],x5
+ st1 {v10.8b},[x9],x5
+ st1 {v11.8b},[x14],x5
+ st1 {v12.8b},[x6],x5
+ st1 {v13.8b},[x7],x5
+ st1 {v14.8b},[x9],x5
+ st1 {v15.8b},[x14],x5
+
+ b end_func
+
+mode2_4:
+
+ mov x8,#-2
+ sub x0,x0,#1
+ add x10,x0,#-1
+
+ ld1 {v0.8b},[x0],x8
+ add x5,x2,x3
+ ld1 {v2.8b},[x10],x8
+ add x6,x5,x3
+ ld1 {v4.8b},[x0]
+ add x7,x6,x3
+ ld1 {v6.8b},[x10]
+
+ rev64 v1.8b, v0.8b
+ rev64 v3.8b, v2.8b
+
+
+
+ st1 {v1.s}[0],[x2]
+ rev64 v5.8b, v4.8b
+ st1 {v3.s}[0],[x5]
+ rev64 v7.8b, v6.8b
+ st1 {v5.s}[0],[x6]
+ st1 {v7.s}[0],[x7]
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_18_34.s b/common/arm64/ihevc_intra_pred_luma_mode_18_34.s
new file mode 100644
index 0000000..11e1792
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode_18_34.s
@@ -0,0 +1,288 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_mode_18_34.s
+//*
+//* @brief
+//* contains function definitions for intra prediction modes 18 and 34.
+//* functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* yogeswaran rs
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intraprediction filter for modes 18 and 34
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
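+
+//a c-level sketch of modes 18 and 34, added for readability (an
+//illustration, not part of the original sources): both use angle +/-32,
+//so the fractional part is zero and each row is a shifted copy of the
+//reference:
+//
+//    /* mode 34 (angle +32) */
+//    pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt + 2 + row + col];
+//    /* mode 18 (angle -32) */
+//    pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - row + col];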
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_18_34_av8
+
+.type ihevc_intra_pred_luma_mode_18_34_av8, %function
+
+ihevc_intra_pred_luma_mode_18_34_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ cmp x4,#4
+ beq mode2_4
+
+ mov x11,x4
+ mov x12,x4
+ sub x14,x4,#8
+
+ add x0,x0,x4,lsl #1
+
+ cmp x5,#0x22
+ mov x10,x2
+
+ add x0,x0,#2
+ sub x20,x0,#2
+ csel x0, x20, x0,ne
+ mov x20,#1
+ csel x6, x20, x6,eq
+ mov x20,#-1
+ csel x6, x20, x6,ne
+ mov x8,x0
+
+prologue_cpy_32:
+
+ ld1 {v0.8b},[x8],x6
+ lsr x1, x4, #3
+ ld1 {v1.8b},[x8],x6
+ mul x1, x4, x1
+ ld1 {v2.8b},[x8],x6
+ ld1 {v3.8b},[x8],x6
+ subs x1,x1,#8
+ ld1 {v4.8b},[x8],x6
+ ld1 {v5.8b},[x8],x6
+ ld1 {v6.8b},[x8],x6
+
+ ld1 {v7.8b},[x8],x6
+
+
+ beq epilogue_mode2
+ sub x11,x11,#8
+
+ cmp x5,#0x22
+ add x20,x0,#8
+ csel x0, x20, x0,ne
+ csel x8, x0, x8,ne
+ bne kernel_mode18
+ //add x8,x0,#8
+
+kernel_mode2:
+ st1 {v0.8b},[x10],x3
+ st1 {v1.8b},[x10],x3
+ subs x12,x12,#8
+ st1 {v2.8b},[x10],x3
+ add x20,x2,#8
+ csel x2, x20, x2,ne
+ st1 {v3.8b},[x10],x3
+
+ ld1 {v0.8b},[x8],x6
+ st1 {v4.8b},[x10],x3
+
+ st1 {v5.8b},[x10],x3
+ ld1 {v1.8b},[x8],x6
+ st1 {v6.8b},[x10],x3
+ ld1 {v2.8b},[x8],x6
+ st1 {v7.8b},[x10],x3
+
+ ld1 {v3.8b},[x8],x6
+ sub x20,x10,x14
+ csel x2, x20, x2,eq
+ ld1 {v4.8b},[x8],x6
+ mov x10,x2
+ ld1 {v5.8b},[x8],x6
+ csel x12, x4, x12,eq
+ ld1 {v6.8b},[x8],x6
+ subs x11,x11,#8
+
+ ld1 {v7.8b},[x8],x6
+
+ add x20,x0,#8
+ csel x0, x20, x0,eq
+ csel x11, x4, x11,eq
+ csel x8, x0, x8,eq
+
+ subs x1, x1, #8
+
+ bne kernel_mode2
+
+ b epilogue_mode2
+
+kernel_mode18:
+ st1 {v0.8b},[x10],x3
+ st1 {v1.8b},[x10],x3
+ subs x12,x12,#8
+ st1 {v2.8b},[x10],x3
+ add x20,x2,#8
+ csel x2, x20, x2,ne
+ st1 {v3.8b},[x10],x3
+
+ ld1 {v0.8b},[x8],x6
+ st1 {v4.8b},[x10],x3
+
+ st1 {v5.8b},[x10],x3
+ ld1 {v1.8b},[x8],x6
+
+ st1 {v6.8b},[x10],x3
+ ld1 {v2.8b},[x8],x6
+ st1 {v7.8b},[x10],x3
+
+ ld1 {v3.8b},[x8],x6
+ sub x20,x10,x14
+ csel x2, x20, x2,eq
+ ld1 {v4.8b},[x8],x6
+ mov x10,x2
+ ld1 {v5.8b},[x8],x6
+ csel x12, x4, x12,eq
+ ld1 {v6.8b},[x8],x6
+ subs x11,x11,#8
+ ld1 {v7.8b},[x8],x6
+
+ add x20,x0,#8
+ csel x0, x20, x0,ne
+ csel x11, x4, x11,eq
+ sub x20,x8,x14
+ csel x0, x20, x0,eq
+ subs x1, x1, #8
+ mov x8,x0
+
+ bne kernel_mode18
+
+
+epilogue_mode2:
+
+ st1 {v0.8b},[x10],x3
+ st1 {v1.8b},[x10],x3
+ st1 {v2.8b},[x10],x3
+ st1 {v3.8b},[x10],x3
+ st1 {v4.8b},[x10],x3
+ st1 {v5.8b},[x10],x3
+ st1 {v6.8b},[x10],x3
+ st1 {v7.8b},[x10],x3
+
+ b end_func
+
+mode2_4:
+
+ add x0,x0,#10
+ cmp x5,#0x22
+ sub x20,x0,#2
+ csel x0, x20, x0,ne
+
+ mov x20,#1
+ csel x8, x20, x8,eq
+ mov x20,#-1
+ csel x8, x20, x8,ne
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.s}[0],[x2],x3
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.s}[0],[x2],x3
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.s}[0],[x2],x3
+
+ ld1 {v0.8b},[x0],x8
+ st1 {v0.s}[0],[x2],x3
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
new file mode 100644
index 0000000..79964f7
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
@@ -0,0 +1,555 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_mode_27_to_33.s
+//*
+//* @brief
+//* contains function definition for intra prediction interpolation filters
+//*
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//* - ihevc_intra_pred_luma_mode_27_to_33()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* intra prediction interpolation filter for luma mode 27 to mode 33
+//*
+//* @par description:
+//* intraprediction for mode 27 to 33 (positive angle, vertical mode) with
+//* reference neighboring samples location pointed by 'pu1_ref' to the tu
+//* block location pointed by 'pu1_dst'
+//*
+//* @param[in] pu1_src
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* integer transform block size
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
+// word32 src_strd,
+// uword8 *pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
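+//x4 => nt (w4), x5 => mode (w5) per the aarch64 calling convention
+
+//a c-level sketch of the interpolation implemented below (it mirrors the
+//pos/idx/fract comments in the code; an illustration, not part of the
+//original sources):
+//
+//    for(row = 0; row < nt; row++)
+//    {
+//        pos   = (row + 1) * intra_pred_ang;
+//        idx   = pos >> 5;
+//        fract = pos & 31;
+//        for(col = 0; col < nt; col++)
+//            pu1_dst[row * dst_strd + col] = (uword8)
+//                (((32 - fract) * pu1_ref[two_nt + 1 + col + idx]
+//                  +      fract * pu1_ref[two_nt + 2 + col + idx]
+//                  + 16) >> 5);
+//    }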
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_27_to_33_av8
+.extern gai4_ihevc_ang_table
+.extern gau1_ihevc_planar_factor
+
+.type ihevc_intra_pred_luma_mode_27_to_33_av8, %function
+
+ihevc_intra_pred_luma_mode_27_to_33_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
+ ldr x6, [x6, #:got_lo12:gai4_ihevc_ang_table]
+
+ lsl x7,x4,#1 //two_nt
+
+ add x8,x6,x5,lsl #2 //*gai4_ihevc_ang_table[mode]
+ ldr w9, [x8] //intra_pred_ang = gai4_ihevc_ang_table[mode]
+ sxtw x9,w9
+ adrp x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
+ ldr x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
+ add x6,x1,#1
+
+ tst x4,#7
+ add x8,x0,x7 //pu1_ref + two_nt
+ mov x14,#0 //row
+ mov x12,x4
+ bne core_loop_4
+
+core_loop_8:
+ add x8,x8,#1 //pu1_ref_main_idx += (two_nt + 1)
+ dup v0.8b,w9 //intra_pred_ang
+ lsr x12, x4, #3 //divide by 8
+
+ movi v1.8b, #32
+ mul x7, x4, x12
+
+ movi v6.8h, #31
+ //lsl x12,x3,#3
+
+ mov x1,x8
+ //sub x12,x12,x4
+ mov x5,x4
+ mov x11,#1
+
+prologue:
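+    //the 8x8 kernel is software pipelined: eight result rows, tagged
+    //(i)..(viii) in the comments, are kept in flight so that loads,
+    //multiplies and stores of different rows overlap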
+ ld1 {v3.8b},[x6] //loads the row value
+ umull v2.8h, v3.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v5.8b, v2.8h,#5 //idx = pos >> 5
+
+ dup v31.8b, v4.8b[0]
+ add x0,x2,x3
+
+ umov w14, v5.2s[0] //(i row)extract idx to the r register
+ sxtw x14,w14
+
+ dup v29.8b, v4.8b[1] //(ii)
+ and x9,x14,#0xff //(i row) get the last byte
+
+ add x10,x8,x9 //(i row)*pu1_ref[ref_main_idx]
+
+ asr x14,x14,#8 //(ii)shift by 8
+ ld1 {v8.8b},[x10],x11 //(i row)ref_main_idx
+ and x9,x14,#0xff //(ii)get the last byte
+
+ asr x14,x14,#8 //(iii)
+ ld1 {v9.8b},[x10] //(i row)ref_main_idx_1
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ and x9,x14,#0xff //(iii)
+ sub v30.8b, v1.8b , v31.8b //32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr x14,x14,#8 //(iv)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ and x9,x14,#0xff //(iv)
+
+ dup v25.8b, v4.8b[3] //(iv)
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i row)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+
+ dup v31.8b, v4.8b[4] //(v)
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ umov w14, v5.2s[1] //extract idx to the r register
+ sxtw x14,w14
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v10.8b},[x2],#8 //(i row)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ and x9,x14,#0xff //(v)
+ dup v29.8b, v4.8b[5] //(vi)
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+
+ asr x14,x14,#8 //(vi)
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and x9,x14,#0xff //(vi)
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+ dup v27.8b, v4.8b[6] //(vii)
+ asr x14,x14,#8 //(vii)
+
+ and x9,x14,#0xff //(vii)
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+
+ asr x14,x14,#8 //(viii)
+ dup v25.8b, v4.8b[7] //(viii)
+ and x9,x14,#0xff //(viii)
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ subs x4,x4,#8
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ add x20,x8,#8
+ csel x8, x20, x8,gt
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ sub x20,x7,#8
+ csel x7, x20, x7,gt
+
+ st1 {v10.8b},[x0],x3 //(v)
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+
+ beq epilogue
+
+ ld1 {v5.8b},[x6] //loads the row value
+ umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ xtn v4.8b, v4.8h
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+ umov w14, v3.2s[0] //(i)extract idx to the r register
+ sxtw x14,w14
+ and x9,x14,#0xff //(i)
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+kernel_8_rows:
+ asr x14,x14,#8 //(ii)
+ dup v31.8b, v4.8b[0]
+ subs x4,x4,#8
+
+ ld1 {v8.8b},[x10],x11 //(i)ref_main_idx
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+ and x9,x14,#0xff //(ii)
+ add x20,x6,#8 //increment the row value
+ csel x6, x20, x6,le
+
+ ld1 {v9.8b},[x10] //(i)ref_main_idx_1
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(ii)*pu1_ref[ref_main_idx]
+
+ ld1 {v5.8b},[x6] //loads the row value
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ asr x14,x14,#8 //(iii)
+
+ dup v29.8b, v4.8b[1] //(ii)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+ and x9,x14,#0xff //(iii)
+
+ st1 {v14.8b},[x0],x3 //(vi)
+ sub v30.8b, v1.8b , v31.8b //(i)32-fract(dup_const_32_fract)
+ add x10,x8,x9 //(iii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(ii)ref_main_idx
+ umull v10.8h, v8.8b, v30.8b //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
+ asr x14,x14,#8 //(iv)
+
+ ld1 {v13.8b},[x12] //(ii)ref_main_idx_1
+ umlal v10.8h, v9.8b, v31.8b //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and x9,x14,#0xff //(iv)
+
+ umov w14, v3.2s[1] //extract idx to the r register
+ sxtw x14,w14
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v27.8b, v4.8b[2] //(iii)
+ sub v28.8b, v1.8b , v29.8b //(ii)32-fract(dup_const_32_fract)
+ csel x4, x5, x4,le //reload nt
+
+ ld1 {v16.8b},[x10],x11 //(iii)ref_main_idx
+ umull v14.8h, v12.8b, v28.8b //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x12,x8,x9 //(iv)*pu1_ref[ref_main_idx]
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ umlal v14.8h, v13.8b, v29.8b //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v17.8b},[x10] //(iii)ref_main_idx_1
+ rshrn v10.8b, v10.8h,#5 //(i)shift_res = vrshrn_n_u16(add_res, 5)
+
+ dup v25.8b, v4.8b[3] //(iv)
+ umull v2.8h, v5.8b, v0.8b //pos = ((row + 1) * intra_pred_ang)
+
+ st1 {v22.8b},[x0] //(viii)
+ sub v26.8b, v1.8b , v27.8b //(iii)32-fract(dup_const_32_fract)
+
+ ld1 {v20.8b},[x12],x11 //(iv)ref_main_idx
+ umull v18.8h, v16.8b, v26.8b //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ add x0,x2,x3
+
+ ld1 {v21.8b},[x12] //(iv)ref_main_idx_1
+ umlal v18.8h, v17.8b, v27.8b //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ and x9,x14,#0xff //(v)
+
+ dup v31.8b, v4.8b[4] //(v)
+ rshrn v14.8b, v14.8h,#5 //(ii)shift_res = vrshrn_n_u16(add_res, 5)
+ add x10,x8,x9 //(v)*pu1_ref[ref_main_idx]
+
+ st1 {v10.8b},[x2],#8 //(i)
+ sub v24.8b, v1.8b , v25.8b //(iv)32-fract(dup_const_32_fract)
+ asr x14,x14,#8 //(vi)
+
+ dup v29.8b, v4.8b[5] //(vi)
+ umull v22.8h, v20.8b, v24.8b //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
+ and x9,x14,#0xff //(vi)
+
+ dup v27.8b, v4.8b[6] //(vii)
+ umlal v22.8h, v21.8b, v25.8b //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x12,x8,x9 //(vi)*pu1_ref[ref_main_idx]
+
+ dup v25.8b, v4.8b[7] //(viii)
+ rshrn v18.8b, v18.8h,#5 //(iii)shift_res = vrshrn_n_u16(add_res, 5)
+ asr x14,x14,#8 //(vii)
+
+ ld1 {v8.8b},[x10],x11 //(v)ref_main_idx
+ and v4.16b, v2.16b , v6.16b //dup_const_fract(fract = pos & (31))
+ and x9,x14,#0xff //(vii)
+
+ ld1 {v9.8b},[x10] //(v)ref_main_idx_1
+ shrn v3.8b, v2.8h,#5 //idx = pos >> 5
+ asr x14,x14,#8 //(viii)
+
+ st1 {v14.8b},[x0],x3 //(ii)
+ rshrn v22.8b, v22.8h,#5 //(iv)shift_res = vrshrn_n_u16(add_res, 5)
+ add x10,x8,x9 //(vii)*pu1_ref[ref_main_idx]
+
+ ld1 {v12.8b},[x12],x11 //(vi)ref_main_idx
+ sub v30.8b, v1.8b , v31.8b //(v)32-fract(dup_const_32_fract)
+ and x9,x14,#0xff //(viii)
+
+ ld1 {v13.8b},[x12] //(vi)ref_main_idx_1
+ umull v10.8h, v8.8b, v30.8b //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ umov w14, v3.2s[0] //(i)extract idx to the r register
+ sxtw x14,w14
+ umlal v10.8h, v9.8b, v31.8b //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x12,x8,x9 //(viii)*pu1_ref[ref_main_idx]
+
+ ld1 {v16.8b},[x10],x11 //(vii)ref_main_idx
+ sub v28.8b, v1.8b , v29.8b //(vi)32-fract(dup_const_32_fract)
+
+ st1 {v18.8b},[x0],x3 //(iii)
+ umull v14.8h, v12.8b, v28.8b //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
+ csel x8, x1, x8,le //reload the source to pu1_src+2nt
+
+ ld1 {v17.8b},[x10] //(vii)ref_main_idx_1
+ umlal v14.8h, v13.8b, v29.8b //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x20,x8,#8 //increment the source next set 8 columns in same row
+ csel x8, x20, x8,gt
+
+ ld1 {v20.8b},[x12],x11 //(viii)ref_main_idx
+ rshrn v10.8b, v10.8h,#5 //(v)shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v21.8b},[x12] //(viii)ref_main_idx_1
+ sub v26.8b, v1.8b , v27.8b //(vii)32-fract(dup_const_32_fract)
+ lsl x20, x3,#3
+ csel x12,x20,x12,le
+
+ st1 {v22.8b},[x0],x3 //(iv)
+ umull v18.8h, v16.8b, v26.8b //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ sub x20,x12,x5
+ csel x12, x20, x12,le
+
+ st1 {v10.8b},[x0],x3 //(v)
+ umlal v18.8h, v17.8b, v27.8b //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
+ add x20,x2,x12 //increment the dst pointer to 8*dst_strd - nt
+ csel x2, x20, x2,le
+
+ xtn v4.8b, v4.8h
+ rshrn v14.8b, v14.8h,#5 //(vi)shift_res = vrshrn_n_u16(add_res, 5)
+ and x9,x14,#0xff //(i)
+
+ subs x7,x7,#8
+ add x10,x8,x9 //(i)*pu1_ref[ref_main_idx]
+
+ bne kernel_8_rows
+
+epilogue:
+ st1 {v14.8b},[x0],x3 //(vi)
+ rshrn v18.8b, v18.8h,#5 //(vii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ sub v24.8b, v1.8b , v25.8b //(viii)32-fract(dup_const_32_fract)
+ umull v22.8h, v20.8b, v24.8b //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v25.8b //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v18.8b},[x0],x3 //(vii)
+ rshrn v22.8b, v22.8h,#5 //(viii)shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.8b},[x0],x3 //(viii)
+ b end_loops
+
+core_loop_4:
+ add x10,x8,#1 //pu1_ref_main_idx += (two_nt + 1)
+ add x11,x8,#2 //pu1_ref_main_idx_1 += (two_nt + 2)
+ mov x8,#0
+
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+ add x20,x10,#1 //pu1_ref_main_idx += 1
+ csel x10, x20, x10,gt
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+ dup v0.8b,w5 //dup_const_fract
+ sub x20,x5,#32
+ neg x4, x20
+ dup v1.8b,w4 //dup_const_32_fract
+
+//inner_loop_4
+ ld1 {v2.s}[0],[x10] //ref_main_idx
+ add x8,x8,#1
+ mov x14,x5 //fract_prev = fract
+
+ ld1 {v3.s}[0],[x11] //ref_main_idx_1
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+ add x20,x10,#1 //pu1_ref_main_idx += 1
+ csel x10, x20, x10,gt
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+
+ dup v6.8b,w5 //dup_const_fract
+ umull v4.8h, v2.8b, v1.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v7.8b,w4 //dup_const_32_fract
+ umlal v4.8h, v3.8b, v0.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v8.s}[0],[x10] //ref_main_idx
+ add x8,x8,#1
+
+ ld1 {v9.s}[0],[x11] //ref_main_idx_1
+ rshrn v4.8b, v4.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ mov x14,x5 //fract_prev = fract
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+ add x20,x10,#1 //pu1_ref_main_idx += 1
+ csel x10, x20, x10,gt
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+
+ dup v12.8b,w5 //dup_const_fract
+ umull v10.8h, v8.8b, v7.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v13.8b,w4 //dup_const_32_fract
+ umlal v10.8h, v9.8b, v6.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v14.s}[0],[x10] //ref_main_idx
+ add x8,x8,#1
+
+ st1 {v4.s}[0],[x2],x3
+ rshrn v10.8b, v10.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ ld1 {v15.s}[0],[x11] //ref_main_idx_1
+ mov x14,x5 //fract_prev = fract
+ add x5,x8,#1 //row + 1
+ mul x5, x5, x9 //pos = ((row + 1) * intra_pred_ang)
+ and x5,x5,#31 //fract = pos & (31)
+ cmp x14,x5 //if(fract_prev > fract)
+ add x20,x10,#1 //pu1_ref_main_idx += 1
+ csel x10, x20, x10,gt
+ add x11,x10,#1 //pu1_ref_main_idx_1 += 1
+
+ dup v18.8b,w5 //dup_const_fract
+ umull v16.8h, v14.8b, v13.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+
+ sub x20,x5,#32
+ neg x4, x20
+ dup v19.8b,w4 //dup_const_32_fract
+ umlal v16.8h, v15.8b, v12.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ ld1 {v20.s}[0],[x10] //ref_main_idx
+
+ st1 {v10.s}[0],[x2],x3
+ rshrn v16.8b, v16.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+ ld1 {v21.s}[0],[x11] //ref_main_idx_1
+
+ umull v22.8h, v20.8b, v19.8b //vmull_u8(ref_main_idx, dup_const_32_fract)
+ umlal v22.8h, v21.8b, v18.8b //vmull_u8(ref_main_idx_1, dup_const_fract)
+
+ st1 {v16.s}[0],[x2],x3
+ rshrn v22.8b, v22.8h,#5 //shift_res = vrshrn_n_u16(add_res, 5)
+
+ st1 {v22.s}[0],[x2],x3
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
new file mode 100644
index 0000000..b6e8601
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
@@ -0,0 +1,567 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_mode_3_to_9.s
+//*
+//* @brief
+//* contains function definitions for intra prediction modes 3 to 9.
+//* functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intraprediction filter for modes 3 to 9
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* integer intraprediction mode
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//x4 => nt
+//x5 => mode
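+
+//a c-level sketch of modes 3 to 9, added for readability (an
+//illustration assuming the hm reference formulation; not part of the
+//original sources): these are horizontal modes, so the angle steps per
+//column and the left reference is walked upwards per row:
+//
+//    for(col = 0; col < nt; col++)
+//    {
+//        pos   = (col + 1) * intra_pred_ang;
+//        idx   = pos >> 5;
+//        fract = pos & 31;
+//        for(row = 0; row < nt; row++)
+//            pu1_dst[row * dst_strd + col] = (uword8)
+//                (((32 - fract) * pu1_ref[2 * nt - (row + idx + 1)]
+//                  +      fract * pu1_ref[2 * nt - (row + idx + 2)]
+//                  + 16) >> 5);
+//    }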
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_mode_3_to_9_av8
+.extern gai4_ihevc_ang_table
+.extern gai4_ihevc_inv_ang_table
+.extern col_for_intra_luma
+.extern idx_neg_idx_3_9
+
+
+.type ihevc_intra_pred_luma_mode_3_to_9_av8, %function
+
+ihevc_intra_pred_luma_mode_3_to_9_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x7, :got:gai4_ihevc_ang_table
+ ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]
+
+ adrp x8, :got:gai4_ihevc_inv_ang_table
+ ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
+
+ add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
+ ldr w7, [x7] //intra_pred_ang
+ sxtw x7,w7
+ dup v30.8b,w7 //intra_pred_ang
+
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+
+ cmp x4, #4
+
+ beq sz_4_proc
+ b prologue_8_16_32
+
+prologue_8_16_32:
+ lsr x10, x4, #3
+ ld1 {v31.8b},[x14],#8
+ mul x10, x4, x10 //block counter (dec by #8)
+
+ mov x11, x4 //col counter to be inc/dec by #8
+ smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+
+ sub x7, x5, #3
+ movi v2.8b, #1 //contains #1 for adding to get ref_main_idx + 1
+ adrp x12, :got:idx_neg_idx_3_9 //load least idx table
+ ldr x12, [x12, #:got_lo12:idx_neg_idx_3_9]
+ movi v3.8b, #2
+
+ add x12, x12, x7, lsl #4
+ mov x8, x12
+
+ mov x7, #8
+ sub x7, x7, x3, lsl #3 //x7 = 8-8x3
+
+ ldr w9, [x8]
+ sxtw x9,w9
+ add x1, x0, x4, lsl #1 //pu1_ref + nt
+
+ xtn v6.8b, v22.8h
+ dup v26.8b,w9 //least idx added to final idx values
+ sub x1, x1, #9 //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+
+ sub x6, x1, x9
+
+ ld1 {v0.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+ sshr v22.8h, v22.8h,#5
+
+ movi v29.8b, #31 //contains #31 for vand operation
+
+ movi v28.8b, #32
+
+ sqxtn v8.8b, v22.8h
+
+    and v6.8b, v6.8b , v29.8b       //fract values in v6 / idx values in v8
+
+ mov x0, #1
+
+ movi v27.8b, #7 //row 0 to 7
+
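+    //the tbl instructions below act as 8-lane byte gathers: v0 holds the
+    //16 reference bytes fetched from the least index, and each row's
+    //index vectors (v8/v9, v4/v5) select lanes from it, avoiding
+    //per-lane loads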
+ sub v8.8b, v8.8b , v2.8b //ref_main_idx (sub row)
+ sub v8.8b, v26.8b , v8.8b //ref_main_idx (row 0)
+    add v8.8b, v8.8b , v27.8b       //to compensate the pu1_src idx incremented by 8
+ sub v9.8b, v8.8b , v2.8b //ref_main_idx + 1 (row 0)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+ sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1)
+ sub v5.8b, v9.8b , v2.8b //ref_main_idx + 1 (row 1)
+
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
+ sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2)
+ sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 2)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
+ sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 3)
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
+ sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 4)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
+ sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
+ sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 5)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5)
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
+ sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
+ sub v9.8b, v9.8b , v3.8b //ref_main_idx + 1 (row 6)
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ umull v22.8h, v16.8b, v7.8b //mul (row 5)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
+ sub v5.8b, v5.8b , v3.8b //ref_main_idx + 1 (row 7)
+
+ st1 {v24.8b},[x2], x3 //st (row 4)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 5)
+
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v22.8b},[x2], x3 //st (row 5)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 6)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 7)
+
+ st1 {v20.8b},[x2], x3 //st (row 6)
+
+ subs x10, x10, #8 //subtract 8 and go to end if 8x8
+
+ st1 {v18.8b},[x2], x3 //st (row 7)
+
+ beq end_func
+
+ subs x11, x11, #8
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ add x20, x2, x7
+ csel x2, x20, x2,gt
+ csel x8, x12, x8,le
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+ add x20, x2, #8
+ csel x2, x20, x2,le
+ csel x11, x4, x11,le
+ bgt lbl284
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+lbl284:
+ add x20, x0, #8
+ csel x0, x20, x0,le
+
+ mov x5,x2
+ ld1 {v31.8b},[x14],#8
+ smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ xtn v10.8b, v12.8h
+ sshr v12.8h, v12.8h,#5
+ sqxtn v11.8b, v12.8h
+ ldr w9, [x8]
+ sxtw x9,w9
+ add x9, x0, x9
+ sub x9, x9, #1
+ dup v26.8b,w9
+ movi v16.8b, #8
+
+ sub x4,x4,#8
+
+kernel_8_16_32:
+
+ sub v8.8b, v26.8b , v11.8b //ref_main_idx
+ mov v26.8b, v10.8b
+
+ subs x11, x11, #8
+ sub x6, x1, x9
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+ add v8.8b, v8.8b , v16.8b //to compensate the pu1_src idx incremented by 8
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx - 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1
+ add x20, x0, #8
+ csel x0, x20, x0,le
+ add x20, x8, #4
+ csel x8, x20, x8,gt
+ ld1 {v0.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 5)
+
+ bgt lbl323
+ adrp x14, :got:col_for_intra_luma
+ ldr x14, [x14, #:got_lo12:col_for_intra_luma]
+lbl323:
+ csel x8, x12, x8,le
+ dup v27.8b,w0 //row value inc or reset accordingly
+
+ sub v4.8b, v8.8b , v2.8b //ref_main_idx (row 1)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ sub v5.8b, v9.8b , v2.8b //ref_main_idx - 1 (row 1)
+
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ ld1 {v31.8b},[x14],#8
+ and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0
+
+ st1 {v22.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 2)
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 2)
+
+ add x20, x4, #8
+ csel x11, x20, x11,le
+ ldr w9, [x8]
+ sxtw x9,w9
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 3)
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 3)
+
+ umull v22.8h, v10.8b, v7.8b //mul (row 1)
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ rshrn v24.8b, v24.8h,#5 //round shft (row 0)
+ st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)
+
+ sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 4)
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 4)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 2)
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 2)
+
+ smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ add x5,x2,x3,lsl#2
+ add x9, x0, x9
+
+ st1 {v24.8b},[x2], x3 //st (row 0)
+ rshrn v22.8b, v22.8h,#5 //round shft (row 1)
+
+ sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 5)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 4)
+ sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 3)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 4)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 3)
+
+ st1 {v22.8b},[x2], x3 //st (row 1)
+ rshrn v20.8b, v20.8h,#5 //round shft (row 2)
+
+ xtn v10.8b, v14.8h
+ sshr v14.8h, v14.8h,#5
+
+ sub v8.8b, v8.8b , v3.8b //ref_main_idx (row 6)
+ tbl v21.8b, {v0.16b},v4.8b //load from ref_main_idx (row 5)
+ sub v9.8b, v9.8b , v3.8b //ref_main_idx - 1 (row 6)
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 4)
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 5)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 4)
+
+ st1 {v20.8b},[x2], x3 //st (row 2)
+ rshrn v18.8b, v18.8h,#5 //round shft (row 3)
+
+ sub x9, x9, #1
+ sqxtn v11.8b, v14.8h
+
+ sub v4.8b, v4.8b , v3.8b //ref_main_idx (row 7)
+ tbl v14.8b, {v0.16b},v8.8b //load from ref_main_idx (row 6)
+ sub v5.8b, v5.8b , v3.8b //ref_main_idx - 1 (row 7)
+
+ umull v22.8h, v21.8b, v7.8b //mul (row 5)
+ tbl v15.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 6)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 5)
+
+ add v11.8b, v27.8b , v11.8b //ref_main_idx (add row)
+ dup v26.8b,w9
+
+ st1 {v18.8b},[x2], x3 //st (row 3)
+ rshrn v24.8b, v24.8h,#5 //round shft (row 4)
+
+ add x2, x2, x3, lsl #2
+ sub v11.8b, v11.8b , v2.8b //ref_main_idx -1 (sub 1)
+ add x20, x7, x2
+ csel x2, x20, x2,gt
+
+ sub x20, x2, x4
+ csel x2, x20, x2,le
+
+ subs x10, x10, #8 //subtract 8 and go to end if 8x8
+
+ bne kernel_8_16_32
+
+epil_8_16_32:
+ tbl v10.8b, {v0.16b},v4.8b //load from ref_main_idx (row 7)
+
+ umull v20.8h, v14.8b, v7.8b //mul (row 6)
+ tbl v11.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 7)
+ umlal v20.8h, v15.8b, v6.8b //mul (row 6)
+
+ st1 {v24.8b},[x5], x3 //st (row 4)
+ rshrn v24.8b, v22.8h,#5 //round shft (row 5)
+
+ umull v18.8h, v10.8b, v7.8b //mul (row 7)
+ umlal v18.8h, v11.8b, v6.8b //mul (row 7)
+
+ st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
+ rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)
+
+ st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
+ rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)
+
+ st1 {v18.8b},[x5], x3 //st (row 7)
+
+ b end_func
+
+sz_4_proc:
+ ld1 {v31.8b},[x14]
+ movi v2.8b, #1 //contains #1 for adding to get ref_main_idx - 1
+
+ movi v3.8b, #2
+ adrp x12, :got:idx_neg_idx_3_9 //load least idx table
+ ldr x12, [x12, #:got_lo12:idx_neg_idx_3_9]
+
+ smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
+ sub x7, x5, #3
+
+ add x12, x12, x7, lsl #4
+ mov x8, x12
+
+ ldr w9, [x8]
+ sxtw x9,w9
+
+ dup v26.8b,w9 //least idx added to final idx values
+ add x6, x0, x4, lsl #1 //pu1_ref + 2nt
+
+ xtn v6.8b, v22.8h
+ sub x6, x6, #9 //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
+ sub x6, x6, x9
+
+ ld1 {v0.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
+
+ movi v29.8b, #31 //contains #31 for vand operation
+
+ movi v28.8b, #32
+
+ sshr v22.8h, v22.8h,#5
+ sqxtn v8.8b, v22.8h
+
+ and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0
+ sub v7.8b, v28.8b , v6.8b //32-fract
+
+ movi v27.8b, #7 //row 0 to 7(row-1)
+ sub v8.8b, v8.8b , v2.8b //ref_main_idx (add 1)
+ sub v8.8b, v26.8b , v8.8b //ref_main_idx
+ add v8.8b, v8.8b , v27.8b //to compensate the pu1_src idx incremented by 8
+ sub v9.8b, v8.8b , v2.8b //ref_main_idx - 1
+
+ sub v4.8b, v8.8b , v2.8b //row 1 ref_main_idx
+ sub v5.8b, v9.8b , v2.8b
+
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 0)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 0)
+
+
+ umull v24.8h, v12.8b, v7.8b //mul (row 0)
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 1)
+ umlal v24.8h, v13.8b, v6.8b //mul (row 0)
+
+ sub v8.8b, v8.8b , v3.8b //idx (row 2)
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 1)
+ sub v9.8b, v9.8b , v3.8b //idx+1 (row 2)
+
+ umull v22.8h, v16.8b, v7.8b //mul (row 1)
+ tbl v12.8b, {v0.16b},v8.8b //load from ref_main_idx (row 2)
+ umlal v22.8h, v17.8b, v6.8b //mul (row 1)
+
+ rshrn v24.8b, v24.8h,#5 //round shift (row 0)
+
+ sub v4.8b, v4.8b , v3.8b //idx (row 3)
+ tbl v13.8b, {v0.16b},v9.8b //load from ref_main_idx + 1 (row 2)
+ sub v5.8b, v5.8b , v3.8b //idx+1 (row 3)
+
+ umull v20.8h, v12.8b, v7.8b //mul (row 2)
+ tbl v16.8b, {v0.16b},v4.8b //load from ref_main_idx (row 3)
+ umlal v20.8h, v13.8b, v6.8b //mul (row 2)
+
+ st1 {v24.s}[0],[x2], x3 //st row 0
+ rshrn v22.8b, v22.8h,#5 //round shift (row 1)
+
+ tbl v17.8b, {v0.16b},v5.8b //load from ref_main_idx + 1 (row 3)
+
+ umull v18.8h, v16.8b, v7.8b //mul (row 3)
+ umlal v18.8h, v17.8b, v6.8b //mul (row 3)
+
+ st1 {v22.s}[0],[x2], x3 //st row 1
+ rshrn v20.8b, v20.8h,#5 //round shift (row 2)
+
+ st1 {v20.s}[0],[x2], x3 //st row 2
+
+ rshrn v18.8b, v18.8h,#5 //round shift (row 3)
+
+ st1 {v18.s}[0],[x2], x3 //st (row 3)
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_planar.s b/common/arm64/ihevc_intra_pred_luma_planar.s
new file mode 100644
index 0000000..d2f27a2
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_planar.s
@@ -0,0 +1,569 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_planar.s
+//*
+//* @brief
+//* contains function definitions for luma planar intra prediction.
+//* functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intra prediction filter for planar mode
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] pi1_coeff
+//* word8 pointer to the planar coefficients
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode,
+// word32 pi1_coeff)
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
+// pi1_coeff
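
The planar kernel below evaluates, per pixel, the HEVC planar blend of
four reference samples; this C sketch mirrors the term names used in
the assembly comments under the decoder's pu1_ref layout. It is a
reading aid, not the production C path; src and the function name are
illustrative.

    /* Sketch only: HEVC planar luma prediction. src follows the
     * decoder's reference layout: left column at src[2nt-1-row],
     * top row from src[2nt+1], corners at src[nt-1] and src[3nt+1]. */
    void intra_pred_planar_sketch(const unsigned char *src,
                                  unsigned char *dst, int dst_strd,
                                  int nt, int log2nt)
    {
        for (int row = 0; row < nt; row++) {
            for (int col = 0; col < nt; col++) {
                int sum = (row + 1)      * src[nt - 1]           /* bottom-left   */
                        + (col + 1)      * src[3 * nt + 1]       /* top-right     */
                        + (nt - 1 - row) * src[2 * nt + 1 + col] /* top row       */
                        + (nt - 1 - col) * src[2 * nt - 1 - row] /* left column   */
                        + nt;                                    /* rounding bias */
                dst[row * dst_strd + col] =
                    (unsigned char)(sum >> (log2nt + 1));
            }
        }
    }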
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_planar_av8
+.extern gau1_ihevc_planar_factor
+.extern gau1_ihevc_planar_factor_1
+
+.type ihevc_intra_pred_luma_planar_av8, %function
+
+ihevc_intra_pred_luma_planar_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
+ ldr x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
+
+ clz w5,w4
+ sub x20, x5, #32
+ neg x5, x20
+ dup v14.8h,w5
+ neg v14.8h, v14.8h //shr value (so vneg)
+ dup v2.8b,w4 //nt
+ dup v16.8h,w4 //nt
+
+ sub x6, x4, #1 //nt-1
+ add x6, x6, x0
+ ldr w7, [x6]
+ sxtw x7,w7
+ dup v0.8b,w7 //src[nt-1]
+
+ add x6, x4, x4,lsl #1 //3nt
+ add x6, x6, #1 //3nt + 1
+ add x6, x6, x0
+ ldr w7, [x6]
+ sxtw x7,w7
+ dup v1.8b,w7 //src[3nt+1]
+
+ add x6, x4, x4 //2nt
+ add x14, x6, #1 //2nt+1
+ sub x6, x6, #1 //2nt-1
+ add x6, x6, x0 //&src[2nt-1]
+ add x14, x14, x0 //&src[2nt+1]
+
+ mov x8, #1 //row+1 (row is first 0)
+ sub x9, x4, x8 //nt-1-row (row is first 0)
+
+ dup v5.8b,w8 //row + 1
+ dup v6.8b,w9 //nt - 1 - row
+ mov v7.8b, v5.8b //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
+
+ add x12, x11, #1 //coeffs (to be reloaded after every row)
+ mov x1, x4 //nt (row counter) (dec after every row)
+ mov x5, x2 //dst (to be reloaded after every row and inc by dst_strd)
+ mov x10, #8 //increment for the coeffs
+ mov x0, x14 //&src[2nt+1] (to be reloaded after every row)
+
+ cmp x4, #4
+ beq tf_sz_4
+
+//@ ========== ***************** =====================
+prolog:
+tf_sz_8_16_32:
+
+ mov x7, x4 //column counter (set to no of cols)
+ lsr x9, x4, #3 //divide nt by 8
+ mul x7, x7, x9 //x7 = nt * (nt/8), total block counter (dec by #8)
+ adrp x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
+ ldr x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
+ sub x6, x6, #7
+ mov x8, x2
+ lsl x9, x3, #3 //8*stride
+ sub x20, x9, #8 //8*stride - 8
+ neg x9, x20 //x9 = 8 - 8*stride
+ mov x10, x4 //nt
+ sub x10, x10, #8 //nt - 8
+
+col_loop_8_16_32:
+
+ ld1 {v8.8b},[x12] //(1-8)load 8 coeffs [col+1]
+ dup v12.8h,w4 //(1)
+ ld1 {v4.8b},[x6] //(1-8)src[2nt-1-row]
+ sub v9.8b, v2.8b , v8.8b //(1-8)[nt-1-col]
+
+
+ umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1]
+
+ ld1 {v3.8b},[x14] //(1-8)load 8 src[2nt+1+col]
+ umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1]
+
+ dup v20.8b, v4.8b[7] //(1)
+ umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
+
+ dup v21.8b, v4.8b[6] //(2)
+ umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
+
+ dup v30.8h,w4 //(2)
+ add v5.8b, v5.8b , v7.8b //(1)
+
+ sub v6.8b, v6.8b , v7.8b //(1)
+
+ dup v22.8b, v4.8b[5] //(3)
+ umlal v30.8h, v5.8b, v0.8b //(2)
+
+ dup v28.8h,w4 //(3)
+ umlal v30.8h, v8.8b, v1.8b //(2)
+
+ umlal v30.8h, v6.8b, v3.8b //(2)
+ umlal v30.8h, v9.8b, v21.8b //(2)
+
+ sshl v12.8h, v12.8h, v14.8h //(1)shr
+
+ add v5.8b, v5.8b , v7.8b //(2)
+ sub v6.8b, v6.8b , v7.8b //(2)
+
+ xtn v12.8b, v12.8h //(1)
+ umlal v28.8h, v5.8b, v0.8b //(3)
+
+ dup v23.8b, v4.8b[4] //(4)
+ umlal v28.8h, v8.8b, v1.8b //(3)
+
+ dup v10.8h,w4 //(4)
+ umlal v28.8h, v6.8b, v3.8b //(3)
+
+ st1 {v12.8b},[x2], x3 //(1)str 8 values
+ umlal v28.8h, v9.8b, v22.8b //(3)
+
+ sshl v30.8h, v30.8h, v14.8h //(2)shr
+
+ add v5.8b, v5.8b , v7.8b //(3)
+ sub v6.8b, v6.8b , v7.8b //(3)
+
+ xtn v30.8b, v30.8h //(2)
+ umlal v10.8h, v5.8b, v0.8b //(4)
+
+ dup v20.8b, v4.8b[3] //(5)
+ umlal v10.8h, v8.8b, v1.8b //(4)
+
+ dup v16.8h,w4 //(5)
+ umlal v10.8h, v6.8b, v3.8b //(4)
+
+ st1 {v30.8b},[x2], x3 //(2)str 8 values
+ umlal v10.8h, v9.8b, v23.8b //(4)
+
+ sshl v28.8h, v28.8h, v14.8h //(3)shr
+
+ add v5.8b, v5.8b , v7.8b //(4)
+ sub v6.8b, v6.8b , v7.8b //(4)
+
+ xtn v28.8b, v28.8h //(3)
+ umlal v16.8h, v5.8b, v0.8b //(5)
+
+ dup v21.8b, v4.8b[2] //(6)
+ umlal v16.8h, v8.8b, v1.8b //(5)
+
+ dup v18.8h,w4 //(6)
+ umlal v16.8h, v6.8b, v3.8b //(5)
+
+ st1 {v28.8b},[x2], x3 //(3)str 8 values
+ umlal v16.8h, v9.8b, v20.8b //(5)
+
+ sshl v10.8h, v10.8h, v14.8h //(4)shr
+ add v5.8b, v5.8b , v7.8b //(5)
+ sub v6.8b, v6.8b , v7.8b //(5)
+
+ xtn v10.8b, v10.8h //(4)
+ umlal v18.8h, v5.8b, v0.8b //(6)
+
+ dup v22.8b, v4.8b[1] //(7)
+ umlal v18.8h, v8.8b, v1.8b //(6)
+
+ dup v26.8h,w4 //(7)
+ umlal v18.8h, v6.8b, v3.8b //(6)
+
+ st1 {v10.8b},[x2], x3 //(4)str 8 values
+ umlal v18.8h, v9.8b, v21.8b //(6)
+
+ sshl v16.8h, v16.8h, v14.8h //(5)shr
+
+ add v5.8b, v5.8b , v7.8b //(6)
+ sub v6.8b, v6.8b , v7.8b //(6)
+
+ xtn v16.8b, v16.8h //(5)
+ umlal v26.8h, v5.8b, v0.8b //(7)
+
+ dup v23.8b, v4.8b[0] //(8)
+ umlal v26.8h, v8.8b, v1.8b //(7)
+
+ dup v24.8h,w4 //(8)
+ umlal v26.8h, v6.8b, v3.8b //(7)
+
+ st1 {v16.8b},[x2], x3 //(5)str 8 values
+ umlal v26.8h, v9.8b, v22.8b //(7)
+
+ sshl v18.8h, v18.8h, v14.8h //(6)shr
+
+ add v5.8b, v5.8b , v7.8b //(7)
+ sub v6.8b, v6.8b , v7.8b //(7)
+
+ xtn v18.8b, v18.8h //(6)
+ umlal v24.8h, v5.8b, v0.8b //(8)
+
+
+ umlal v24.8h, v8.8b, v1.8b //(8)
+
+ umlal v24.8h, v6.8b, v3.8b //(8)
+
+ st1 {v18.8b},[x2], x3 //(6)str 8 values
+ umlal v24.8h, v9.8b, v23.8b //(8)
+
+ sshl v26.8h, v26.8h, v14.8h //(7)shr
+
+ subs x7, x7, #8
+
+ beq epilog
+
+ subs x1, x1, #8 //row counter
+ add x20, x12, #8 //col inc
+ csel x12, x20, x12,gt
+ add x20, x14, #8 //also for col inc
+ csel x14, x20, x14,gt
+ csel x1, x4, x1,le //nt reloaded (refresh the value)
+ add x20, x11, #1 //x12 reset
+ csel x12, x20, x12,le
+
+ csel x14, x0, x14,le //x14 reset
+ ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1]
+
+ sub x20, x6, #8 //for next set of rows
+ csel x6, x20, x6,le
+ ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col]
+
+ add x20, x5, #8
+ csel x5, x20, x5,le
+ dup v12.8h,w4 //(1n)(1)
+
+ ld1 {v5.8b},[x5]
+
+ ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row]
+ sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col]
+
+ dup v20.8b, v4.8b[7] //(1n)(1)
+ sub v6.8b, v2.8b , v5.8b
+
+ beq epilog
+
+kernel_plnr:
+
+ cmp x1, #0 // (cond loop)
+ sshl v24.8h, v24.8h, v14.8h //(8)shr
+
+ xtn v26.8b, v26.8h //(7)
+ umlal v12.8h, v5.8b, v0.8b //(1)(row+1) * src[nt-1]
+
+ xtn v24.8b, v24.8h //(8)
+ umlal v12.8h, v8.8b, v1.8b //(1)(col+1) * src[3nt+1]
+
+ dup v21.8b, v4.8b[6] //(2)
+ umlal v12.8h, v6.8b, v3.8b //(1)(nt-1-row) * src[2nt+1+col]
+
+ dup v30.8h,w4 //(2)
+ umlal v12.8h, v9.8b, v20.8b //(1)(nt-1-col) * src[2nt-1-row]
+
+ st1 {v26.8b},[x2], x3 //(7)str 8 values
+ add v5.8b, v5.8b , v7.8b //(1)
+
+ st1 {v24.8b},[x2], x3 //(8)str 8 values
+ sub v6.8b, v6.8b , v7.8b //(1)
+
+ add x20, x2, x9 //since more cols to fill, dst + 8 - 6*strd (cond loop)
+ csel x2, x20, x2,gt
+ umlal v30.8h, v5.8b, v0.8b //(2)
+
+ sub x20, x2, x10 //else go to next set of rows, dst - (nt-8) (cond loop)
+ csel x2, x20, x2,le
+ umlal v30.8h, v8.8b, v1.8b //(2)
+
+ dup v22.8b, v4.8b[5] //(3)
+ umlal v30.8h, v6.8b, v3.8b //(2)
+
+ dup v28.8h,w4 //(3)
+ umlal v30.8h, v9.8b, v21.8b //(2)
+
+ sshl v12.8h, v12.8h, v14.8h //(1)shr
+
+ add v5.8b, v5.8b , v7.8b //(2)
+ csel x1, x4, x1,le //nt reloaded (refresh the value) (cond loop)
+
+ sub v6.8b, v6.8b , v7.8b //(2)
+ subs x1, x1, #8 //row counter (loop)
+
+ xtn v12.8b, v12.8h //(1)
+ umlal v28.8h, v5.8b, v0.8b //(3)
+
+ dup v23.8b, v4.8b[4] //(4)
+ umlal v28.8h, v8.8b, v1.8b //(3)
+
+ dup v10.8h,w4 //(4)
+ umlal v28.8h, v6.8b, v3.8b //(3)
+
+ st1 {v12.8b},[x2], x3 //(1)str 8 values
+ umlal v28.8h, v9.8b, v22.8b //(3)
+
+ sshl v30.8h, v30.8h, v14.8h //(2)shr
+
+ add v5.8b, v5.8b , v7.8b //(3)
+
+ sub v6.8b, v6.8b , v7.8b //(3)
+
+ xtn v30.8b, v30.8h //(2)
+ umlal v10.8h, v5.8b, v0.8b //(4)
+
+ dup v20.8b, v4.8b[3] //(5)
+ umlal v10.8h, v8.8b, v1.8b //(4)
+
+ dup v16.8h,w4 //(5)
+ umlal v10.8h, v6.8b, v3.8b //(4)
+
+ st1 {v30.8b},[x2], x3 //(2)str 8 values
+ umlal v10.8h, v9.8b, v23.8b //(4)
+
+ sshl v28.8h, v28.8h, v14.8h //(3)shr
+
+ add v5.8b, v5.8b , v7.8b //(4)
+
+ sub v6.8b, v6.8b , v7.8b //(4)
+
+ xtn v28.8b, v28.8h //(3)
+ umlal v16.8h, v5.8b, v0.8b //(5)
+
+ dup v21.8b, v4.8b[2] //(6)
+ umlal v16.8h, v8.8b, v1.8b //(5)
+
+ dup v18.8h,w4 //(6)
+ umlal v16.8h, v6.8b, v3.8b //(5)
+
+ st1 {v28.8b},[x2], x3 //(3)str 8 values
+ umlal v16.8h, v9.8b, v20.8b //(5)
+
+ add x20, x11, #1 //x12 reset (cond loop)
+ csel x12, x20, x12,le
+ sshl v10.8h, v10.8h, v14.8h //(4)shr
+
+ add x20, x12, #8 //col inc (cond loop)
+ csel x12, x20, x12,gt
+ add v5.8b, v5.8b , v7.8b //(5)
+
+ add x20, x14, #8 //also for col inc (cond loop)
+ csel x14, x20, x14,gt
+ sub v6.8b, v6.8b , v7.8b //(5)
+
+ xtn v10.8b, v10.8h //(4)
+ umlal v18.8h, v5.8b, v0.8b //(6)
+
+ dup v22.8b, v4.8b[1] //(7)
+ umlal v18.8h, v8.8b, v1.8b //(6)
+
+ dup v26.8h,w4 //(7)
+ umlal v18.8h, v6.8b, v3.8b //(6)
+
+ st1 {v10.8b},[x2], x3 //(4)str 8 values
+ umlal v18.8h, v9.8b, v21.8b //(6)
+
+ csel x14, x0, x14,le //x14 reset (cond loop)
+ sshl v16.8h, v16.8h, v14.8h //(5)shr
+
+ sub x20, x6, #8 //for next set of rows (cond loop)
+ csel x6, x20, x6,le
+ add v5.8b, v5.8b , v7.8b //(6)
+
+ add x20, x5, #8 // (cond loop)
+ csel x5, x20, x5,le
+ sub v6.8b, v6.8b , v7.8b //(6)
+
+ xtn v16.8b, v16.8h //(5)
+ umlal v26.8h, v5.8b, v0.8b //(7)
+
+ dup v23.8b, v4.8b[0] //(8)
+ umlal v26.8h, v8.8b, v1.8b //(7)
+
+ dup v24.8h,w4 //(8)
+ umlal v26.8h, v6.8b, v3.8b //(7)
+
+ st1 {v16.8b},[x2], x3 //(5)str 8 values
+ umlal v26.8h, v9.8b, v22.8b //(7)
+
+ ld1 {v4.8b},[x6] //(1n)(1-8)src[2nt-1-row]
+ sshl v18.8h, v18.8h, v14.8h //(6)shr
+
+ add v5.8b, v5.8b , v7.8b //(7)
+
+ sub v6.8b, v6.8b , v7.8b //(7)
+
+ xtn v18.8b, v18.8h //(6)
+ umlal v24.8h, v5.8b, v0.8b //(8)
+
+ ld1 {v5.8b},[x5] //(row+1 value)
+ umlal v24.8h, v8.8b, v1.8b //(8)
+
+ dup v20.8b, v4.8b[7] //(1n)(1)
+ umlal v24.8h, v6.8b, v3.8b //(8)
+
+ st1 {v18.8b},[x2], x3 //(6)str 8 values
+ umlal v24.8h, v9.8b, v23.8b //(8)
+
+ ld1 {v8.8b},[x12] //(1n)(1-8)load 8 coeffs [col+1]
+ sub v6.8b, v2.8b , v5.8b //(nt-1-row) value
+
+ subs x7, x7, #8 //col counter
+
+ ld1 {v3.8b},[x14] //(1n)(1-8)load 8 src[2nt+1+col]
+ sshl v26.8h, v26.8h, v14.8h //(7)shr
+
+ dup v12.8h,w4 //(1n)(1)
+ sub v9.8b, v2.8b , v8.8b //(1n)(1-8)[nt-1-col]
+
+ bne kernel_plnr
+
+epilog:
+
+ xtn v26.8b, v26.8h //(7)
+ st1 {v26.8b},[x2], x3 //(7)str 8 values
+
+ sshl v24.8h, v24.8h, v14.8h //(8)shr
+ xtn v24.8b, v24.8h //(8)
+ st1 {v24.8b},[x2], x3 //(8)str 8 values
+
+//@ ========== ***************** =====================
+
+ beq end_loop
+
+tf_sz_4:
+ ld1 {v10.8b},[x14] //load src[2nt+1+col]
+ ld1 {v8.8b},[x12], x10 //load 8 coeffs [col+1]
+loop_sz_4:
+ mov x10, #4 //reduce inc to #4 for 4x4
+ ldr w7, [x6], #-1 //src[2nt-1-row] (dec to take into account row)
+ sxtw x7,w7
+ dup v4.8b,w7 //src[2nt-1-row]
+
+ sub v9.8b, v2.8b , v8.8b //[nt-1-col]
+
+ umull v12.8h, v5.8b, v0.8b //(row+1) * src[nt-1]
+ umlal v12.8h, v6.8b, v10.8b //(nt-1-row) * src[2nt+1+col]
+ umlal v12.8h, v8.8b, v1.8b //(col+1) * src[3nt+1]
+ umlal v12.8h, v9.8b, v4.8b //(nt-1-col) * src[2nt-1-row]
+// vadd.i16 q6, q6, q8 @add (nt)
+// vshl.s16 q6, q6, q7 @shr
+// vmovn.i16 d12, q6
+ rshrn v12.8b, v12.8h,#3
+ st1 {v12.s}[0],[x2], x3
+
+ add v5.8b, v5.8b , v7.8b //row++ [(row+1)++]
+ sub v6.8b, v6.8b , v7.8b //[nt-1-row]--
+ subs x1, x1, #1
+
+ bne loop_sz_4
+
+end_loop:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_intra_pred_luma_vert.s b/common/arm64/ihevc_intra_pred_luma_vert.s
new file mode 100644
index 0000000..56a20a0
--- /dev/null
+++ b/common/arm64/ihevc_intra_pred_luma_vert.s
@@ -0,0 +1,432 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_intra_pred_luma_vert.s
+//*
+//* @brief
+//* contains function definitions for luma vertical intra prediction.
+//* functions are coded using neon intrinsics and can be compiled
+//* using rvct
+//*
+//* @author
+//* akshaya mukund
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* luma intra prediction filter for vertical mode
+//*
+//* @par description:
+//*
+//* @param[in] pu1_ref
+//* uword8 pointer to the source
+//*
+//* @param[out] pu1_dst
+//* uword8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] nt
+//* size of transform block
+//*
+//* @param[in] mode
+//* type of filtering
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_intra_pred_luma_ver(uword8* pu1_ref,
+// word32 src_strd,
+// uword8* pu1_dst,
+// word32 dst_strd,
+// word32 nt,
+// word32 mode)
+//
+//**************variables vs registers*****************************************
+//x0 => *pu1_ref
+//x1 => src_strd
+//x2 => *pu1_dst
+//x3 => dst_strd
+
+//stack contents from #40
+// nt
+// mode
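
Vertical prediction copies the top reference row src[2nt+1 .. 3nt] into
every output row; in the 4/8/16 paths the first column is additionally
gradient-filtered from the left neighbours, which is what the
uhsub/sqadd/sqxtun sequence below implements. A C sketch of that
behaviour follows (illustrative names, not the production C path).

    /* Sketch only: HEVC vertical luma intra prediction. */
    static unsigned char clip_u8(int x)
    {
        return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    void intra_pred_vert_sketch(const unsigned char *src,
                                unsigned char *dst, int dst_strd, int nt)
    {
        for (int row = 0; row < nt; row++) {
            for (int col = 0; col < nt; col++)   /* copy top row down */
                dst[row * dst_strd + col] = src[2 * nt + 1 + col];
            if (nt < 32)                         /* 32x32 path skips the filter */
                dst[row * dst_strd] = clip_u8(src[2 * nt + 1] +
                    ((src[2 * nt - 1 - row] - src[2 * nt]) >> 1));
        }
    }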
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevc_intra_pred_luma_ver_av8
+
+.type ihevc_intra_pred_luma_ver_av8, %function
+
+ihevc_intra_pred_luma_ver_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ lsl x5, x4, #1 //2nt
+
+ cmp x4, #16
+ beq blk_16
+ blt blk_4_8
+
+ add x5, x5, #1 //2nt+1
+ add x6, x0, x5 //&src[2nt+1]
+
+copy_32:
+ add x5, x2, x3
+ ld1 {v20.8b, v21.8b}, [x6],#16 //16 loads (col 0:15)
+ add x8, x5, x3
+
+ add x10, x8, x3
+ ld1 {v22.8b, v23.8b}, [x6] //16 loads (col 16:31)
+ lsl x11, x3, #2
+
+ add x11, x11, #-16
+ st1 {v20.8b, v21.8b}, [x2],#16
+ st1 {v20.8b, v21.8b}, [x5],#16
+ st1 {v20.8b, v21.8b}, [x8],#16
+ st1 {v20.8b, v21.8b}, [x10],#16
+
+ st1 {v22.8b, v23.8b}, [x2], x11
+ st1 {v22.8b, v23.8b}, [x5], x11
+ st1 {v22.8b, v23.8b}, [x8], x11
+ st1 {v22.8b, v23.8b}, [x10], x11
+
+ subs x4, x4, #8
+
+kernel_copy_32:
+ st1 {v20.8b, v21.8b}, [x2],#16
+ st1 {v20.8b, v21.8b}, [x5],#16
+ st1 {v20.8b, v21.8b}, [x8],#16
+ st1 {v20.8b, v21.8b}, [x10],#16
+
+ st1 {v22.8b, v23.8b}, [x2], x11
+ st1 {v22.8b, v23.8b}, [x5], x11
+ st1 {v22.8b, v23.8b}, [x8], x11
+ st1 {v22.8b, v23.8b}, [x10], x11
+
+ subs x4, x4, #8
+
+ st1 {v20.8b, v21.8b}, [x2],#16
+ st1 {v20.8b, v21.8b}, [x5],#16
+ st1 {v20.8b, v21.8b}, [x8],#16
+ st1 {v20.8b, v21.8b}, [x10],#16
+
+ st1 {v22.8b, v23.8b}, [x2], x11
+ st1 {v22.8b, v23.8b}, [x5], x11
+ st1 {v22.8b, v23.8b}, [x8], x11
+ st1 {v22.8b, v23.8b}, [x10], x11
+
+ bne kernel_copy_32
+
+ st1 {v20.8b, v21.8b}, [x2],#16
+ st1 {v20.8b, v21.8b}, [x5],#16
+ st1 {v20.8b, v21.8b}, [x8],#16
+ st1 {v20.8b, v21.8b}, [x10],#16
+
+ st1 {v22.8b, v23.8b}, [x2], x11
+ st1 {v22.8b, v23.8b}, [x5], x11
+ st1 {v22.8b, v23.8b}, [x8], x11
+ st1 {v22.8b, v23.8b}, [x10], x11
+
+ b end_func
+
+blk_16:
+ add x6, x0, x5 //&src[2nt]
+
+ ldrb w11, [x6], #1 //src[2nt]
+ sxtw x11,w11
+
+ dup v22.16b,w11 //src[2nt]
+ ldrb w12, [x6] //src[2nt+1]
+ sxtw x12,w12
+
+ ld1 {v16.8b, v17.8b}, [x6] //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
+ add x6, x6, #-17 //subtract 17 to take it to src[2nt-1-row(15)]
+
+ dup v24.16b,w12 //src[2nt+1]
+ dup v30.8h,w12
+ lsl x5, x3, #3 //8*stride
+
+ ld1 {v26.16b}, [x6],#16 //load src[2nt-1-row](rows 0:15)
+ add x5, x2, x5 //x5 ->
+
+ movi d18, #0x00000000000000ff
+ uhsub v26.16b, v26.16b , v22.16b //(src[2nt-1-row] - src[2nt])>>1
+ //vsubl.u8 q0, d26, d22
+ //vsubl.u8 q14, d27, d22
+
+ //vshr.s16 q0, q0, #1
+ //vshr.s16 q14, q14, #1
+
+ mov v19.d[0],v17.d[0]
+ //vaddl.s8 q0, d24, d26
+ sxtl v0.8h, v26.8b
+ sxtl2 v28.8h, v26.16b
+ sqadd v0.8h, v0.8h , v30.8h
+ sqadd v28.8h, v28.8h , v30.8h
+
+ movi d10, #0x00000000000000ff
+ //vaddl.s8 q1, d25, d27
+
+ sqxtun v24.8b, v28.8h
+ sqxtun2 v24.16b, v0.8h
+ //vmovn.u16 d25, q0
+ //vmovn.u16 d24, q1
+
+ rev64 v24.16b, v24.16b
+ mov v25.d[0], v24.d[1]
+
+ mov v11.d[0],v17.d[0]
+
+ bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
+ bsl v10.8b, v25.8b , v16.8b
+
+ movi d8, #0x00000000000000ff
+ mov v9.d[0],v17.d[0]
+
+ movi d6, #0x00000000000000ff
+ mov v7.d[0],v17.d[0]
+
+ st1 {v18.8b, v19.8b}, [x2], x3
+ sshr d24, d24,#8
+
+ st1 {v10.8b, v11.8b}, [x5], x3
+ sshr d25, d25,#8
+
+
+ bsl v8.8b, v24.8b , v16.8b
+ bsl v6.8b, v25.8b , v16.8b
+
+ st1 {v8.8b, v9.8b}, [x2], x3
+ sshr d24, d24,#8
+
+ st1 {v6.8b, v7.8b}, [x5], x3
+ sshr d25, d25,#8
+
+ subs x4, x4,#8
+
+ movi d18, #0x00000000000000ff
+ //vmov.i64 d19, d17
+
+ movi d10, #0x00000000000000ff
+ //vmov.i64 d11, d17
+
+
+loop_16:
+
+
+ movi d8, #0x00000000000000ff
+
+ movi d6, #0x00000000000000ff
+
+ bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
+ bsl v10.8b, v25.8b , v16.8b
+
+ st1 {v18.8b, v19.8b}, [x2], x3
+ sshr d24, d24,#8
+
+ st1 {v10.8b, v11.8b}, [x5], x3
+ sshr d25, d25,#8
+
+ movi d18, #0x00000000000000ff
+
+ movi d10, #0x00000000000000ff
+
+ bsl v8.8b, v24.8b , v16.8b
+ bsl v6.8b, v25.8b , v16.8b
+
+ st1 {v8.8b, v9.8b}, [x2], x3
+ sshr d24, d24,#8
+
+ st1 {v6.8b, v7.8b}, [x5], x3
+ sshr d25, d25,#8
+
+ subs x4, x4, #4
+
+ bne loop_16
+
+ movi d8, #0x00000000000000ff
+
+ movi d6, #0x00000000000000ff
+
+ bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
+ bsl v10.8b, v25.8b , v16.8b
+
+ st1 {v18.8b, v19.8b}, [x2], x3
+ sshr d24, d24,#8
+
+ st1 {v10.8b, v11.8b}, [x5], x3
+ sshr d25, d25,#8
+
+ bsl v8.8b, v24.8b , v16.8b
+ bsl v6.8b, v25.8b , v16.8b
+
+ st1 {v8.8b, v9.8b}, [x2], x3
+
+ st1 {v6.8b, v7.8b}, [x5], x3
+
+ b end_func
+
+
+blk_4_8:
+ movi d11, #0x00000000000000ff
+ add x6, x0, x5 //&src[2nt]
+
+ movi d10, #0x00000000000000ff
+ ldrb w11, [x6], #1 //src[2nt]
+ sxtw x11,w11
+
+ dup v22.8b,w11 //src[2nt]
+ ldrb w12, [x6] //src[2nt+1]
+ sxtw x12,w12
+
+ ld1 {v16.8b},[x6] //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
+ add x6, x6, #-9 //subtract 9 to take it to src[2nt-1-row(7)]
+
+ dup v24.8b,w12 //src[2nt+1]
+ dup v30.8h,w12
+
+ ld1 {v26.8b},[x6],#8 //load src[2nt-1-row](rows 0:15)
+
+ movi d18, #0x00000000000000ff
+ uhsub v26.8b, v26.8b , v22.8b //(src[2nt-1-row] - src[2nt])>>1
+ //vsubl.u8 q13, d26, d22
+
+ //vshr.s16 q13, q13, #1
+
+ movi d19, #0x00000000000000ff
+ sxtl v26.8h, v26.8b
+ //vaddl.s8 q0, d24, d26
+ sqadd v0.8h, v26.8h , v30.8h
+
+ sqxtun v24.8b, v0.8h
+ //vmovn.s16 d24, q0
+
+ rev64 v24.8b, v24.8b
+
+ cmp x4, #4
+ beq blk_4
+
+ bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
+
+ st1 {v18.8b},[x2], x3
+ sshr d24, d24,#8
+
+ movi d18, #0x00000000000000ff
+
+ bsl v19.8b, v24.8b , v16.8b
+
+ st1 {v19.8b},[x2], x3
+ sshr d24, d24,#8
+
+ movi d19, #0x00000000000000ff
+
+ bsl v10.8b, v24.8b , v16.8b
+
+ st1 {v10.8b},[x2], x3
+ sshr d24, d24,#8
+
+ movi d10, #0x00000000000000ff
+
+ bsl v11.8b, v24.8b , v16.8b
+
+ st1 {v11.8b},[x2], x3
+ sshr d24, d24,#8
+
+ movi d11, #0x00000000000000ff
+
+ bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
+
+ st1 {v18.8b},[x2], x3
+ sshr d24, d24,#8
+
+ bsl v19.8b, v24.8b , v16.8b
+
+ st1 {v19.8b},[x2], x3
+ sshr d24, d24,#8
+
+ bsl v10.8b, v24.8b , v16.8b
+
+ st1 {v10.8b},[x2], x3
+ sshr d24, d24,#8
+
+ bsl v11.8b, v24.8b , v16.8b
+
+ st1 {v11.8b},[x2], x3
+ sshr d24, d24,#8
+
+ b end_func
+
+
+blk_4:
+ bsl v18.8b, v24.8b , v16.8b //only select row values from q12(predpixel)
+
+ st1 {v18.s}[0],[x2], x3
+ sshr d24, d24,#8
+
+ bsl v19.8b, v24.8b , v16.8b
+
+ st1 {v19.s}[0],[x2], x3
+ sshr d24, d24,#8
+
+ bsl v10.8b, v24.8b , v16.8b
+
+ st1 {v10.s}[0],[x2], x3
+ sshr d24, d24,#8
+
+ bsl v11.8b, v24.8b , v16.8b
+ st1 {v11.s}[0],[x2], x3
+
+
+end_func:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_16x16.s b/common/arm64/ihevc_itrans_recon_16x16.s
new file mode 100644
index 0000000..90df840
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@@ -0,0 +1,1240 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// * ihevc_itrans_recon_16x16.s
+// *
+// * @brief
+// * contains function definitions for single stage inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_16x16()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform and reconstruction for 16x16
+// * input block
+// *
+// * @par description:
+// * performs inverse transform and adds the prediction data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// * input 16x16 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 16x16 buffer for storing inverse transform
+// * 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 16x16 block
+// *
+// * @param[out] pu1_dst
+// * output 16x16 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] shift
+// * output shift
+// *
+// * @param[in] x12
+// * zero columns in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_16x16(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 x12,
+// word32 x11 )
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// src_strd
+// pred_strd
+// dst_strd
+// x12
+// x11
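
The kernel is a two-stage (row-column) 16-point inverse DCT: stage-1
sums are narrowed with a rounding shift of shift_stage1_idct = 7 into
pi2_tmp, stage-2 sums with shift_stage2_idct = 12, and the residual is
then added to the prediction and clipped to 8 bits. A minimal C model
of the sqrshrn rounding and the final reconstruction step follows; the
names are illustrative and the butterfly itself is not reproduced here.

    #include <stdint.h>

    #define SHIFT_STAGE1_IDCT 7   /* matches .set shift_stage1_idct below */
    #define SHIFT_STAGE2_IDCT 12  /* matches .set shift_stage2_idct below */

    /* sqrshrn equivalent: rounding shift right, saturated to int16.
     * Stage 1 applies this with SHIFT_STAGE1_IDCT when writing pi2_tmp. */
    static int16_t rnd_sat_narrow(int32_t x, int shift)
    {
        int32_t v = (x + (1 << (shift - 1))) >> shift;
        return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
    }

    /* final reconstruction of one pixel from a stage-2 accumulator */
    static uint8_t recon_pixel(int32_t stage2_sum, uint8_t pred)
    {
        int32_t v = pred + rnd_sat_narrow(stage2_sum, SHIFT_STAGE2_IDCT);
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }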
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+//#define zero_cols x12
+//#define zero_rows x11
+.globl ihevc_itrans_recon_16x16_av8
+
+.extern g_ai2_ihevc_trans_16_transpose
+
+.type ihevc_itrans_recon_16x16_av8, %function
+
+ihevc_itrans_recon_16x16_av8:
+
+ ldr w11, [sp]
+ // stmfd sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x5, x6,[sp,#-16]!
+// add sp,sp,#40
+
+
+
+// ldr x8,[sp,#4] @ prediction stride
+// ldr x7,[sp,#8] @ destination stride
+ mov x6, x4 // src stride
+ mov x12, x7
+
+
+
+ adrp x14, :got:g_ai2_ihevc_trans_16_transpose
+ ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data
+ mov x7,#0xffff
+ and x12,x12,x7
+ and x11,x11,x7
+ lsl x6, x6, #1 // x sizeof(word16)
+ add x9,x0,x6, lsl #1 // 2 rows
+
+ add x10,x6,x6, lsl #1 // 3 rows
+ add x5,x6,x6,lsl #2
+ mov x7,#0xfff0
+
+ cmp x12,x7
+ bge zero_12cols_decision
+
+ mov x19,#0xff00
+ cmp x12,x19
+ bge zero_8cols_decision
+
+
+
+
+ mov x14,#4
+ cmp x11,x7
+ sub x20,x6,#0
+ neg x20, x20
+ csel x10,x20,x10,ge
+
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x5, x8,ge
+ sub x20,x8,#0
+ neg x20, x20
+ csel x8,x20,x8,ge
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+
+ b first_stage_top_four_bottom_four
+
+zero_12cols_decision:
+ mov x14,#1
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x5, x8,ge
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+
+ b first_stage_top_four_bottom_four
+
+zero_8cols_decision:
+ mov x14,#2
+ mov x8,x5
+ sub x20,x8,#0
+ neg x8, x20
+ mov x19,#0xff00
+ cmp x11,x19
+ csel x8, x10, x8,lt
+ add x5,x5,x6,lsl #3
+ sub x20,x5,#0
+ neg x5, x20
+ cmp x11,x7
+ sub x20,x6,#0
+ neg x20, x20
+ csel x10,x20,x10,ge
+
+
+ b first_stage_top_four_bottom_four
+
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+
+
+
+first_stage:
+ add x0,x0,#8
+ add x9,x9,#8
+
+first_stage_top_four_bottom_four:
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ cmp x11,x7
+ bge skip_load4rows
+
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x8
+ ld1 {v9.4h},[x9],x8
+
+// registers used: q0,q1,q3,q5,q2,q4
+
+// d10 =x0
+//d6= x1
+//d11=x2
+//d7=x3
+
+skip_load4rows:
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ smull v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v0.4h[2]
+ smull v14.4s, v10.4h, v0.4h[0]
+ smlal v14.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ bge skip_last12rows_kernel1
+
+
+ smlal v24.4s, v8.4h, v1.4h[1]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v1.4h[3]
+ smlsl v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v2.4h[3]
+ smlsl v28.4s, v9.4h, v0.4h[3]
+ smlal v30.4s, v9.4h, v3.4h[3]
+
+
+
+
+
+ smlal v12.4s, v4.4h, v1.4h[0]
+ smlal v12.4s, v5.4h, v1.4h[2]
+ smlal v14.4s, v4.4h, v3.4h[0]
+ smlsl v14.4s, v5.4h, v3.4h[2]
+ smlsl v16.4s, v4.4h, v3.4h[0]
+ smlsl v16.4s, v5.4h, v0.4h[2]
+ smlsl v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v2.4h[2]
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+ mov x19,#0xff00
+ cmp x11,x19
+ bge skip_last12rows_kernel1
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x5
+ ld1 {v9.4h},[x9],x5
+
+
+
+
+ smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[1]
+ smlsl v26.4s, v8.4h, v1.4h[3]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlsl v30.4s, v8.4h, v1.4h[1]
+
+
+ smlal v24.4s, v9.4h, v3.4h[3]
+ smlsl v26.4s, v9.4h, v3.4h[1]
+ smlal v28.4s, v9.4h, v2.4h[3]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+ smlal v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v3.4h[0]
+ smlal v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+ smlsl v14.4s, v10.4h, v0.4h[0]
+ smlsl v14.4s, v11.4h, v0.4h[2]
+ smlsl v14.4s, v4.4h, v1.4h[0]
+ smlsl v14.4s, v5.4h, v2.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+ smlal v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v1.4h[2]
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v0.4h[2]
+
+skip_last12rows_kernel1:
+ add v20.4s, v12.4s , v24.4s
+ sub v22.4s, v12.4s , v24.4s
+
+ add v12.4s, v14.4s , v26.4s
+ sub v24.4s, v14.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+
+
+
+
+
+ sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ st1 {v30.4h, v31.4h},[x1],#16
+ st1 {v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#32
+
+ bge skip_stage1_kernel_load
+
+first_stage_middle_eight:
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x8
+ ld1 {v9.4h},[x9],x8
+
+
+skip_stage1_kernel_load:
+ smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+
+ cmp x11,x7
+ bge skip_last12rows_kernel2
+
+ smlsl v24.4s, v8.4h, v3.4h[1]
+ smlal v26.4s, v8.4h, v2.4h[1]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlal v30.4s, v8.4h, v2.4h[3]
+
+
+ smlal v24.4s, v9.4h, v0.4h[1]
+ smlal v26.4s, v9.4h, v3.4h[1]
+ smlsl v28.4s, v9.4h, v1.4h[1]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+ smlsl v22.4s, v4.4h, v1.4h[0]
+ smlal v22.4s, v5.4h, v2.4h[2]
+ smlsl v20.4s, v4.4h, v3.4h[0]
+ smlal v20.4s, v5.4h, v0.4h[2]
+ smlal v16.4s, v4.4h, v3.4h[0]
+ smlal v16.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v1.4h[2]
+
+//d0[0]= 64 d2[0]=64
+//d0[1]= 90 d2[1]=57
+//d0[2]= 89 d2[2]=50
+//d0[3]= 87 d2[3]=43
+//d1[0]= 83 d3[0]=36
+//d1[1]= 80 d3[1]=25
+//d1[2]= 75 d3[2]=18
+//d1[3]= 70 d3[3]=9
+ mov x19,#0xff00
+ cmp x11,x19
+ bge skip_last12rows_kernel2
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v11.4h},[x9],x6
+ ld1 {v6.4h},[x0],x10
+ ld1 {v7.4h},[x9],x10
+ ld1 {v4.4h},[x0],x6
+ ld1 {v5.4h},[x9],x6
+ ld1 {v8.4h},[x0],x5
+ ld1 {v9.4h},[x9],x5
+
+
+ smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smlal v24.4s, v8.4h, v2.4h[3]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v2.4h[1]
+ smlal v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v1.4h[1]
+ smlal v28.4s, v9.4h, v0.4h[3]
+ smlsl v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+ smlal v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v3.4h[0]
+ smlal v22.4s, v5.4h, v0.4h[2]
+
+
+
+ smlsl v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v4.4h, v1.4h[0]
+ smlsl v20.4s, v5.4h, v1.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+ smlsl v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v2.4h[2]
+
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v3.4h[2]
+
+skip_last12rows_kernel2:
+
+ add v4.4s, v22.4s , v24.4s
+ sub v22.4s, v22.4s , v24.4s
+
+ add v6.4s, v20.4s , v26.4s
+ sub v24.4s, v20.4s , v26.4s
+
+ add v10.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+
+
+
+
+
+ ld1 {v4.4h, v5.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],#16
+ sub x1,x1,#32
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+ umov x15,v26.d[0]
+ umov x16,v27.d[0]
+ umov x19,v28.d[0]
+ umov x20,v29.d[0]
+
+ trn1 v26.4h, v4.4h, v12.4h
+ trn2 v27.4h, v4.4h, v12.4h
+ trn1 v28.4h, v5.4h, v13.4h
+ trn2 v29.4h, v5.4h, v13.4h
+
+ trn1 v4.2s, v26.2s, v28.2s
+ trn2 v5.2s, v26.2s, v28.2s
+ trn1 v12.2s, v27.2s, v29.2s
+ trn2 v13.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v18.4h, v20.4h
+ trn2 v27.4h, v18.4h, v20.4h
+ trn1 v28.4h, v19.4h, v21.4h
+ trn2 v29.4h, v19.4h, v21.4h
+
+ trn1 v18.2s, v26.2s, v28.2s
+ trn2 v19.2s, v26.2s, v28.2s
+ trn1 v20.2s, v27.2s, v29.2s
+ trn2 v21.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v22.4h, v30.4h
+ trn2 v27.4h, v22.4h, v30.4h
+ trn1 v28.4h, v23.4h, v31.4h
+ trn2 v29.4h, v23.4h, v31.4h
+
+ trn1 v22.2s, v26.2s, v28.2s
+ trn2 v23.2s, v26.2s, v28.2s
+ trn1 v30.2s, v27.2s, v29.2s
+ trn2 v31.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v14.4h, v8.4h
+ trn2 v27.4h, v14.4h, v8.4h
+ trn1 v28.4h, v15.4h, v9.4h
+ trn2 v29.4h, v15.4h, v9.4h
+
+ trn1 v14.2s, v26.2s, v28.2s
+ trn2 v15.2s, v26.2s, v28.2s
+ trn1 v8.2s, v27.2s, v29.2s
+ trn2 v9.2s, v27.2s, v29.2s
+
+ mov v26.d[0],x15
+ mov v27.d[0],x16
+ mov v28.d[0],x19
+ mov v29.d[0],x20
+
+// d4 =x0 1- 4 values
+// d5 =x2 1- 4 values
+// d12=x1 1- 4 values
+// d13=x3 1- 4 values
+
+// d18 =x0 5- 8 values
+// d19 =x2 5- 8 values
+// d20=x1 5- 8 values
+// d21=x3 5- 8 values
+
+// d22 =x0 9- 12 values
+// d23 =x2 9- 12 values
+// d30=x1 9- 12 values
+// d31=x3 9- 12 values
+
+// d14 =x0 13-16 values
+// d15 =x2 13- 16 values
+// d8=x1 13- 16 values
+// d9=x3 13- 16 values
+
+
+ st1 { v4.4h, v5.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+
+ st1 { v18.4h, v19.4h},[x1],#16
+ st1 { v20.4h, v21.4h},[x1],#16
+ st1 { v22.4h, v23.4h},[x1],#16
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v8.4h, v9.4h},[x1],#16
+
+
+ subs x14,x14,#1
+ bne first_stage
+
+
+
+
+
+
+
+
+
+
+ mov x6,x7
+
+ ldp x8, x7,[sp],#16
+
+ mov x10,#16
+
+ cmp x12,x6
+ sub x20,x1,#128
+ csel x1, x20, x1,ge
+ bge label1
+
+ mov x19,#0xff00
+ cmp x12,x19
+ sub x20,x1,#256
+ csel x1, x20, x1,ge
+ bge label_2
+
+ sub x1,x1,#512
+ sub x20,x10,#0
+ neg x10, x20
+
+label_2:
+ add x9,x1,#128
+ add x11,x9,#128
+ add x0,x11,#128
+
+
+
+label1:
+// mov x6,x1
+
+
+ mov x14,#4
+ add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+ add x5,x8,x8, lsl #1 //
+// add x0,x3,x7, lsl #1 @ x0 points to 3rd row of dest data
+// add x10,x7,x7, lsl #1 @
+
+
+
+
+second_stage:
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v6.4h, v7.4h},[x1],x10
+ cmp x12,x6
+ bge second_stage_process
+ ld1 {v4.4h, v5.4h},[x9],#16
+ ld1 {v8.4h, v9.4h},[x9],x10
+
+second_stage_process:
+
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smull v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v0.4h[2]
+ smull v14.4s, v10.4h, v0.4h[0]
+ smlal v14.4s, v11.4h, v1.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ bge skip_last8rows_stage2_kernel1
+
+ smlal v24.4s, v8.4h, v1.4h[1]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v1.4h[3]
+ smlsl v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v2.4h[3]
+ smlsl v28.4s, v9.4h, v0.4h[3]
+ smlal v30.4s, v9.4h, v3.4h[3]
+
+
+ smlal v12.4s, v4.4h, v1.4h[0]
+ smlal v12.4s, v5.4h, v1.4h[2]
+ smlal v14.4s, v4.4h, v3.4h[0]
+ smlsl v14.4s, v5.4h, v3.4h[2]
+ smlsl v16.4s, v4.4h, v3.4h[0]
+ smlsl v16.4s, v5.4h, v0.4h[2]
+ smlsl v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v2.4h[2]
+
+ mov x19,#0xff00
+ cmp x12,x19
+ bge skip_last8rows_stage2_kernel1
+
+
+ ld1 {v10.4h, v11.4h},[x11],#16
+ ld1 {v6.4h, v7.4h},[x11],x10
+ ld1 {v4.4h, v5.4h},[x0],#16
+ ld1 {v8.4h, v9.4h},[x0],x10
+
+
+
+
+
+ smlal v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[1]
+ smlsl v26.4s, v8.4h, v1.4h[3]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlsl v30.4s, v8.4h, v1.4h[1]
+
+
+ smlal v24.4s, v9.4h, v3.4h[3]
+ smlsl v26.4s, v9.4h, v3.4h[1]
+ smlal v28.4s, v9.4h, v2.4h[3]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+
+
+ smlal v12.4s, v10.4h, v0.4h[0]
+ smlal v12.4s, v11.4h, v2.4h[2]
+ smlal v12.4s, v4.4h, v3.4h[0]
+ smlal v12.4s, v5.4h, v3.4h[2]
+
+
+
+
+ smlsl v14.4s, v10.4h, v0.4h[0]
+ smlsl v14.4s, v11.4h, v0.4h[2]
+ smlsl v14.4s, v4.4h, v1.4h[0]
+ smlsl v14.4s, v5.4h, v2.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+ smlal v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v1.4h[2]
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+ smlsl v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v0.4h[2]
+
+
+
+
+
+
+skip_last8rows_stage2_kernel1:
+
+
+
+ add v20.4s, v12.4s , v24.4s
+ sub v22.4s, v12.4s , v24.4s
+
+ add v12.4s, v14.4s , v26.4s
+ sub v24.4s, v14.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+
+
+
+
+
+ sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)
+
+ bge skip_stage2_kernel_load
+
+ //q2,q4,q6,q7 is used
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v6.4h, v7.4h},[x1],#16
+ ld1 {v4.4h, v5.4h},[x9],#16
+ ld1 {v8.4h, v9.4h},[x9],#16
+skip_stage2_kernel_load:
+ sub x1,x1,#32
+ st1 {v30.4h, v31.4h},[x1],#16
+ st1 {v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#32
+
+ smull v24.4s, v6.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v2.4h[2]
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+
+
+ cmp x12,x6
+ bge skip_last8rows_stage2_kernel2
+
+
+ smlsl v24.4s, v8.4h, v3.4h[1]
+ smlal v26.4s, v8.4h, v2.4h[1]
+ smlal v28.4s, v8.4h, v0.4h[1]
+ smlal v30.4s, v8.4h, v2.4h[3]
+
+
+ smlal v24.4s, v9.4h, v0.4h[1]
+ smlal v26.4s, v9.4h, v3.4h[1]
+ smlsl v28.4s, v9.4h, v1.4h[1]
+ smlsl v30.4s, v9.4h, v2.4h[1]
+
+
+
+ smlsl v22.4s, v4.4h, v1.4h[0]
+ smlal v22.4s, v5.4h, v2.4h[2]
+ smlsl v20.4s, v4.4h, v3.4h[0]
+ smlal v20.4s, v5.4h, v0.4h[2]
+ smlal v16.4s, v4.4h, v3.4h[0]
+ smlal v16.4s, v5.4h, v3.4h[2]
+ smlal v18.4s, v4.4h, v1.4h[0]
+ smlsl v18.4s, v5.4h, v1.4h[2]
+ mov x19,#0xff00
+ cmp x12,x19
+ bge skip_last8rows_stage2_kernel2
+
+ ld1 {v10.4h, v11.4h},[x11],#16
+ ld1 {v6.4h, v7.4h},[x11],#16
+ ld1 {v4.4h, v5.4h},[x0],#16
+ ld1 {v8.4h, v9.4h},[x0],#16
+
+ smlsl v24.4s, v6.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v6.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v7.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+ smlal v24.4s, v8.4h, v2.4h[3]
+ smlal v26.4s, v8.4h, v3.4h[3]
+ smlsl v28.4s, v8.4h, v2.4h[1]
+ smlal v30.4s, v8.4h, v0.4h[3]
+
+
+ smlal v24.4s, v9.4h, v1.4h[3]
+ smlsl v26.4s, v9.4h, v1.4h[1]
+ smlal v28.4s, v9.4h, v0.4h[3]
+ smlsl v30.4s, v9.4h, v0.4h[1]
+
+
+
+
+ smlal v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+ smlsl v22.4s, v4.4h, v3.4h[0]
+ smlal v22.4s, v5.4h, v0.4h[2]
+
+
+
+ smlsl v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+ smlal v20.4s, v4.4h, v1.4h[0]
+ smlsl v20.4s, v5.4h, v1.4h[2]
+
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+ smlsl v16.4s, v4.4h, v1.4h[0]
+ smlal v16.4s, v5.4h, v2.4h[2]
+
+
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+ smlal v18.4s, v4.4h, v3.4h[0]
+ smlsl v18.4s, v5.4h, v3.4h[2]
+
+
+skip_last8rows_stage2_kernel2:
+
+
+
+ add v4.4s, v22.4s , v24.4s
+ sub v22.4s, v22.4s , v24.4s
+
+ add v6.4s, v20.4s , v26.4s
+ sub v24.4s, v20.4s , v26.4s
+
+ add v10.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12 (shift_stage2_idct)
+
+ ld1 {v4.4h, v5.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],#16
+
+
+
+ // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}
+
+//d4=x0
+//d12=x1
+//d5=x2
+//d13=x3
+
+//d18=x4
+//d20=x5
+//d19=x6
+//d21=x7
+
+//d22=x8
+//d30=x9
+//d23=x10
+//d31=x11
+
+//d14=x12
+//d8=x13
+//d15=x14
+//d9=x15
+
+ umov x15,v26.d[0]
+ umov x16,v27.d[0]
+ umov x19,v28.d[0]
+ umov x20,v29.d[0]
+
+ trn1 v26.4h, v4.4h, v12.4h
+ trn2 v27.4h, v4.4h, v12.4h
+ trn1 v28.4h, v5.4h, v13.4h
+ trn2 v29.4h, v5.4h, v13.4h
+
+ trn1 v4.2s, v26.2s, v28.2s
+ trn2 v5.2s, v26.2s, v28.2s
+ trn1 v12.2s, v27.2s, v29.2s
+ trn2 v13.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v18.4h, v20.4h
+ trn2 v27.4h, v18.4h, v20.4h
+ trn1 v28.4h, v19.4h, v21.4h
+ trn2 v29.4h, v19.4h, v21.4h
+
+ trn1 v18.2s, v26.2s, v28.2s
+ trn2 v19.2s, v26.2s, v28.2s
+ trn1 v20.2s, v27.2s, v29.2s
+ trn2 v21.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v22.4h, v30.4h
+ trn2 v27.4h, v22.4h, v30.4h
+ trn1 v28.4h, v23.4h, v31.4h
+ trn2 v29.4h, v23.4h, v31.4h
+
+ trn1 v22.2s, v26.2s, v28.2s
+ trn2 v23.2s, v26.2s, v28.2s
+ trn1 v30.2s, v27.2s, v29.2s
+ trn2 v31.2s, v27.2s, v29.2s
+
+ trn1 v26.4h, v14.4h, v8.4h
+ trn2 v27.4h, v14.4h, v8.4h
+ trn1 v28.4h, v15.4h, v9.4h
+ trn2 v29.4h, v15.4h, v9.4h
+
+ trn1 v14.2s, v26.2s, v28.2s
+ trn2 v15.2s, v26.2s, v28.2s
+ trn1 v8.2s, v27.2s, v29.2s
+ trn2 v9.2s, v27.2s, v29.2s
+
+ mov v26.d[0],x15
+ mov v27.d[0],x16
+ mov v28.d[0],x19
+ mov v29.d[0],x20
+
+// d4  = x0  values 1-4
+// d5  = x2  values 1-4
+// d12 = x1  values 1-4
+// d13 = x3  values 1-4
+
+// d18 = x0  values 5-8
+// d19 = x2  values 5-8
+// d20 = x1  values 5-8
+// d21 = x3  values 5-8
+
+// d22 = x0  values 9-12
+// d23 = x2  values 9-12
+// d30 = x1  values 9-12
+// d31 = x3  values 9-12
+
+// d14 = x0  values 13-16
+// d15 = x2  values 13-16
+// d8  = x1  values 13-16
+// d9  = x3  values 13-16
+
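+    // re-pair the d-halves below so each q register ends up holding
+    // 8 contiguous pixels of one output row ahead of the prediction
+    // add: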
+    // swapping v5 and v18
+ mov v5.d[1],v5.d[0]
+ mov v5.d[0],v18.d[0]
+ mov v18.d[0],v5.d[1]
+ // swapping v23 and v14
+ mov v23.d[1],v23.d[0]
+ mov v23.d[0],v14.d[0]
+ mov v14.d[0],v23.d[1]
+ // swapping v13 and v20
+ mov v13.d[1],v13.d[0]
+ mov v13.d[0],v20.d[0]
+ mov v20.d[0],v13.d[1]
+ // swapping v31 and v8
+ mov v31.d[1],v31.d[0]
+ mov v31.d[0],v8.d[0]
+ mov v8.d[0],v31.d[1]
+
+// q2: x0 1-8 values
+// q11: x0 9-16 values
+// q9 : x2 1-8 values
+// q7 : x2 9-16 values
+// q6 : x1 1- 8 values
+// q10: x3 1-8 values
+// q15: x1 9-16 values
+// q4: x3 9-16 values
+
+
+// registers free: q8,q14,q12,q13
+
+
+ ld1 {v16.8b, v17.8b},[x2],x8
+ ld1 {v28.8b, v29.8b},[x2],x5
+ ld1 {v24.8b, v25.8b},[x4],x8
+ ld1 {v26.8b, v27.8b},[x4],x5
+
+ mov v4.d[1] ,v5.d[0]
+ mov v22.d[1] ,v23.d[0]
+ mov v12.d[1] ,v13.d[0]
+ mov v30.d[1] ,v31.d[0]
+ mov v18.d[1] ,v19.d[0]
+ mov v14.d[1] ,v15.d[0]
+ mov v20.d[1] ,v21.d[0]
+ mov v8.d[1] ,v9.d[0]
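+
+    // add the 8-bit prediction to the 16-bit residual (uaddw widens
+    // the pred bytes on the fly), then saturate back to unsigned
+    // 8 bits with sqxtun before storing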
+
+ uaddw v4.8h, v4.8h , v16.8b
+ uaddw v22.8h, v22.8h , v17.8b
+ uaddw v12.8h, v12.8h , v28.8b
+ uaddw v30.8h, v30.8h , v29.8b
+ uaddw v18.8h, v18.8h , v24.8b
+ uaddw v14.8h, v14.8h , v25.8b
+ uaddw v20.8h, v20.8h , v26.8b
+ uaddw v8.8h, v8.8h , v27.8b
+
+
+ sqxtun v16.8b, v4.8h
+ sqxtun v17.8b, v22.8h
+ sqxtun v28.8b, v12.8h
+ sqxtun v29.8b, v30.8h
+ sqxtun v24.8b, v18.8h
+ sqxtun v25.8b, v14.8h
+ sqxtun v26.8b, v20.8h
+ sqxtun v27.8b, v8.8h
+
+
+
+ st1 {v16.8b, v17.8b},[x3],x7
+ st1 {v28.8b, v29.8b},[x3],x7
+ st1 {v24.8b, v25.8b},[x3],x7
+ st1 {v26.8b, v27.8b},[x3],x7
+
+ subs x14,x14,#1
+
+
+
+ bne second_stage
+
+
+// sub sp,sp,#40
+ // ldmfd sp!,{x4-x12,pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_32x32.s b/common/arm64/ihevc_itrans_recon_32x32.s
new file mode 100644
index 0000000..6f40747
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_32x32.s
@@ -0,0 +1,3053 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_itrans_recon_32x32.s
+// *
+// * @brief
+// *  contains function definitions for inverse transform and
+// *  reconstruction of a 32x32 block
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_32x32()
+// *
+// * @remarks
+// *  the input buffer is overwritten by this function
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// *  this function performs inverse transform and reconstruction for a
+// *  32x32 input block
+// *
+// * @par description:
+// * performs inverse transform and adds the prediction data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// *  input 32x32 coefficients
+// *
+// * @param[in] pi2_tmp
+// *  temporary 32x32 buffer for storing inverse transform
+// *  1st stage output
+// *
+// * @param[in] pu1_pred
+// *  prediction 32x32 block
+// *
+// * @param[out] pu1_dst
+// *  output 32x32 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] zero_cols
+// *  zero columns in pi2_src
+// *
+// * @param[in] zero_rows
+// *  zero rows in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_32x32(word16 *pi2_src,
+//                              word16 *pi2_tmp,
+//                              uword8 *pu1_pred,
+//                              uword8 *pu1_dst,
+//                              word32 src_strd,
+//                              word32 pred_strd,
+//                              word32 dst_strd,
+//                              word32 zero_cols,
+//                              word32 zero_rows)
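+
+// a minimal, assumed usage sketch (the caller is not part of this
+// file); the zero_cols/zero_rows bitmasks flag coefficient columns and
+// rows known to be all zero so the kernel can truncate its loops -
+// pass 0 when nothing is known:
+//
+//     word16 tmp[32 * 32];   /* scratch buffer, hypothetical name */
+//     ihevc_itrans_recon_32x32(pi2_coeffs, tmp, pu1_pred, pu1_dst,
+//                              32 /* src_strd, in word16 units */,
+//                              pred_strd, dst_strd,
+//                              0 /* zero_cols */, 0 /* zero_rows */);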
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// src_strd
+// pred_strd
+// dst_strd
+// x12 => zero_cols
+// x11 => zero_rows
+
+
+//d0[0]= 64 d2[0]=83
+//d0[1]= 90 d2[1]=82
+//d0[2]= 90 d2[2]=80
+//d0[3]= 90 d2[3]=78
+//d1[0]= 89 d3[0]=75
+//d1[1]= 88 d3[1]=73
+//d1[2]= 87 d3[2]=70
+//d1[3]= 85 d3[3]=67
+
+//d4[0]= 64 d6[0]=36
+//d4[1]= 61 d6[1]=31
+//d4[2]= 57 d6[2]=25
+//d4[3]= 54 d6[3]=22
+//d5[0]= 50 d7[0]=18
+//d5[1]= 46 d7[1]=13
+//d5[2]= 43 d7[2]=9
+//d5[3]= 38 d7[3]=4
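+
+// the lane values above are the 32 entries of the inverse-DCT
+// coefficient table g_ai2_ihevc_trans_32_transpose, loaded once into
+// v0-v7 and indexed per lane by the smull/smlal kernels below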
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+//#define zero_cols x12
+//#define zero_rows x11
+
+.globl ihevc_itrans_recon_32x32_av8
+
+.extern g_ai2_ihevc_trans_32_transpose
+
+x5_addr: .word 0xfffff000
+x9_addr: .word 0xffff0000
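+
+// zero_cols/zero_rows thresholds: unsigned compares against
+// 0xfffffff0 / 0xffffff00 / 0xfffff000 / 0xffff0000 detect how many
+// leading 4-element groups can be nonzero, which picks the loop
+// count in x14 (1, 2, 3, 4 or 8 blocks)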
+
+.type ihevc_itrans_recon_32x32_av8, %function
+
+ihevc_itrans_recon_32x32_av8:
+
+ ldr w11, [sp]
+
+// stmfd sp!,{x0-x12,x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x0, x1,[sp,#-16]!
+ stp x5, x6,[sp,#-16]!
+
+//ldr x8,[sp,#56] @ prediction stride
+//ldr x7,[sp,#64] @ destination stride
+ mov x6, x4 // src stride
+ mov x12, x7
+ lsl x6, x6, #1 // x sizeof(word16)
+ add x10,x6,x6, lsl #1 // 3 rows
+
+
+ mov x8,x0
+
+ adrp x14, :got:g_ai2_ihevc_trans_32_transpose
+ ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose]
+
+ ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
+ ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
+
+//registers which are free
+// x10,x9,x11,x12
+ mov x9,#0xffffff00
+ mov x10,#0xfffffff0
+ ldr w5, x5_addr
+ ldr w7, x9_addr
+ cmp x12,x10
+ mov x20,#1
+ csel x14, x20, x14,hs
+ bhs stage1
+
+
+ cmp x12,x9
+ mov x20,#2
+ csel x14, x20, x14,hs
+ bhs stage1
+
+ cmp x12,x5
+ mov x20,#3
+ csel x14, x20, x14,hs
+ bhs stage1
+
+    cmp x12,x7
+    mov x20,#4
+    csel x14, x20, x14,hs
+    bhs stage1
+
+    mov x14,#8
+    b stage1
+//.ltorg
+
+
+dct_stage1:
+ add x8,x8,#8
+ mov x0,x8
+
+stage1:
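+    // each pass loads four 4-coefficient rows (stride x6) and
+    // accumulates the even part in v20/v22/v16/v18 and the odd part
+    // in v24/v26/v28/v30; the trailing //// annotations are carried
+    // over from the smaller-transform kernels and mark the butterfly
+    // term (b0-b3), not the exact coefficient lane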
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+ smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlal v20.4s, v11.4h, v0.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlal v22.4s, v11.4h, v1.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+ cmp x11,x10
+ bhs shift1
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v1.4h[1]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlal v28.4s, v14.4h, v6.4h[1]
+ smlsl v30.4s, v14.4h, v7.4h[1]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlal v26.4s, v15.4h, v5.4h[1]
+ smlsl v28.4s, v15.4h, v7.4h[1]
+ smlsl v30.4s, v15.4h, v3.4h[3]
+
+
+ smlal v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v1.4h[2]
+ smlal v22.4s, v12.4h, v3.4h[0]
+ smlal v22.4s, v13.4h, v4.4h[2]
+ smlal v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v7.4h[2]
+ smlal v18.4s, v12.4h, v7.4h[0]
+ smlsl v18.4s, v13.4h, v5.4h[2]
+
+ cmp x11,x9
+ bhs shift1
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+ smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v2.4h[0]
+ smlal v20.4s, v11.4h, v2.4h[2]
+
+
+ smlal v22.4s, v10.4h, v6.4h[0]
+ smlal v22.4s, v11.4h, v7.4h[2]
+
+ smlsl v16.4s, v10.4h, v6.4h[0]
+ smlsl v16.4s, v11.4h, v3.4h[2]
+
+ smlsl v18.4s, v10.4h, v2.4h[0]
+ smlsl v18.4s, v11.4h, v1.4h[2]
+
+ cmp x11,x5
+ bhs shift1
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v3.4h[1]
+ smlsl v26.4s, v14.4h, v6.4h[1]
+ smlsl v28.4s, v14.4h, v0.4h[1]
+ smlsl v30.4s, v14.4h, v6.4h[3]
+
+
+ smlal v24.4s, v15.4h, v3.4h[3]
+ smlsl v26.4s, v15.4h, v4.4h[3]
+ smlsl v28.4s, v15.4h, v2.4h[3]
+ smlal v30.4s, v15.4h, v5.4h[3]
+
+
+ smlal v20.4s, v12.4h, v3.4h[0]
+ smlal v20.4s, v13.4h, v3.4h[2]
+ smlsl v22.4s, v12.4h, v7.4h[0]
+ smlsl v22.4s, v13.4h, v5.4h[2]
+ smlsl v16.4s, v12.4h, v1.4h[0]
+ smlsl v16.4s, v13.4h, v1.4h[2]
+ smlsl v18.4s, v12.4h, v5.4h[0]
+ smlal v18.4s, v13.4h, v7.4h[2]
+
+ cmp x11,x7
+ bhs shift1
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+ smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlal v20.4s, v11.4h, v4.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v2.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v6.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v0.4h[2]
+
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+ smlal v24.4s, v14.4h, v5.4h[1]
+ smlsl v26.4s, v14.4h, v0.4h[2]
+ smlal v28.4s, v14.4h, v5.4h[3]
+ smlal v30.4s, v14.4h, v4.4h[3]
+
+
+ smlal v24.4s, v15.4h, v5.4h[3]
+ smlsl v26.4s, v15.4h, v1.4h[1]
+ smlal v28.4s, v15.4h, v3.4h[1]
+ smlsl v30.4s, v15.4h, v7.4h[3]
+
+
+ smlal v20.4s, v12.4h, v5.4h[0]
+ smlal v20.4s, v13.4h, v5.4h[2]
+ smlsl v22.4s, v12.4h, v1.4h[0]
+ smlsl v22.4s, v13.4h, v0.4h[2]
+ smlal v16.4s, v12.4h, v7.4h[0]
+ smlal v16.4s, v13.4h, v4.4h[2]
+ smlal v18.4s, v12.4h, v3.4h[0]
+ smlal v18.4s, v13.4h, v6.4h[2]
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+
+
+
+ smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v6.4h[0]
+ smlal v20.4s, v11.4h, v6.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v2.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+
+ smlal v16.4s, v10.4h, v2.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+
+ smlsl v18.4s, v10.4h, v6.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+ smlal v24.4s, v14.4h, v7.4h[1]
+ smlsl v26.4s, v14.4h, v5.4h[3]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlsl v30.4s, v14.4h, v2.4h[3]
+
+
+ smlal v24.4s, v15.4h, v7.4h[3]
+ smlsl v26.4s, v15.4h, v7.4h[1]
+ smlal v28.4s, v15.4h, v6.4h[3]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlal v20.4s, v12.4h, v7.4h[0]
+ smlal v20.4s, v13.4h, v7.4h[2]
+ smlsl v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v6.4h[2]
+ smlal v16.4s, v12.4h, v3.4h[0]
+ smlal v16.4s, v13.4h, v5.4h[2]
+ smlsl v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v4.4h[2]
+
+
+
+shift1:
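+    // stage-1 butterfly and rounding: x = (a +/- b + 64) >> 7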
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ // registers used q15,q14,q6,q7
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+// d30 = x0  values 1-4
+// d31 = x2  values 1-4
+// d12 = x1  values 1-4
+// d13 = x3  values 1-4
+// d14 = x0  values 28-31
+// d15 = x2  values 28-31
+// d18 = x1  values 28-31
+// d19 = x3  values 28-31
+
+
+
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+ add x1,x1,#192
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#224
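+
+    // the transposed 4x4 quarters are scattered into the temp buffer
+    // with the +192/-224 byte jumps so that stage 2 can read them
+    // back as 16-halfword strips followed by a 240-byte skip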
+
+ mov x0,x8
+
+
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+ smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlal v20.4s, v11.4h, v4.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlal v22.4s, v11.4h, v5.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v6.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v7.4h[2]
+ cmp x11,x10
+ bhs shift2
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+ smlsl v24.4s, v14.4h, v4.4h[3]
+ smlsl v26.4s, v14.4h, v2.4h[1]
+ smlsl v28.4s, v14.4h, v0.4h[1]
+ smlsl v30.4s, v14.4h, v2.4h[3]
+
+
+ smlsl v24.4s, v15.4h, v0.4h[3]
+ smlsl v26.4s, v15.4h, v3.4h[1]
+ smlsl v28.4s, v15.4h, v6.4h[3]
+ smlal v30.4s, v15.4h, v5.4h[3]
+
+
+ smlsl v20.4s, v12.4h, v7.4h[0]
+ smlsl v20.4s, v13.4h, v2.4h[2]
+ smlsl v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v0.4h[2]
+ smlsl v16.4s, v12.4h, v3.4h[0]
+ smlsl v16.4s, v13.4h, v3.4h[2]
+ smlsl v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v6.4h[2]
+
+ cmp x11,x9
+ bhs shift2
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+
+
+
+ smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v2.4h[0]
+ smlsl v20.4s, v11.4h, v6.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v6.4h[0]
+ smlal v22.4s, v11.4h, v4.4h[2]
+
+ smlal v16.4s, v10.4h, v6.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+
+ smlal v18.4s, v10.4h, v2.4h[0]
+ smlal v18.4s, v11.4h, v5.4h[2]
+
+ cmp x11,x5
+ bhs shift2
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+ smlal v24.4s, v14.4h, v2.4h[3]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlsl v28.4s, v14.4h, v5.4h[3]
+ smlsl v30.4s, v14.4h, v0.4h[3]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlsl v26.4s, v15.4h, v6.4h[3]
+ smlsl v28.4s, v15.4h, v0.4h[3]
+ smlal v30.4s, v15.4h, v7.4h[3]
+
+
+ smlal v20.4s, v12.4h, v5.4h[0]
+ smlal v20.4s, v13.4h, v0.4h[2]
+ smlal v22.4s, v12.4h, v1.4h[0]
+ smlal v22.4s, v13.4h, v6.4h[2]
+ smlal v16.4s, v12.4h, v7.4h[0]
+ smlsl v16.4s, v13.4h, v2.4h[2]
+ smlsl v18.4s, v12.4h, v3.4h[0]
+ smlsl v18.4s, v13.4h, v4.4h[2]
+
+
+ cmp x11,x7
+ bhs shift2
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+
+
+
+ smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v7.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v5.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+ smlsl v24.4s, v14.4h, v0.4h[1]
+ smlal v26.4s, v14.4h, v6.4h[1]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlsl v30.4s, v14.4h, v1.4h[1]
+
+
+ smlsl v24.4s, v15.4h, v3.4h[3]
+ smlal v26.4s, v15.4h, v0.4h[1]
+ smlsl v28.4s, v15.4h, v5.4h[1]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v3.4h[0]
+ smlsl v20.4s, v13.4h, v1.4h[2]
+ smlsl v22.4s, v12.4h, v7.4h[0]
+ smlal v22.4s, v13.4h, v3.4h[2]
+ smlal v16.4s, v12.4h, v1.4h[0]
+ smlal v16.4s, v13.4h, v7.4h[2]
+ smlsl v18.4s, v12.4h, v5.4h[0]
+ smlsl v18.4s, v13.4h, v2.4h[2]
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+ smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v6.4h[0]
+ smlal v20.4s, v11.4h, v5.4h[2]
+
+
+ smlal v22.4s, v10.4h, v2.4h[0]
+ smlal v22.4s, v11.4h, v7.4h[2]
+
+ smlsl v16.4s, v10.4h, v2.4h[0]
+ smlsl v16.4s, v11.4h, v4.4h[2]
+
+ smlal v18.4s, v10.4h, v6.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+ smlal v24.4s, v14.4h, v1.4h[1]
+ smlsl v26.4s, v14.4h, v0.4h[3]
+ smlal v28.4s, v14.4h, v1.4h[3]
+ smlsl v30.4s, v14.4h, v3.4h[1]
+
+
+ smlal v24.4s, v15.4h, v5.4h[3]
+ smlsl v26.4s, v15.4h, v5.4h[1]
+ smlal v28.4s, v15.4h, v4.4h[3]
+ smlsl v30.4s, v15.4h, v4.4h[1]
+
+
+ smlal v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v3.4h[2]
+ smlsl v22.4s, v12.4h, v3.4h[0]
+ smlsl v22.4s, v13.4h, v2.4h[2]
+ smlal v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v1.4h[2]
+ smlsl v18.4s, v12.4h, v7.4h[0]
+ smlsl v18.4s, v13.4h, v0.4h[2]
+
+shift2:
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+ add x1,x1,#128
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#160
+ mov x0,x8
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+ smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v7.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v6.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v5.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v4.4h[2]
+
+ cmp x11,x10
+ bhs shift3
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+ smlsl v24.4s, v14.4h, v5.4h[1]
+ smlsl v26.4s, v14.4h, v7.4h[3]
+ smlal v28.4s, v14.4h, v5.4h[3]
+ smlal v30.4s, v14.4h, v3.4h[1]
+
+
+ smlal v24.4s, v15.4h, v2.4h[1]
+ smlal v26.4s, v15.4h, v1.4h[1]
+ smlal v28.4s, v15.4h, v4.4h[3]
+ smlsl v30.4s, v15.4h, v7.4h[3]
+
+
+ smlsl v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v6.4h[2]
+ smlsl v22.4s, v12.4h, v3.4h[0]
+ smlal v22.4s, v13.4h, v3.4h[2]
+ smlsl v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v0.4h[2]
+ smlsl v18.4s, v12.4h, v7.4h[0]
+ smlal v18.4s, v13.4h, v2.4h[2]
+
+ cmp x11,x9
+ bhs shift3
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+ smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v2.4h[0]
+ smlsl v20.4s, v11.4h, v5.4h[2]
+
+
+ smlal v22.4s, v10.4h, v6.4h[0]
+ smlsl v22.4s, v11.4h, v0.4h[2]
+
+ smlsl v16.4s, v10.4h, v6.4h[0]
+ smlsl v16.4s, v11.4h, v4.4h[2]
+
+ smlsl v18.4s, v10.4h, v2.4h[0]
+ smlal v18.4s, v11.4h, v6.4h[2]
+
+ cmp x11,x5
+ bhs shift3
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+
+ smlsl v24.4s, v14.4h, v7.4h[1]
+ smlal v26.4s, v14.4h, v2.4h[1]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlsl v30.4s, v14.4h, v5.4h[1]
+
+
+ smlal v24.4s, v15.4h, v0.4h[3]
+ smlal v26.4s, v15.4h, v7.4h[1]
+ smlsl v28.4s, v15.4h, v1.4h[1]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v3.4h[0]
+ smlal v20.4s, v13.4h, v4.4h[2]
+ smlal v22.4s, v12.4h, v7.4h[0]
+ smlal v22.4s, v13.4h, v2.4h[2]
+ smlal v16.4s, v12.4h, v1.4h[0]
+ smlsl v16.4s, v13.4h, v6.4h[2]
+ smlal v18.4s, v12.4h, v5.4h[0]
+ smlsl v18.4s, v13.4h, v0.4h[2]
+
+
+ cmp x11,x7
+ bhs shift3
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+ smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v5.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v1.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v7.4h[2]
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+ smlal v24.4s, v14.4h, v6.4h[3]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlsl v28.4s, v14.4h, v1.4h[3]
+ smlal v30.4s, v14.4h, v7.4h[1]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlsl v26.4s, v15.4h, v2.4h[3]
+ smlal v28.4s, v15.4h, v7.4h[1]
+ smlal v30.4s, v15.4h, v4.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v5.4h[0]
+ smlal v20.4s, v13.4h, v2.4h[2]
+ smlal v22.4s, v12.4h, v1.4h[0]
+ smlsl v22.4s, v13.4h, v7.4h[2]
+ smlsl v16.4s, v12.4h, v7.4h[0]
+ smlsl v16.4s, v13.4h, v3.4h[2]
+ smlsl v18.4s, v12.4h, v3.4h[0]
+ smlal v18.4s, v13.4h, v1.4h[2]
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+ smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v6.4h[0]
+ smlsl v20.4s, v11.4h, v1.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v2.4h[0]
+ smlal v22.4s, v11.4h, v4.4h[2]
+
+ smlal v16.4s, v10.4h, v2.4h[0]
+ smlsl v16.4s, v11.4h, v7.4h[2]
+
+ smlsl v18.4s, v10.4h, v6.4h[0]
+ smlsl v18.4s, v11.4h, v5.4h[2]
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+ smlal v24.4s, v14.4h, v4.4h[3]
+ smlsl v26.4s, v14.4h, v6.4h[1]
+ smlal v28.4s, v14.4h, v7.4h[3]
+ smlal v30.4s, v14.4h, v6.4h[3]
+
+
+ smlal v24.4s, v15.4h, v3.4h[3]
+ smlsl v26.4s, v15.4h, v3.4h[1]
+ smlal v28.4s, v15.4h, v2.4h[3]
+ smlsl v30.4s, v15.4h, v2.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v7.4h[0]
+ smlal v20.4s, v13.4h, v0.4h[2]
+ smlal v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v1.4h[2]
+ smlsl v16.4s, v12.4h, v3.4h[0]
+ smlal v16.4s, v13.4h, v2.4h[2]
+ smlal v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v3.4h[2]
+
+shift3:
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+ add x1,x1,#64
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v18.4h, v19.4h},[x1],#16
+ sub x1,x1,#96
+
+ mov x0,x8
+
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+ smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v2.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+ cmp x11,x10
+ bhs shift4
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v0.4h[1]
+ smlal v26.4s, v14.4h, v1.4h[3]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlal v30.4s, v14.4h, v6.4h[3]
+
+
+ smlsl v24.4s, v15.4h, v4.4h[1]
+ smlsl v26.4s, v15.4h, v0.4h[3]
+ smlsl v28.4s, v15.4h, v2.4h[3]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlal v20.4s, v12.4h, v7.4h[0]
+ smlal v20.4s, v13.4h, v5.4h[2]
+ smlal v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v7.4h[2]
+ smlal v16.4s, v12.4h, v3.4h[0]
+ smlsl v16.4s, v13.4h, v4.4h[2]
+ smlal v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v1.4h[2]
+
+ cmp x11,x9
+ bhs shift4
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+ smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v2.4h[0]
+ smlal v20.4s, v11.4h, v1.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v6.4h[0]
+ smlal v22.4s, v11.4h, v3.4h[2]
+
+ smlal v16.4s, v10.4h, v6.4h[0]
+ smlsl v16.4s, v11.4h, v7.4h[2]
+
+ smlal v18.4s, v10.4h, v2.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+
+ cmp x11,x5
+ bhs shift4
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+
+ smlsl v24.4s, v14.4h, v1.4h[1]
+ smlsl v26.4s, v14.4h, v7.4h[3]
+ smlal v28.4s, v14.4h, v1.4h[3]
+ smlal v30.4s, v14.4h, v4.4h[3]
+
+
+ smlal v24.4s, v15.4h, v2.4h[1]
+ smlal v26.4s, v15.4h, v5.4h[1]
+ smlsl v28.4s, v15.4h, v3.4h[1]
+ smlsl v30.4s, v15.4h, v4.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v5.4h[0]
+ smlsl v20.4s, v13.4h, v7.4h[2]
+ smlsl v22.4s, v12.4h, v1.4h[0]
+ smlal v22.4s, v13.4h, v1.4h[2]
+ smlsl v16.4s, v12.4h, v7.4h[0]
+ smlal v16.4s, v13.4h, v5.4h[2]
+ smlal v18.4s, v12.4h, v3.4h[0]
+ smlsl v18.4s, v13.4h, v3.4h[2]
+
+ cmp x11,x7
+ bhs shift4
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+ smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v0.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlal v22.4s, v11.4h, v6.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v4.4h[2]
+
+
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v3.4h[1]
+ smlsl v26.4s, v14.4h, v2.4h[1]
+ smlal v28.4s, v14.4h, v7.4h[3]
+ smlal v30.4s, v14.4h, v2.4h[3]
+
+
+ smlsl v24.4s, v15.4h, v0.4h[3]
+ smlal v26.4s, v15.4h, v4.4h[3]
+ smlal v28.4s, v15.4h, v6.4h[3]
+ smlsl v30.4s, v15.4h, v2.4h[1]
+
+
+ smlal v20.4s, v12.4h, v3.4h[0]
+ smlsl v20.4s, v13.4h, v6.4h[2]
+ smlal v22.4s, v12.4h, v7.4h[0]
+ smlsl v22.4s, v13.4h, v4.4h[2]
+ smlsl v16.4s, v12.4h, v1.4h[0]
+ smlal v16.4s, v13.4h, v0.4h[2]
+ smlal v18.4s, v12.4h, v5.4h[0]
+ smlsl v18.4s, v13.4h, v5.4h[2]
+
+
+ ld1 {v10.4h},[x0],x6
+ ld1 {v8.4h},[x0],x6
+ ld1 {v11.4h},[x0],x6
+ ld1 {v9.4h},[x0],x6
+
+
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v6.4h[0]
+ smlal v20.4s, v11.4h, v2.4h[2]
+
+
+ smlal v22.4s, v10.4h, v2.4h[0]
+ smlsl v22.4s, v11.4h, v0.4h[2]
+
+ smlsl v16.4s, v10.4h, v2.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+
+ smlal v18.4s, v10.4h, v6.4h[0]
+ smlsl v18.4s, v11.4h, v6.4h[2]
+
+
+ ld1 {v12.4h},[x0],x6
+ ld1 {v14.4h},[x0],x6
+ ld1 {v13.4h},[x0],x6
+ ld1 {v15.4h},[x0],x6
+
+
+
+
+ smlsl v24.4s, v14.4h, v5.4h[1]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlsl v28.4s, v14.4h, v2.4h[1]
+ smlal v30.4s, v14.4h, v0.4h[3]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlsl v26.4s, v15.4h, v1.4h[1]
+ smlal v28.4s, v15.4h, v0.4h[3]
+ smlsl v30.4s, v15.4h, v0.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v4.4h[2]
+ smlal v22.4s, v12.4h, v3.4h[0]
+ smlsl v22.4s, v13.4h, v5.4h[2]
+ smlsl v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v6.4h[2]
+ smlal v18.4s, v12.4h, v7.4h[0]
+ smlsl v18.4s, v13.4h, v7.4h[2]
+
+shift4:
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+ sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+ st1 { v30.4h, v31.4h},[x1],#16
+ st1 { v12.4h, v13.4h},[x1],#16
+ st1 { v14.4h, v15.4h},[x1],#16
+ st1 { v18.4h, v19.4h},[x1],#16
+
+ add x1,x1,#96
+
+ subs x14,x14,#1
+ bne dct_stage1
+second_stage_dct:
+// mov x0,x1
+ ldp x8, x7,[sp],#16
+ ldp x0, x1,[sp],#16
+
+// add x4,x2,x8, lsl #1 @ x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+// add x5,x8,x8, lsl #1 @
+// sub x0,x0,#512
+ mov x11,#0xfffffff0
+ mov x5, #0xffffff00
+ ldr w6, x5_addr
+ ldr w9, x9_addr
+// sub x1,x1,#2048
+ mov x4,x1
+ mov x10,#240
+ mov x14,#8
+ b stage2
+
+// registers free :
+
+// arm registers used
+// x8 : prediction stride
+// x7 : destination stride
+// x1: temp buffer
+// x2 : pred buffer
+// x3 : destination buffer
+// x14 : loop counter
+//x0 : scratch buffer
+//x10 : used as stride
+// x4 : used to store the initial address
+//x12 : zero cols
+// x11 : 0xfffffff0
+// x5 : 0xffffff00
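+
+// stage 2 makes 8 passes (x14) over the transposed temp buffer,
+// reading 16-halfword strips (two ld1 loads, then a 240-byte skip
+// via x10) and writing transposed 4x4 blocks to the scratch buffer
+// pointed to by x0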
+dct_stage2:
+ add x4,x4,#32
+ mov x1,x4
+stage2:
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+ smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlal v20.4s, v11.4h, v0.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlal v22.4s, v11.4h, v1.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+ cmp x12,x11
+ bhs stage2_shift1
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v1.4h[1]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlal v28.4s, v14.4h, v6.4h[1]
+ smlsl v30.4s, v14.4h, v7.4h[1]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlal v26.4s, v15.4h, v5.4h[1]
+ smlsl v28.4s, v15.4h, v7.4h[1]
+ smlsl v30.4s, v15.4h, v3.4h[3]
+
+
+ smlal v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v1.4h[2]
+ smlal v22.4s, v12.4h, v3.4h[0]
+ smlal v22.4s, v13.4h, v4.4h[2]
+ smlal v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v7.4h[2]
+ smlal v18.4s, v12.4h, v7.4h[0]
+ smlsl v18.4s, v13.4h, v5.4h[2]
+ cmp x12,x5
+ bhs stage2_shift1
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+ smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v2.4h[0]
+ smlal v20.4s, v11.4h, v2.4h[2]
+
+
+ smlal v22.4s, v10.4h, v6.4h[0]
+ smlal v22.4s, v11.4h, v7.4h[2]
+
+ smlsl v16.4s, v10.4h, v6.4h[0]
+ smlsl v16.4s, v11.4h, v3.4h[2]
+
+ smlsl v18.4s, v10.4h, v2.4h[0]
+ smlsl v18.4s, v11.4h, v1.4h[2]
+
+ cmp x12,x6
+ bhs stage2_shift1
+
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+ smlal v24.4s, v14.4h, v3.4h[1]
+ smlsl v26.4s, v14.4h, v6.4h[1]
+ smlsl v28.4s, v14.4h, v0.4h[1]
+ smlsl v30.4s, v14.4h, v6.4h[3]
+
+
+ smlal v24.4s, v15.4h, v3.4h[3]
+ smlsl v26.4s, v15.4h, v4.4h[3]
+ smlsl v28.4s, v15.4h, v2.4h[3]
+ smlal v30.4s, v15.4h, v5.4h[3]
+
+
+ smlal v20.4s, v12.4h, v3.4h[0]
+ smlal v20.4s, v13.4h, v3.4h[2]
+ smlsl v22.4s, v12.4h, v7.4h[0]
+ smlsl v22.4s, v13.4h, v5.4h[2]
+ smlsl v16.4s, v12.4h, v1.4h[0]
+ smlsl v16.4s, v13.4h, v1.4h[2]
+ smlsl v18.4s, v12.4h, v5.4h[0]
+ smlal v18.4s, v13.4h, v7.4h[2]
+
+ cmp x12,x9
+ bhs stage2_shift1
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlal v20.4s, v11.4h, v4.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v2.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v6.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v0.4h[2]
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+ smlal v24.4s, v14.4h, v5.4h[1]
+ smlsl v26.4s, v14.4h, v0.4h[2]
+ smlal v28.4s, v14.4h, v5.4h[3]
+ smlal v30.4s, v14.4h, v4.4h[3]
+
+
+ smlal v24.4s, v15.4h, v5.4h[3]
+ smlsl v26.4s, v15.4h, v1.4h[1]
+ smlal v28.4s, v15.4h, v3.4h[1]
+ smlsl v30.4s, v15.4h, v7.4h[3]
+
+
+ smlal v20.4s, v12.4h, v5.4h[0]
+ smlal v20.4s, v13.4h, v5.4h[2]
+ smlsl v22.4s, v12.4h, v1.4h[0]
+ smlsl v22.4s, v13.4h, v0.4h[2]
+ smlal v16.4s, v12.4h, v7.4h[0]
+ smlal v16.4s, v13.4h, v4.4h[2]
+ smlal v18.4s, v12.4h, v3.4h[0]
+ smlal v18.4s, v13.4h, v6.4h[2]
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+
+
+ smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v6.4h[0]
+ smlal v20.4s, v11.4h, v6.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v2.4h[0]
+ smlsl v22.4s, v11.4h, v3.4h[2]
+
+ smlal v16.4s, v10.4h, v2.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+
+ smlsl v18.4s, v10.4h, v6.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+ smlal v24.4s, v14.4h, v7.4h[1]
+ smlsl v26.4s, v14.4h, v5.4h[3]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlsl v30.4s, v14.4h, v2.4h[3]
+
+
+ smlal v24.4s, v15.4h, v7.4h[3]
+ smlsl v26.4s, v15.4h, v7.4h[1]
+ smlal v28.4s, v15.4h, v6.4h[3]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlal v20.4s, v12.4h, v7.4h[0]
+ smlal v20.4s, v13.4h, v7.4h[2]
+ smlsl v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v6.4h[2]
+ smlal v16.4s, v12.4h, v3.4h[0]
+ smlal v16.4s, v13.4h, v5.4h[2]
+ smlsl v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v4.4h[2]
+
+stage2_shift1:
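+    // stage-2 butterfly and rounding: x = (a +/- b + 2048) >> 12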
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12 (shift_stage2_idct)
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12 (shift_stage2_idct)
+
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+ st1 { v30.4h, v31.4h},[x0],#16
+ st1 { v12.4h, v13.4h},[x0],#16
+ st1 { v14.4h, v15.4h},[x0],#16
+ st1 { v18.4h, v19.4h},[x0],#16
+
+ mov x1,x4
+
+
+
+
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlal v20.4s, v11.4h, v4.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlal v22.4s, v11.4h, v5.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v6.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v7.4h[2]
+
+ cmp x12,x11
+ bhs stage2_shift2
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+ smlsl v24.4s, v14.4h, v4.4h[3]
+ smlsl v26.4s, v14.4h, v2.4h[1]
+ smlsl v28.4s, v14.4h, v0.4h[1]
+ smlsl v30.4s, v14.4h, v2.4h[3]
+
+
+ smlsl v24.4s, v15.4h, v0.4h[3]
+ smlsl v26.4s, v15.4h, v3.4h[1]
+ smlsl v28.4s, v15.4h, v6.4h[3]
+ smlal v30.4s, v15.4h, v5.4h[3]
+
+
+ smlsl v20.4s, v12.4h, v7.4h[0]
+ smlsl v20.4s, v13.4h, v2.4h[2]
+ smlsl v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v0.4h[2]
+ smlsl v16.4s, v12.4h, v3.4h[0]
+ smlsl v16.4s, v13.4h, v3.4h[2]
+ smlsl v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v6.4h[2]
+
+ cmp x12,x5
+ bhs stage2_shift2
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+
+
+
+ smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v2.4h[0]
+ smlsl v20.4s, v11.4h, v6.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v6.4h[0]
+ smlal v22.4s, v11.4h, v4.4h[2]
+
+ smlal v16.4s, v10.4h, v6.4h[0]
+ smlal v16.4s, v11.4h, v0.4h[2]
+
+ smlal v18.4s, v10.4h, v2.4h[0]
+ smlal v18.4s, v11.4h, v5.4h[2]
+
+ cmp x12,x6
+ bhs stage2_shift2
+
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v2.4h[3]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlsl v28.4s, v14.4h, v5.4h[3]
+ smlsl v30.4s, v14.4h, v0.4h[3]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlsl v26.4s, v15.4h, v6.4h[3]
+ smlsl v28.4s, v15.4h, v0.4h[3]
+ smlal v30.4s, v15.4h, v7.4h[3]
+
+
+ smlal v20.4s, v12.4h, v5.4h[0]
+ smlal v20.4s, v13.4h, v0.4h[2]
+ smlal v22.4s, v12.4h, v1.4h[0]
+ smlal v22.4s, v13.4h, v6.4h[2]
+ smlal v16.4s, v12.4h, v7.4h[0]
+ smlsl v16.4s, v13.4h, v2.4h[2]
+ smlsl v18.4s, v12.4h, v3.4h[0]
+ smlsl v18.4s, v13.4h, v4.4h[2]
+
+ cmp x12,x9
+ bhs stage2_shift2
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+
+ smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v7.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v1.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v5.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v3.4h[2]
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+ smlsl v24.4s, v14.4h, v0.4h[1]
+ smlal v26.4s, v14.4h, v6.4h[1]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlsl v30.4s, v14.4h, v1.4h[1]
+
+
+ smlsl v24.4s, v15.4h, v3.4h[3]
+ smlal v26.4s, v15.4h, v0.4h[1]
+ smlsl v28.4s, v15.4h, v5.4h[1]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v3.4h[0]
+ smlsl v20.4s, v13.4h, v1.4h[2]
+ smlsl v22.4s, v12.4h, v7.4h[0]
+ smlal v22.4s, v13.4h, v3.4h[2]
+ smlal v16.4s, v12.4h, v1.4h[0]
+ smlal v16.4s, v13.4h, v7.4h[2]
+ smlsl v18.4s, v12.4h, v5.4h[0]
+ smlsl v18.4s, v13.4h, v2.4h[2]
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v6.4h[0]
+ smlal v20.4s, v11.4h, v5.4h[2]
+
+
+ smlal v22.4s, v10.4h, v2.4h[0]
+ smlal v22.4s, v11.4h, v7.4h[2]
+
+ smlsl v16.4s, v10.4h, v2.4h[0]
+ smlsl v16.4s, v11.4h, v4.4h[2]
+
+ smlal v18.4s, v10.4h, v6.4h[0]
+ smlal v18.4s, v11.4h, v1.4h[2]
+
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+ smlal v24.4s, v14.4h, v1.4h[1]
+ smlsl v26.4s, v14.4h, v0.4h[3]
+ smlal v28.4s, v14.4h, v1.4h[3]
+ smlsl v30.4s, v14.4h, v3.4h[1]
+
+
+ smlal v24.4s, v15.4h, v5.4h[3]
+ smlsl v26.4s, v15.4h, v5.4h[1]
+ smlal v28.4s, v15.4h, v4.4h[3]
+ smlsl v30.4s, v15.4h, v4.4h[1]
+
+
+ smlal v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v3.4h[2]
+ smlsl v22.4s, v12.4h, v3.4h[0]
+ smlsl v22.4s, v13.4h, v2.4h[2]
+ smlal v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v1.4h[2]
+ smlsl v18.4s, v12.4h, v7.4h[0]
+ smlsl v18.4s, v13.4h, v0.4h[2]
+
+stage2_shift2:
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v30.4h, v8.4s,#shift_stage2_idct  //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
+
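+    // spill the lower halves of v24-v27 to GPRs; the transpose below
+    // reuses these registers as scratch and they are restored afterwards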
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+ st1 { v30.4h, v31.4h},[x0],#16
+ st1 { v12.4h, v13.4h},[x0],#16
+ st1 { v14.4h, v15.4h},[x0],#16
+ st1 { v18.4h, v19.4h},[x0],#16
+
+
+ mov x1,x4
+
+
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+ smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v7.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v6.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v5.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v4.4h[2]
+
+ cmp x12,x11
+ bhs stage2_shift3
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+ smlsl v24.4s, v14.4h, v5.4h[1]
+ smlsl v26.4s, v14.4h, v7.4h[3]
+ smlal v28.4s, v14.4h, v5.4h[3]
+ smlal v30.4s, v14.4h, v3.4h[1]
+
+
+ smlal v24.4s, v15.4h, v2.4h[1]
+ smlal v26.4s, v15.4h, v1.4h[1]
+ smlal v28.4s, v15.4h, v4.4h[3]
+ smlsl v30.4s, v15.4h, v7.4h[3]
+
+
+ smlsl v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v6.4h[2]
+ smlsl v22.4s, v12.4h, v3.4h[0]
+ smlal v22.4s, v13.4h, v3.4h[2]
+ smlsl v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v0.4h[2]
+ smlsl v18.4s, v12.4h, v7.4h[0]
+ smlal v18.4s, v13.4h, v2.4h[2]
+
+ cmp x12,x5
+ bhs stage2_shift3
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+
+ smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v2.4h[0]
+ smlsl v20.4s, v11.4h, v5.4h[2]
+
+
+ smlal v22.4s, v10.4h, v6.4h[0]
+ smlsl v22.4s, v11.4h, v0.4h[2]
+
+ smlsl v16.4s, v10.4h, v6.4h[0]
+ smlsl v16.4s, v11.4h, v4.4h[2]
+
+ smlsl v18.4s, v10.4h, v2.4h[0]
+ smlal v18.4s, v11.4h, v6.4h[2]
+
+ cmp x12,x6
+ bhs stage2_shift3
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+ smlsl v24.4s, v14.4h, v7.4h[1]
+ smlal v26.4s, v14.4h, v2.4h[1]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlsl v30.4s, v14.4h, v5.4h[1]
+
+
+ smlal v24.4s, v15.4h, v0.4h[3]
+ smlal v26.4s, v15.4h, v7.4h[1]
+ smlsl v28.4s, v15.4h, v1.4h[1]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v3.4h[0]
+ smlal v20.4s, v13.4h, v4.4h[2]
+ smlal v22.4s, v12.4h, v7.4h[0]
+ smlal v22.4s, v13.4h, v2.4h[2]
+ smlal v16.4s, v12.4h, v1.4h[0]
+ smlsl v16.4s, v13.4h, v6.4h[2]
+ smlal v18.4s, v12.4h, v5.4h[0]
+ smlsl v18.4s, v13.4h, v0.4h[2]
+
+ cmp x12,x9
+ bhs stage2_shift3
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v5.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v1.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlal v18.4s, v11.4h, v7.4h[2]
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+ smlal v24.4s, v14.4h, v6.4h[3]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlsl v28.4s, v14.4h, v1.4h[3]
+ smlal v30.4s, v14.4h, v7.4h[1]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlsl v26.4s, v15.4h, v2.4h[3]
+ smlal v28.4s, v15.4h, v7.4h[1]
+ smlal v30.4s, v15.4h, v4.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v5.4h[0]
+ smlal v20.4s, v13.4h, v2.4h[2]
+ smlal v22.4s, v12.4h, v1.4h[0]
+ smlsl v22.4s, v13.4h, v7.4h[2]
+ smlsl v16.4s, v12.4h, v7.4h[0]
+ smlsl v16.4s, v13.4h, v3.4h[2]
+ smlsl v18.4s, v12.4h, v3.4h[0]
+ smlal v18.4s, v13.4h, v1.4h[2]
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
+ smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v6.4h[0]
+ smlsl v20.4s, v11.4h, v1.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v2.4h[0]
+ smlal v22.4s, v11.4h, v4.4h[2]
+
+ smlal v16.4s, v10.4h, v2.4h[0]
+ smlsl v16.4s, v11.4h, v7.4h[2]
+
+ smlsl v18.4s, v10.4h, v6.4h[0]
+ smlsl v18.4s, v11.4h, v5.4h[2]
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+ smlal v24.4s, v14.4h, v4.4h[3]
+ smlsl v26.4s, v14.4h, v6.4h[1]
+ smlal v28.4s, v14.4h, v7.4h[3]
+ smlal v30.4s, v14.4h, v6.4h[3]
+
+
+ smlal v24.4s, v15.4h, v3.4h[3]
+ smlsl v26.4s, v15.4h, v3.4h[1]
+ smlal v28.4s, v15.4h, v2.4h[3]
+ smlsl v30.4s, v15.4h, v2.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v7.4h[0]
+ smlal v20.4s, v13.4h, v0.4h[2]
+ smlal v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v1.4h[2]
+ smlsl v16.4s, v12.4h, v3.4h[0]
+ smlal v16.4s, v13.4h, v2.4h[2]
+ smlal v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v3.4h[2]
+
+stage2_shift3:
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v30.4h, v8.4s,#shift_stage2_idct  //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+ st1 { v30.4h, v31.4h},[x0],#16
+ st1 { v12.4h, v13.4h},[x0],#16
+ st1 { v14.4h, v15.4h},[x0],#16
+ st1 { v18.4h, v19.4h},[x0],#16
+
+
+
+ mov x1,x4
+
+
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smull v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v3.4h[2]
+
+
+ smull v22.4s, v10.4h, v0.4h[0]
+ smlsl v22.4s, v11.4h, v2.4h[2]
+
+ smull v16.4s, v10.4h, v0.4h[0]
+ smlsl v16.4s, v11.4h, v1.4h[2]
+
+ smull v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v0.4h[2]
+
+ cmp x12,x11
+ bhs stage2_shift4
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+ smlal v24.4s, v14.4h, v0.4h[1]
+ smlal v26.4s, v14.4h, v1.4h[3]
+ smlal v28.4s, v14.4h, v4.4h[1]
+ smlal v30.4s, v14.4h, v6.4h[3]
+
+
+ smlsl v24.4s, v15.4h, v4.4h[1]
+ smlsl v26.4s, v15.4h, v0.4h[3]
+ smlsl v28.4s, v15.4h, v2.4h[3]
+ smlsl v30.4s, v15.4h, v6.4h[1]
+
+
+ smlal v20.4s, v12.4h, v7.4h[0]
+ smlal v20.4s, v13.4h, v5.4h[2]
+ smlal v22.4s, v12.4h, v5.4h[0]
+ smlsl v22.4s, v13.4h, v7.4h[2]
+ smlal v16.4s, v12.4h, v3.4h[0]
+ smlsl v16.4s, v13.4h, v4.4h[2]
+ smlal v18.4s, v12.4h, v1.4h[0]
+ smlsl v18.4s, v13.4h, v1.4h[2]
+
+ cmp x12,x5
+ bhs stage2_shift4
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+
+ smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
+ smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v2.4h[0]
+ smlal v20.4s, v11.4h, v1.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v6.4h[0]
+ smlal v22.4s, v11.4h, v3.4h[2]
+
+ smlal v16.4s, v10.4h, v6.4h[0]
+ smlsl v16.4s, v11.4h, v7.4h[2]
+
+ smlal v18.4s, v10.4h, v2.4h[0]
+ smlsl v18.4s, v11.4h, v2.4h[2]
+
+ cmp x12,x6
+ bhs stage2_shift4
+
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+
+
+ smlsl v24.4s, v14.4h, v1.4h[1]
+ smlsl v26.4s, v14.4h, v7.4h[3]
+ smlal v28.4s, v14.4h, v1.4h[3]
+ smlal v30.4s, v14.4h, v4.4h[3]
+
+
+ smlal v24.4s, v15.4h, v2.4h[1]
+ smlal v26.4s, v15.4h, v5.4h[1]
+ smlsl v28.4s, v15.4h, v3.4h[1]
+ smlsl v30.4s, v15.4h, v4.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v5.4h[0]
+ smlsl v20.4s, v13.4h, v7.4h[2]
+ smlsl v22.4s, v12.4h, v1.4h[0]
+ smlal v22.4s, v13.4h, v1.4h[2]
+ smlsl v16.4s, v12.4h, v7.4h[0]
+ smlal v16.4s, v13.4h, v5.4h[2]
+ smlal v18.4s, v12.4h, v3.4h[0]
+ smlsl v18.4s, v13.4h, v3.4h[2]
+
+ cmp x12,x9
+ bhs stage2_shift4
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+ smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
+ smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlal v20.4s, v10.4h, v0.4h[0]
+ smlsl v20.4s, v11.4h, v0.4h[2]
+
+
+ smlsl v22.4s, v10.4h, v0.4h[0]
+ smlal v22.4s, v11.4h, v6.4h[2]
+
+ smlsl v16.4s, v10.4h, v0.4h[0]
+ smlal v16.4s, v11.4h, v2.4h[2]
+
+ smlal v18.4s, v10.4h, v0.4h[0]
+ smlsl v18.4s, v11.4h, v4.4h[2]
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+
+ smlal v24.4s, v14.4h, v3.4h[1]
+ smlsl v26.4s, v14.4h, v2.4h[1]
+ smlal v28.4s, v14.4h, v7.4h[3]
+ smlal v30.4s, v14.4h, v2.4h[3]
+
+
+ smlsl v24.4s, v15.4h, v0.4h[3]
+ smlal v26.4s, v15.4h, v4.4h[3]
+ smlal v28.4s, v15.4h, v6.4h[3]
+ smlsl v30.4s, v15.4h, v2.4h[1]
+
+
+ smlal v20.4s, v12.4h, v3.4h[0]
+ smlsl v20.4s, v13.4h, v6.4h[2]
+ smlal v22.4s, v12.4h, v7.4h[0]
+ smlsl v22.4s, v13.4h, v4.4h[2]
+ smlsl v16.4s, v12.4h, v1.4h[0]
+ smlal v16.4s, v13.4h, v0.4h[2]
+ smlal v18.4s, v12.4h, v5.4h[0]
+ smlsl v18.4s, v13.4h, v5.4h[2]
+
+
+ ld1 {v10.4h, v11.4h},[x1],#16
+ ld1 {v8.4h, v9.4h},[x1],x10
+
+
+
+
+ smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0)
+ smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
+ smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
+ smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+
+
+
+
+ smlsl v20.4s, v10.4h, v6.4h[0]
+ smlal v20.4s, v11.4h, v2.4h[2]
+
+
+ smlal v22.4s, v10.4h, v2.4h[0]
+ smlsl v22.4s, v11.4h, v0.4h[2]
+
+ smlsl v16.4s, v10.4h, v2.4h[0]
+ smlal v16.4s, v11.4h, v3.4h[2]
+
+ smlal v18.4s, v10.4h, v6.4h[0]
+ smlsl v18.4s, v11.4h, v6.4h[2]
+
+
+ ld1 {v12.4h, v13.4h},[x1],#16
+ ld1 {v14.4h, v15.4h},[x1],x10
+
+
+
+ smlsl v24.4s, v14.4h, v5.4h[1]
+ smlal v26.4s, v14.4h, v3.4h[3]
+ smlsl v28.4s, v14.4h, v2.4h[1]
+ smlal v30.4s, v14.4h, v0.4h[3]
+
+
+ smlal v24.4s, v15.4h, v1.4h[3]
+ smlsl v26.4s, v15.4h, v1.4h[1]
+ smlal v28.4s, v15.4h, v0.4h[3]
+ smlsl v30.4s, v15.4h, v0.4h[1]
+
+
+ smlsl v20.4s, v12.4h, v1.4h[0]
+ smlal v20.4s, v13.4h, v4.4h[2]
+ smlal v22.4s, v12.4h, v3.4h[0]
+ smlsl v22.4s, v13.4h, v5.4h[2]
+ smlsl v16.4s, v12.4h, v5.4h[0]
+ smlal v16.4s, v13.4h, v6.4h[2]
+ smlal v18.4s, v12.4h, v7.4h[0]
+ smlsl v18.4s, v13.4h, v7.4h[2]
+
+stage2_shift4:
+ add v8.4s, v20.4s , v24.4s
+ sub v10.4s, v20.4s , v24.4s
+
+ add v12.4s, v22.4s , v26.4s
+ sub v24.4s, v22.4s , v26.4s
+
+ add v14.4s, v16.4s , v28.4s
+ sub v26.4s, v16.4s , v28.4s
+
+
+ add v16.4s, v18.4s , v30.4s
+ sub v28.4s, v18.4s , v30.4s
+
+
+    sqrshrn v30.4h, v8.4s,#shift_stage2_idct  //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
+    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
+    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
+    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
+    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
+    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
+    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
+    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
+
+
+
+ umov x15,v24.d[0]
+ umov x16,v25.d[0]
+ umov x19,v26.d[0]
+ umov x20,v27.d[0]
+
+ trn1 v24.4h, v30.4h, v12.4h
+ trn2 v25.4h, v30.4h, v12.4h
+ trn1 v26.4h, v31.4h, v13.4h
+ trn2 v27.4h, v31.4h, v13.4h
+
+ trn1 v30.2s, v24.2s, v26.2s
+ trn2 v31.2s, v24.2s, v26.2s
+ trn1 v12.2s, v25.2s, v27.2s
+ trn2 v13.2s, v25.2s, v27.2s
+
+ trn1 v24.4h, v14.4h, v18.4h
+ trn2 v25.4h, v14.4h, v18.4h
+ trn1 v26.4h, v15.4h, v19.4h
+ trn2 v27.4h, v15.4h, v19.4h
+
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v15.2s, v24.2s, v26.2s
+ trn1 v18.2s, v25.2s, v27.2s
+ trn2 v19.2s, v25.2s, v27.2s
+
+ mov v24.d[0],x15
+ mov v25.d[0],x16
+ mov v26.d[0],x19
+ mov v27.d[0],x20
+
+ st1 { v30.4h, v31.4h},[x0],#16
+ st1 { v12.4h, v13.4h},[x0],#16
+ st1 { v14.4h, v15.4h},[x0],#16
+ st1 { v18.4h, v19.4h},[x0],#16
+
+
+
+
+ sub x0,x0,#256
+prediction_buffer:
+
+
+ ld1 {v12.8h},[x0],#16
+ ld1 {v14.8h},[x0],#16
+
+ add x0,x0,#32
+
+ ld1 {v16.8h},[x0],#16
+ ld1 {v18.8h},[x0],#16
+ add x0,x0,#32
+
+ ld1 {v20.8h},[x0],#16
+ ld1 {v22.8h},[x0],#16
+
+
+ add x0,x0,#32
+
+ ld1 {v24.8h},[x0],#16
+ ld1 {v26.8h},[x0],#16
+
+
+
+
+
+// d12 = x0 values 1-4
+// d13 = x2 values 1-4
+// d14 = x1 values 1-4
+// d15 = x3 values 1-4
+
+// d16 = x0 values 5-8
+// d17 = x2 values 5-8
+// d18 = x1 values 5-8
+// d19 = x3 values 5-8
+
+// d20 = x0 values 9-12
+// d21 = x2 values 9-12
+// d22 = x1 values 9-12
+// d23 = x3 values 9-12
+
+// d24 = x0 values 13-16
+// d25 = x2 values 13-16
+// d26 = x1 values 13-16
+// d27 = x3 values 13-16
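+
+// after the 64-bit half swaps below re-pair the rows, the uaddw/sqxtun
+// pairs implement the reconstruction step: widen the 8-bit prediction,
+// add it to the 16-bit residue and saturate back to 8 bits. a minimal C
+// sketch of that step per 8-sample run (clip_u8 and the pointer names
+// are illustrative, not from this file):
+//
+//     for(col = 0; col < 8; col++)
+//         pu1_dst[col] = clip_u8((word32)pi2_resid[col] + pu1_pred[col]);
+//
+//     /* clip_u8(x): x < 0 ? 0 : (x > 255 ? 255 : x) */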
+
+ // swapping v12 upper and v16 lower 64bits
+ mov v13.d[0], v12.d[1]
+ mov v12.d[1], v16.d[0]
+ mov v16.d[0], v13.d[0]
+ // swapping v20 upper and v24 lower 64bits
+ mov v21.d[0], v20.d[1]
+ mov v20.d[1], v24.d[0]
+ mov v24.d[0], v21.d[0]
+    // swapping v14 upper and v18 lower 64bits
+ mov v15.d[0], v14.d[1]
+ mov v14.d[1], v18.d[0]
+ mov v18.d[0], v15.d[0]
+ // swapping v22 upper and v26 lower 64bits
+ mov v23.d[0], v22.d[1]
+ mov v22.d[1], v26.d[0]
+ mov v26.d[0], v23.d[0]
+
+
+ ld1 {v8.8b, v9.8b},[x2],x8
+ ld1 {v10.8b, v11.8b},[x2],x8
+ ld1 {v28.8b, v29.8b},[x2],x8
+ ld1 {v30.8b, v31.8b},[x2],x8
+
+
+ uaddw v12.8h, v12.8h , v8.8b
+ uaddw v20.8h, v20.8h , v9.8b
+ uaddw v14.8h, v14.8h , v10.8b
+ uaddw v22.8h, v22.8h , v11.8b
+ uaddw v16.8h, v16.8h , v28.8b
+ uaddw v24.8h, v24.8h , v29.8b
+ uaddw v18.8h, v18.8h , v30.8b
+ uaddw v26.8h, v26.8h , v31.8b
+ sub x2,x2,x8,lsl #2
+ add x2,x2,#16
+ sqxtun v12.8b, v12.8h
+ sqxtun v13.8b, v20.8h
+ sqxtun v20.8b, v14.8h
+ sqxtun v21.8b, v22.8h
+ sqxtun v14.8b, v16.8h
+ sqxtun v15.8b, v24.8h
+ sqxtun v22.8b, v18.8h
+ sqxtun v23.8b, v26.8h
+
+
+ st1 {v12.8b, v13.8b},[x3],x7
+ st1 {v20.8b, v21.8b},[x3],x7
+ st1 {v14.8b, v15.8b},[x3],x7
+ st1 {v22.8b, v23.8b},[x3],x7
+
+
+ sub x3,x3,x7,lsl #2
+ add x3,x3,#16
+
+ ld1 {v12.8h},[x0],#16
+ ld1 {v14.8h},[x0],#16
+
+ sub x0,x0,#96
+
+ ld1 {v16.8h},[x0],#16
+ ld1 {v18.8h},[x0],#16
+ sub x0,x0,#96
+
+ ld1 {v20.8h},[x0],#16
+ ld1 {v22.8h},[x0],#16
+
+
+ sub x0,x0,#96
+
+ ld1 {v24.8h},[x0],#16
+ ld1 {v26.8h},[x0],#16
+
+
+ sub x0,x0,#64
+
+
+ // swapping v12 upper and v16 lower 64bits
+ mov v13.d[0], v12.d[1]
+ mov v12.d[1], v16.d[0]
+ mov v16.d[0], v13.d[0]
+ // swapping v20 upper and v24 lower 64bits
+ mov v21.d[0], v20.d[1]
+ mov v20.d[1], v24.d[0]
+ mov v24.d[0], v21.d[0]
+    // swapping v14 upper and v18 lower 64bits
+ mov v15.d[0], v14.d[1]
+ mov v14.d[1], v18.d[0]
+ mov v18.d[0], v15.d[0]
+ // swapping v22 upper and v26 lower 64bits
+ mov v23.d[0], v22.d[1]
+ mov v22.d[1], v26.d[0]
+ mov v26.d[0], v23.d[0]
+
+
+ ld1 {v8.8b, v9.8b},[x2],x8
+ ld1 {v10.8b, v11.8b},[x2],x8
+ ld1 {v28.8b, v29.8b},[x2],x8
+ ld1 {v30.8b, v31.8b},[x2],x8
+
+
+ uaddw v12.8h, v12.8h , v8.8b
+ uaddw v20.8h, v20.8h , v9.8b
+ uaddw v14.8h, v14.8h , v10.8b
+ uaddw v22.8h, v22.8h , v11.8b
+ uaddw v16.8h, v16.8h , v28.8b
+ uaddw v24.8h, v24.8h , v29.8b
+ uaddw v18.8h, v18.8h , v30.8b
+ uaddw v26.8h, v26.8h , v31.8b
+ sub x2,x2,#16
+
+ sqxtun v12.8b, v12.8h
+ sqxtun v13.8b, v20.8h
+ sqxtun v20.8b, v14.8h
+ sqxtun v21.8b, v22.8h
+ sqxtun v14.8b, v16.8h
+ sqxtun v15.8b, v24.8h
+ sqxtun v22.8b, v18.8h
+ sqxtun v23.8b, v26.8h
+
+
+ st1 {v12.8b, v13.8b},[x3],x7
+ st1 {v20.8b, v21.8b},[x3],x7
+ st1 {v14.8b, v15.8b},[x3],x7
+ st1 {v22.8b, v23.8b},[x3],x7
+
+ sub x3,x3,#16
+
+ subs x14,x14,#1
+ bne dct_stage2
+ // ldmfd sp!,{x0-x12,pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_4x4.s b/common/arm64/ihevc_itrans_recon_4x4.s
new file mode 100644
index 0000000..b18fb89
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_4x4.s
@@ -0,0 +1,237 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// * ihevc_itrans_recon_4x4.s
+// *
+// * @brief
+// * contains function definitions for single stage inverse transform
+// *
+// * @author
+// * naveen sr
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_4x4()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+//*/
+// /**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform and reconstruction for 4x4
+// * input block
+// *
+// * @par description:
+// * performs inverse transform and adds the prediction data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// * input 4x4 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 4x4 buffer for storing inverse transform
+// * 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 4x4 block
+// *
+// * @param[out] pu1_dst
+// * output 4x4 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] shift
+// * output shift
+// *
+// * @param[in] zero_cols
+// * zero columns in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+//void ihevc_itrans_recon_4x4(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 zero_cols)
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// x4 => src_strd
+// x5 => pred_strd
+// x6 => dst_strd
+// x7 => zero_cols
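+
+// a minimal C sketch of one 4-point (i)dct stage, matching the e[]/o[]
+// annotations in the code below; add = 1 << (shift - 1) and clip_s16
+// saturates to the word16 range (variable names are illustrative):
+//
+//     o[0] = 83 * src[1] + 36 * src[3];
+//     o[1] = 36 * src[1] - 83 * src[3];
+//     e[0] = 64 * (src[0] + src[2]);
+//     e[1] = 64 * (src[0] - src[2]);
+//     dst[0] = clip_s16((e[0] + o[0] + add) >> shift);
+//     dst[1] = clip_s16((e[1] + o[1] + add) >> shift);
+//     dst[2] = clip_s16((e[1] - o[1] + add) >> shift);
+//     dst[3] = clip_s16((e[0] - o[0] + add) >> shift);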
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+
+
+.globl ihevc_itrans_recon_4x4_av8
+
+.extern g_ai2_ihevc_trans_4_transpose
+
+.type ihevc_itrans_recon_4x4_av8, %function
+
+ihevc_itrans_recon_4x4_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ adrp x8, :got:g_ai2_ihevc_trans_4_transpose
+ ldr x8, [x8, #:got_lo12:g_ai2_ihevc_trans_4_transpose]
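+    // x8 now holds the run-time address of the 4-point transform table,
+    // resolved position-independently through the GOT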
+
+ add x4,x4,x4 // src_strd in terms of word16
+ add x9,x0,x4 // pi2_src[0] + src_strd
+
+ ld1 {v4.4h},[x8] //loading first row of g_ai2_ihevc_trans_4_transpose
+ // d4 = {36,64,83,64}
+ //index = 3 2 1 0
+ add x10,x9,x4, lsl #1 // 3*src_strd
+ add x4,x4,x4
+ ld1 {v1.4h},[x9] //loading pi2_src 2nd row
+ ld1 {v3.4h},[x10] //loading pi2_src 4th row
+ ld1 {v0.4h},[x0],x4 //loading pi2_src 1st row
+ ld1 {v2.4h},[x0],x4 //loading pi2_src 3rd row
+
+
+ // first stage computation starts
+ smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1]
+ smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+ smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
+ ld1 {v22.s}[0],[x2],x5
+ smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+
+ saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
+ ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
+ shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2])
+ shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+ add v14.4s, v10.4s , v6.4s //((e[0] + o[0] )
+ add v16.4s, v12.4s , v8.4s //((e[1] + o[1])
+ sub v18.4s, v12.4s , v8.4s //((e[1] - o[1])
+ sub v20.4s, v10.4s , v6.4s //((e[0] - o[0])
+
+    sqrshrn v28.4h, v14.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    sqrshrn v29.4h, v16.4s,#shift_stage1_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    sqrshrn v30.4h, v18.4s,#shift_stage1_idct //pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    sqrshrn v31.4h, v20.4s,#shift_stage1_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+
+ trn1 v24.4h, v28.4h, v29.4h
+ trn2 v25.4h, v28.4h, v29.4h
+ trn1 v26.4h, v30.4h, v31.4h
+ trn2 v27.4h, v30.4h, v31.4h
+ trn1 v0.2s, v24.2s, v26.2s
+ trn2 v2.2s, v24.2s, v26.2s
+ trn1 v1.2s, v25.2s, v27.2s
+ trn2 v3.2s, v25.2s, v27.2s
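+    // the two trn1/trn2 passes above (first on .4h lanes, then on .2s
+    // lanes) together transpose the 4x4 block so that the second stage
+    // can again process the columns as rows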
+
+ // first stage ends
+ // output in d0,d1,d2,d3
+ // second stage starts
+ smull v6.4s, v1.4h, v4.4h[1] //83 * pi2_src[1]
+ ld1 {v22.s}[1],[x2],x5
+ smlal v6.4s, v3.4h, v4.4h[3] //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
+ smull v8.4s, v1.4h, v4.4h[3] //36 * pi2_src[1]
+ smlsl v8.4s, v3.4h, v4.4h[1] //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
+ ld1 {v23.s}[0],[x2],x5
+
+ saddl v10.4s, v0.4h, v2.4h //pi2_src[0] + pi2_src[2]
+ ssubl v12.4s, v0.4h, v2.4h //pi2_src[0] - pi2_src[2]
+ shl v10.4s, v10.4s,#6 //e[0] = 64*(pi2_src[0] + pi2_src[2])
+ shl v12.4s, v12.4s,#6 //e[1] = 64*(pi2_src[0] - pi2_src[2])
+
+
+ add v14.4s, v10.4s , v6.4s //((e[0] + o[0] )
+ add v16.4s, v12.4s , v8.4s //((e[1] + o[1])
+ sub v18.4s, v12.4s , v8.4s //((e[1] - o[1])
+ sub v20.4s, v10.4s , v6.4s //((e[0] - o[0])
+
+    sqrshrn v28.4h, v14.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift)
+    sqrshrn v29.4h, v16.4s,#shift_stage2_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift)
+    sqrshrn v30.4h, v18.4s,#shift_stage2_idct //pi2_out[2] = clip_s16((e[1] - o[1] + add)>>shift)
+    sqrshrn v31.4h, v20.4s,#shift_stage2_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift)
+ ld1 {v23.s}[1],[x2],x5
+
+ trn1 v24.4h, v28.4h, v29.4h
+ trn2 v25.4h, v28.4h, v29.4h
+ trn1 v26.4h, v30.4h, v31.4h
+ trn2 v27.4h, v30.4h, v31.4h
+ trn1 v0.2s, v24.2s, v26.2s
+ trn2 v2.2s, v24.2s, v26.2s
+ trn1 v1.2s, v25.2s, v27.2s
+ trn2 v3.2s, v25.2s, v27.2s
+    // second stage computation ends
+    // output in d0,d1,d2,d3
+
+ // loading pred
+
+ mov v0.d[1],v1.d[0]
+ mov v2.d[1],v3.d[0]
+
+ uaddw v0.8h, v0.8h , v22.8b // pi2_out(16bit) + pu1_pred(8bit)
+ uaddw v2.8h, v2.8h , v23.8b // pi2_out(16bit) + pu1_pred(8bit)
+ sqxtun v0.8b, v0.8h // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+ sqxtun v1.8b, v2.8h // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+ // storing destination
+ st1 {v0.s}[0],[x3],x6
+ st1 {v0.s}[1],[x3],x6
+ st1 {v1.s}[0],[x3],x6
+ st1 {v1.s}[1],[x3],x6
+
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
new file mode 100644
index 0000000..fa04b8e
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
@@ -0,0 +1,246 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// * ihevc_itrans_recon_4x4_ttype1.s
+// *
+// * @brief
+// * contains function definitions for inverse transform and reconstruction
+// *
+// *
+// * @author
+// * naveen sr
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_4x4_ttype1()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+
+///* all the functions here are replicated from ihevc_itrans.c and modified to */
+///* include reconstruction */
+//
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform type 1 (dst) and reconstruction
+// * for 4x4 input block
+// *
+// * @par description:
+// * performs inverse transform and adds the prediction data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// * input 4x4 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 4x4 buffer for storing inverse transform
+// * 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 4x4 block
+// *
+// * @param[out] pu1_dst
+// * output 4x4 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] zero_cols
+// * zero columns in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+//void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 zero_cols)
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// x4 => src_strd
+// x5 => pred_strd
+// x6 => dst_strd
+// x7 => zero_cols
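+
+// a minimal C sketch of one 4-point dst (transform type 1) stage,
+// matching the per-output annotations in the code below; add and
+// clip_s16 as in the dct path (variable names are illustrative):
+//
+//     dst[0] = clip_s16((29*src[0] + 74*src[1] + 84*src[2] + 55*src[3] + add) >> shift);
+//     dst[1] = clip_s16((55*src[0] + 74*src[1] - 29*src[2] - 84*src[3] + add) >> shift);
+//     dst[2] = clip_s16((74*src[0]             - 74*src[2] + 74*src[3] + add) >> shift);
+//     dst[3] = clip_s16((84*src[0] - 74*src[1] + 55*src[2] - 29*src[3] + add) >> shift);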
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+.globl ihevc_itrans_recon_4x4_ttype1_av8
+
+.type ihevc_itrans_recon_4x4_ttype1_av8, %function
+
+ihevc_itrans_recon_4x4_ttype1_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ add x4,x4,x4 // src_strd in terms of word16
+
+ mov x8,#29
+ mov x9,#55
+ mov x10,#74
+ mov x11,#84
+ mov v4.4h[0], w8
+ ld1 {v0.4h},[x0],x4 //loading pi2_src 1st row
+ mov v4.4h[1], w9
+ ld1 {v1.4h},[x0],x4 //loading pi2_src 2nd row
+ mov v4.4h[2], w10
+ ld1 {v2.4h},[x0],x4 //loading pi2_src 3rd row
+ mov v4.4h[3], w11
+ ld1 {v3.4h},[x0],x4 //loading pi2_src 4th row
+
+ // first stage computation starts
+ smull v6.4s, v1.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v6.4s, v0.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
+ smlal v6.4s, v3.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+ smlal v6.4s, v2.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+ smull v8.4s, v1.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v8.4s, v0.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
+ smlsl v8.4s, v2.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+    smlsl v8.4s, v3.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]
+
+ smull v10.4s, v0.4h, v4.4h[2] // 74 * pi2_src[0]
+ smlsl v10.4s, v2.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
+ smlal v10.4s, v3.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+
+ smull v12.4s, v2.4h, v4.4h[1] // 55 * pi2_src[2]
+ smlsl v12.4s, v1.4h, v4.4h[2] // 55 * pi2_src[2] - 74 * pi2_src[1]
+ smlsl v12.4s, v3.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smlal v12.4s, v0.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+ sqrshrn v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
+ sqrshrn v29.4h, v8.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
+ sqrshrn v30.4h, v10.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
+ sqrshrn v31.4h, v12.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
+ ld1 {v18.s}[0],[x2],x5
+
+ trn1 v24.4h, v28.4h, v29.4h
+ trn2 v25.4h, v28.4h, v29.4h
+ trn1 v26.4h, v30.4h, v31.4h
+ trn2 v27.4h, v30.4h, v31.4h
+ trn1 v14.2s, v24.2s, v26.2s
+ trn2 v16.2s, v24.2s, v26.2s
+ trn1 v15.2s, v25.2s, v27.2s
+ trn2 v17.2s, v25.2s, v27.2s
+ // output in d14,d15,d16,d17
+ // first stage computation ends
+
+    // second stage: same computation as the 1st stage with the
+    // following register renames
+ // d14 - d0
+ // d15 - d1
+ // d16 - d2
+ // d17 - d3
+ ld1 {v18.s}[1],[x2],x5
+ smull v6.4s, v15.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v6.4s, v14.4h, v4.4h[0] //74 * pi2_src[1] + 29 * pi2_src[0]
+ smlal v6.4s, v17.4h, v4.4h[1] //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
+ smlal v6.4s, v16.4h, v4.4h[3] //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
+
+ smull v8.4s, v15.4h, v4.4h[2] //74 * pi2_src[1]
+ smlal v8.4s, v14.4h, v4.4h[1] //74 * pi2_src[1] + 55 * pi2_src[0]
+ smlsl v8.4s, v16.4h, v4.4h[0] //74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2]
+    smlsl v8.4s, v17.4h, v4.4h[3] //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] - 29 * pi2_src[2] - 84 * pi2_src[3]
+
+ smull v10.4s, v14.4h, v4.4h[2] // 74 * pi2_src[0]
+ smlsl v10.4s, v16.4h, v4.4h[2] // 74 * pi2_src[0] - 74 * pi2_src[2]
+ smlal v10.4s, v17.4h, v4.4h[2] //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
+ ld1 {v19.s}[0],[x2],x5
+
+ smull v12.4s, v16.4h, v4.4h[1] // 55 * pi2_src[2]
+ smlsl v12.4s, v15.4h, v4.4h[2] // - 74 * pi2_src[1] + 55 * pi2_src[2]
+ smlsl v12.4s, v17.4h, v4.4h[0] // - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+ smlal v12.4s, v14.4h, v4.4h[3] //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
+
+    sqrshrn v28.4h, v6.4s,#shift_stage2_idct  // (pi2_out[0] + rounding ) >> shift_stage2_idct
+    sqrshrn v29.4h, v8.4s,#shift_stage2_idct  // (pi2_out[1] + rounding ) >> shift_stage2_idct
+    sqrshrn v30.4h, v10.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage2_idct
+    sqrshrn v31.4h, v12.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage2_idct
+ ld1 {v19.s}[1],[x2],x5
+ trn1 v24.4h, v28.4h, v29.4h
+ trn2 v25.4h, v28.4h, v29.4h
+ trn1 v26.4h, v30.4h, v31.4h
+ trn2 v27.4h, v30.4h, v31.4h
+ trn1 v0.2s, v24.2s, v26.2s
+ trn2 v2.2s, v24.2s, v26.2s
+ trn1 v1.2s, v25.2s, v27.2s
+ trn2 v3.2s, v25.2s, v27.2s
+ // output in d0,d1,d2,d3
+ // second stage computation ends
+
+ // loading pred
+ mov v0.d[1],v1.d[0]
+ mov v2.d[1],v3.d[0]
+
+ uaddw v0.8h, v0.8h , v18.8b // pi2_out(16bit) + pu1_pred(8bit)
+ sqxtun v0.8b, v0.8h // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+ uaddw v2.8h, v2.8h , v19.8b // pi2_out(16bit) + pu1_pred(8bit)
+ sqxtun v1.8b, v2.8h // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
+
+ // storing destination
+ st1 {v0.s}[0],[x3],x6
+ st1 {v0.s}[1],[x3],x6
+ st1 {v1.s}[0],[x3],x6
+ st1 {v1.s}[1],[x3],x6
+
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_itrans_recon_8x8.s b/common/arm64/ihevc_itrans_recon_8x8.s
new file mode 100644
index 0000000..332677e
--- /dev/null
+++ b/common/arm64/ihevc_itrans_recon_8x8.s
@@ -0,0 +1,1038 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// * ihevc_itrans_recon_8x8.s
+// *
+// * @brief
+// * contains function definitions for single stage inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// * - ihevc_itrans_recon_8x8()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform and reconstruction for 8x8
+// * input block
+// *
+// * @par description:
+// * performs inverse transform and adds the prediction data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// * input 8x8 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 8x8 buffer for storing inverse transform
+// * 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 8x8 block
+// *
+// * @param[out] pu1_dst
+// * output 8x8 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] shift
+// * output shift
+// *
+// * @param[in] zero_cols
+// * zero columns in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+
+//void ihevc_itrans_recon_8x8(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+//                             word32 zero_cols,
+// word32 zero_rows )
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// src_strd
+// pred_strd
+// dst_strd
+// zero_cols
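+
+// a minimal C sketch of the even/odd (partial butterfly) decomposition
+// annotated throughout this file; cos1..cos4 and sin1..sin3 are the
+// 8-point constants loaded from g_ai2_ihevc_trans_8_transpose
+// (numerically {89, 75, 50, 18} for the odd part and {64, 83, 36} for
+// the even part in HEVC; variable names are illustrative):
+//
+//     c0 = cos4 * (y0 + y4);         c1 = cos4 * (y0 - y4);
+//     d0 = cos2 * y2 + sin2 * y6;    d1 = sin2 * y2 - cos2 * y6;
+//     a0 = c0 + d0;  a1 = c1 + d1;  a2 = c1 - d1;  a3 = c0 - d0;
+//     b0 = cos1*y1 + cos3*y3 + sin3*y5 + sin1*y7;
+//     b1 = cos3*y1 - sin1*y3 - cos1*y5 - sin3*y7;
+//     b2 = sin3*y1 - cos1*y3 + sin1*y5 + cos3*y7;
+//     b3 = sin1*y1 - sin3*y3 + cos3*y5 - cos1*y7;
+//     x0 = (a0 + b0 + add) >> shift;   x7 = (a0 - b0 + add) >> shift;
+//     x1 = (a1 + b1 + add) >> shift;   x6 = (a1 - b1 + add) >> shift;
+//     x2 = (a2 + b2 + add) >> shift;   x5 = (a2 - b2 + add) >> shift;
+//     x3 = (a3 + b3 + add) >> shift;   x4 = (a3 - b3 + add) >> shift;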
+
+
+
+.text
+.align 4
+.include "ihevc_neon_macros.s"
+
+
+
+.set width_x_size_x5 , 40
+.set width_x_size_x2 , 32
+.set shift_stage1_idct , 7
+.set shift_stage2_idct , 12
+
+.globl ihevc_itrans_recon_8x8_av8
+
+.extern g_ai2_ihevc_trans_8_transpose
+
+.type ihevc_itrans_recon_8x8_av8, %function
+
+ihevc_itrans_recon_8x8_av8:
+////register usage.extern - loading and until idct of columns
+//// cosine constants - d0
+//// sine constants - d1
+//// row 0 first half - d2 - y0
+//// row 1 first half - d6 - y1
+//// row 2 first half - d3 - y2
+//// row 3 first half - d7 - y3
+//// row 4 first half - d10 - y4
+//// row 5 first half - d14 - y5
+//// row 6 first half - d11 - y6
+//// row 7 first half - d15 - y7
+
+//// row 0 second half - d4 - y0
+//// row 1 second half - d8 - y1
+//// row 2 second half - d5 - y2
+//// row 3 second half - d9 - y3
+//// row 4 second half - d12 - y4
+//// row 5 second half - d16 - y5
+//// row 6 second half - d13 - y6
+//// row 7 second half - d17 - y7
+
+ //// copy the input pointer to another register
+ //// step 1 : load all constants
+ // stmfd sp!,{x4-x12,x14}
+
+ ldr w11, [sp] // zero rows
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x12, x7 // zero columns
+ mov x8, x5 // prediction stride
+ mov x7, x6 // destination stride
+ mov x6, x4 // src stride
+    lsl x6, x6, #1 // src_strd * sizeof(word16)
+ add x9,x0,x6, lsl #1 // 2 rows
+
+ add x10,x6,x6, lsl #1 // 3 rows
+
+ sub x10,x10, #8 // - 4 cols * sizeof(word16)
+ sub x5,x6, #8 // src_strd - 4 cols * sizeof(word16)
+
+ adrp x14, :got:g_ai2_ihevc_trans_8_transpose
+ ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_8_transpose]
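+    // x14 now holds the run-time address of the 8-point transform table,
+    // resolved position-independently through the GOT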
+
+ ld1 {v0.4h, v1.4h},[x14] ////d0,d1 are used for storing the constant data
+
+ ////step 2 load all the input data
+    ////step 3 operate first 4 columns at a time
+
+ and x11,x11,#0xff
+ and x12,x12,#0xff
+
+ cmp x11,#0xf0
+ bge skip_last4_rows
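+
+    // zero_rows marks rows of pi2_src that are entirely zero (analogous
+    // to zero_cols); the masked byte is >= 0xf0 exactly when bits 4-7
+    // are all set, i.e. the last four rows carry no coefficients. a
+    // minimal C sketch of the same test (assuming bit i set <=> row i
+    // is all zero):
+    //
+    //     if((zero_rows & 0xf0) == 0xf0) { /* only rows 0..3 contribute */ }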
+
+
+ ld1 {v2.4h},[x0],#8
+ ld1 {v3.4h},[x9],#8
+ ld1 {v4.4h},[x0],x5
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ ld1 {v5.4h},[x9],x5
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ ld1 {v6.4h},[x0],#8
+ ld1 {v7.4h},[x9],#8
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ ld1 {v8.4h},[x0],x10
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ ld1 {v9.4h},[x9],x10
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ ld1 {v10.4h},[x0],#8
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ ld1 {v11.4h},[x9],#8
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ ld1 {v12.4h},[x0],x5
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ ld1 {v13.4h},[x9],x5
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ ld1 {v14.4h},[x0],#8
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ ld1 {v15.4h},[x9],#8
+ smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ ld1 {v16.4h},[x0],x10
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ ld1 {v17.4h},[x9],x10
+
+    ///* the following loads were used when the input is not aligned */
+//// vld1.16 d2,[x0]!
+//// vld1.16 d3,[x2]!
+//// vld1.16 d4,[x0]!
+//// vld1.16 d5,[x2]!
+//// vld1.16 d6,[x0]!
+//// vld1.16 d7,[x2]!
+//// vld1.16 d8,[x0],x3
+//// vld1.16 d9,[x2],x3
+//// vld1.16 d10,[x0]!
+//// vld1.16 d11,[x2]!
+//// vld1.16 d12,[x0]!
+//// vld1.16 d13,[x2]!
+//// vld1.16 d14,[x0]!
+//// vld1.16 d15,[x2]!
+//// vld1.16 d16,[x0],x3
+//// vld1.16 d17,[x2],x3
+
+
+
+
+ smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+ add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+ sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+ add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
+ sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)
+
+ add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
+ sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)
+
+ add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
+ sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)
+
+ add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
+ sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)
+
+ sqrshrn v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+ b last4_cols
+
+
+
+skip_last4_rows:
+
+
+
+ ld1 {v2.4h},[x0],#8
+ ld1 {v3.4h},[x9],#8
+ ld1 {v4.4h},[x0],x5
+ ld1 {v5.4h},[x9],x5
+ ld1 {v6.4h},[x0],#8
+ ld1 {v7.4h},[x9],#8
+ ld1 {v8.4h},[x0],x10
+ ld1 {v9.4h},[x9],x10
+
+
+
+ movi v12.4h, #0
+ movi v13.4h, #0
+ movi v16.4h, #0
+ movi v17.4h, #0
+
+
+
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+
+
+ add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+ sub v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+ add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
+ sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)
+
+ add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
+ sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)
+
+ add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
+ sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)
+
+ add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
+ sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)
+
+ sqrshrn v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+
+
+last4_cols:
+
+
+ cmp x12,#0xf0
+ bge skip_last4cols
+
+ smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
+ smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+ smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+
+ smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+
+ add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7)
+ sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6)
+
+ add v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0)
+ sub v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7)
+
+ add v24.4s, v22.4s , v28.4s //// a2 + b2(part of e2)
+ sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5)
+
+ add v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1)
+ sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6)
+
+ add v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3)
+ sub v30.4s, v12.4s , v30.4s //// a3 - b3(part of x4)
+
+ sqrshrn v4.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v17.4h, v8.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v5.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v16.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v8.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v13.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v9.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
+ sqrshrn v12.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
+ b end_skip_last4cols
+
+
+
+skip_last4cols:
+
+ umov x15,v25.d[0]
+
+    trn1 v25.4h, v2.4h, v6.4h
+    trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing
+
+    trn1 v27.4h, v3.4h, v7.4h
+    trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing
+
+    trn1 v6.2s, v29.2s, v31.2s
+    trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
+    trn1 v2.2s, v25.2s, v27.2s
+    trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
+
+
+    trn1 v25.4h, v10.4h, v14.4h
+    trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing
+
+    trn1 v27.4h, v11.4h, v15.4h
+    trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing
+
+    trn1 v10.2s, v25.2s, v27.2s
+    trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
+    trn1 v14.2s, v29.2s, v31.2s
+    trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
+
+ mov v25.d[0],x15
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
+
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+
+
+
+ sub v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ add v4.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v2.4s, v4.4s , v24.4s
+
+ sub v6.4s, v4.4s , v24.4s
+
+ add v8.4s, v22.4s , v30.4s
+
+ sub v24.4s, v22.4s , v30.4s
+
+ sqrshrn v5.4h, v8.4s,#shift_stage2_idct
+ sqrshrn v2.4h, v2.4s,#shift_stage2_idct
+ sqrshrn v9.4h, v6.4s,#shift_stage2_idct
+ sqrshrn v6.4h, v24.4s,#shift_stage2_idct
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v30.4s, v22.4s , v28.4s
+
+ sub v24.4s, v22.4s , v28.4s
+
+ add v28.4s, v18.4s , v26.4s
+
+ sub v22.4s, v18.4s , v26.4s
+ sqrshrn v4.4h, v30.4s,#shift_stage2_idct
+ sqrshrn v7.4h, v24.4s,#shift_stage2_idct
+ sqrshrn v3.4h, v28.4s,#shift_stage2_idct
+ sqrshrn v8.4h, v22.4s,#shift_stage2_idct
+
+
+
+ umov x19,v25.d[0]
+ umov x20,v25.d[1]
+
+ trn1 v27.4h, v2.4h, v3.4h
+ trn2 v29.4h, v2.4h, v3.4h
+ trn1 v25.4h, v4.4h, v5.4h
+ trn2 v31.4h, v4.4h, v5.4h
+
+ trn1 v2.2s, v27.2s, v25.2s
+ trn2 v4.2s, v27.2s, v25.2s
+ trn1 v3.2s, v29.2s, v31.2s
+ trn2 v5.2s, v29.2s, v31.2s
+
+ trn1 v27.4h, v6.4h, v7.4h
+ trn2 v29.4h, v6.4h, v7.4h
+ trn1 v25.4h, v8.4h, v9.4h
+ trn2 v31.4h, v8.4h, v9.4h
+
+ trn1 v6.2s, v27.2s, v25.2s
+ trn2 v8.2s, v27.2s, v25.2s
+ trn1 v7.2s, v29.2s, v31.2s
+ trn2 v9.2s, v29.2s, v31.2s
+
+ mov v25.d[0],x19
+ mov v25.d[1],x20
+
+ smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+
+ smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+
+ add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+
+
+ add x5,x8,x8, lsl #1 //
+
+
+ add x0,x3,x7, lsl #1 // x0 points to 3rd row of dest data
+
+
+ add x10,x7,x7, lsl #1 //
+
+ // swapping v3 and v6
+ mov v31.d[0], v3.d[0]
+ mov v3.d[0], v6.d[0]
+ mov v6.d[0], v31.d[0]
+
+ // swapping v5 and v8
+ mov v31.d[0], v5.d[0]
+ mov v5.d[0], v8.d[0]
+ mov v8.d[0], v31.d[0]
+
+
+ sub v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
+ add v12.4s, v20.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v0.4s, v12.4s , v24.4s
+
+
+ sub v24.4s, v12.4s , v24.4s
+
+
+ add v12.4s, v22.4s , v30.4s
+
+
+ sub v14.4s, v22.4s , v30.4s
+
+ sqrshrn v10.4h, v0.4s,#shift_stage2_idct
+ sqrshrn v17.4h, v24.4s,#shift_stage2_idct
+ sqrshrn v13.4h, v12.4s,#shift_stage2_idct
+ sqrshrn v14.4h, v14.4s,#shift_stage2_idct
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v0.4s, v22.4s , v28.4s
+
+
+ sub v24.4s, v22.4s , v28.4s
+
+
+ add v28.4s, v18.4s , v26.4s
+
+
+ sub v26.4s, v18.4s , v26.4s
+ ld1 {v18.8b},[x2],x8
+
+ sqrshrn v12.4h, v0.4s,#shift_stage2_idct
+ ld1 {v20.8b},[x2],x5
+
+
+ sqrshrn v15.4h, v24.4s,#shift_stage2_idct
+ ld1 {v19.8b},[x2],x8
+
+
+
+
+ sqrshrn v11.4h, v28.4s,#shift_stage2_idct
+ ld1 {v22.8b},[x4],x8
+
+
+
+
+ sqrshrn v16.4h, v26.4s,#shift_stage2_idct
+ ld1 {v21.8b},[x2],x5
+
+
+ b pred_buff_addition
+end_skip_last4cols:
+
+
+ umov x19,v25.d[0]
+ umov x20,v25.d[1]
+
+///* now that the column idct is done, transpose so that the row idct can be done efficiently (step 5) */
+    trn1 v27.4h, v2.4h, v6.4h
+    trn2 v29.4h, v2.4h, v6.4h       ////[x3,x1],[x2,x0] first quadrant transposing
+    trn1 v25.4h, v3.4h, v7.4h
+    trn2 v31.4h, v3.4h, v7.4h       ////[x3,x1],[x2,x0] first quadrant transposing
+
+    trn1 v2.2s, v27.2s, v25.2s
+    trn2 v3.2s, v27.2s, v25.2s      ////x0,x1,x2,x3 first quadrant transposing continued.....
+    trn1 v6.2s, v29.2s, v31.2s
+    trn2 v7.2s, v29.2s, v31.2s      ////x0,x1,x2,x3 first quadrant transposing continued.....
+
+    trn1 v27.4h, v4.4h, v8.4h
+    trn2 v29.4h, v4.4h, v8.4h       ////[x3,x1],[x2,x0] second quadrant transposing
+    trn1 v25.4h, v5.4h, v9.4h
+    trn2 v31.4h, v5.4h, v9.4h       ////[x3,x1],[x2,x0] second quadrant transposing
+
+    trn1 v4.2s, v27.2s, v25.2s
+    trn2 v5.2s, v27.2s, v25.2s      ////x0,x1,x2,x3 second quadrant transposing continued.....
+    trn1 v8.2s, v29.2s, v31.2s
+    trn2 v9.2s, v29.2s, v31.2s      ////x0,x1,x2,x3 second quadrant transposing continued.....
+
+    trn1 v27.4h, v10.4h, v14.4h
+    trn2 v29.4h, v10.4h, v14.4h     ////[x7,x5],[x6,x4] third quadrant transposing
+    trn1 v25.4h, v11.4h, v15.4h
+    trn2 v31.4h, v11.4h, v15.4h     ////[x7,x5],[x6,x4] third quadrant transposing
+
+    trn1 v10.2s, v27.2s, v25.2s
+    trn2 v11.2s, v27.2s, v25.2s     ////x4,x5,x6,x7 third quadrant transposing continued.....
+    trn1 v14.2s, v29.2s, v31.2s
+    trn2 v15.2s, v29.2s, v31.2s     ////x4,x5,x6,x7 third quadrant transposing continued.....
+
+    trn1 v27.4h, v12.4h, v16.4h
+    trn2 v29.4h, v12.4h, v16.4h     ////[x7,x5],[x6,x4] fourth quadrant transposing
+    trn1 v25.4h, v13.4h, v17.4h
+    trn2 v31.4h, v13.4h, v17.4h     ////[x7,x5],[x6,x4] fourth quadrant transposing
+
+    trn1 v12.2s, v27.2s, v25.2s
+    trn2 v13.2s, v27.2s, v25.2s     ////x4,x5,x6,x7 fourth quadrant transposing continued.....
+    trn1 v16.2s, v29.2s, v31.2s
+    trn2 v17.2s, v29.2s, v31.2s     ////x4,x5,x6,x7 fourth quadrant transposing continued.....
+
+ mov v25.d[0],x19
+ mov v25.d[1],x20
+
+    ////step6: operate on first four rows and find their idct
+    ////register usage - storing and idct of rows
+//// cosine constants - d0
+//// sine constants - d1
+//// element 0 first four - d2 - y0
+//// element 1 first four - d6 - y1
+//// element 2 first four - d3 - y2
+//// element 3 first four - d7 - y3
+//// element 4 first four - d4 - y4
+//// element 5 first four - d8 - y5
+//// element 6 first four - d5 - y6
+//// element 7 first four - d9 - y7
+//// element 0 second four - d10 - y0
+//// element 1 second four - d14 - y1
+//// element 2 second four - d11 - y2
+//// element 3 second four - d15 - y3
+//// element 4 second four - d12 - y4
+//// element 5 second four - d16 - y5
+//// element 6 second four - d13 - y6
+//// element 7 second four - d17 - y7
+
+ //// map between first kernel code seq and current
+//// d2 -> d2
+//// d6 -> d6
+//// d3 -> d3
+//// d7 -> d7
+//// d10 -> d4
+//// d14 -> d8
+//// d11 -> d5
+//// d15 -> d9
+//// q3 -> q3
+//// q5 -> q2
+//// q7 -> q4
+
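+    //// For reference, the even/odd butterfly computed below for each 4-sample
+    //// group can be sketched in C (a rough sketch of the arithmetic annotated
+    //// in the instruction comments below, not the exact fixed-point reference):
+    ////
+    ////   c0 = y0*cos4 + y4*cos4;          c1 = y0*cos4 - y4*cos4;
+    ////   d0 = y2*cos2 + y6*sin2;          d1 = y2*sin2 - y6*cos2;
+    ////   a0 = c0 + d0;  a3 = c0 - d0;     a1 = c1 + d1;  a2 = c1 - d1;
+    ////   b0 = y1*cos1 + y3*cos3 + y5*sin3 + y7*sin1;   /* b1..b3 likewise */
+    ////   x0 = rnd_shift(a0 + b0);  x7 = rnd_shift(a0 - b0);  /* ... x1..x6 */
+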
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+
+ smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+ sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v2.4s, v4.4s , v24.4s
+
+ sub v6.4s, v4.4s , v24.4s
+
+ add v8.4s, v22.4s , v30.4s
+
+ sub v24.4s, v22.4s , v30.4s
+
+ sqrshrn v5.4h, v8.4s,#shift_stage2_idct
+ sqrshrn v2.4h, v2.4s,#shift_stage2_idct
+ sqrshrn v9.4h, v6.4s,#shift_stage2_idct
+ sqrshrn v6.4h, v24.4s,#shift_stage2_idct
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v30.4s, v22.4s , v28.4s
+
+ sub v24.4s, v22.4s , v28.4s
+
+ add v28.4s, v18.4s , v26.4s
+
+ sub v22.4s, v18.4s , v26.4s
+ sqrshrn v4.4h, v30.4s,#shift_stage2_idct
+ sqrshrn v7.4h, v24.4s,#shift_stage2_idct
+ sqrshrn v3.4h, v28.4s,#shift_stage2_idct
+ sqrshrn v8.4h, v22.4s,#shift_stage2_idct
+
+
+
+ umov x19,v25.d[0]
+ umov x20,v25.d[1]
+
+ trn1 v27.4h, v2.4h, v3.4h
+ trn2 v29.4h, v2.4h, v3.4h
+ trn1 v25.4h, v4.4h, v5.4h
+ trn2 v31.4h, v4.4h, v5.4h
+
+ trn1 v2.2s, v27.2s, v25.2s
+ trn2 v4.2s, v27.2s, v25.2s
+ trn1 v3.2s, v29.2s, v31.2s
+ trn2 v5.2s, v29.2s, v31.2s
+
+ trn1 v27.4h, v6.4h, v7.4h
+ trn2 v29.4h, v6.4h, v7.4h
+ trn1 v25.4h, v8.4h, v9.4h
+ trn2 v31.4h, v8.4h, v9.4h
+
+ trn1 v6.2s, v27.2s, v25.2s
+ trn2 v8.2s, v27.2s, v25.2s
+ trn1 v7.2s, v29.2s, v31.2s
+ trn2 v9.2s, v29.2s, v31.2s
+
+ mov v25.d[0],x19
+ mov v25.d[1],x20
+
+
+
+ smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+
+ add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+ smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+
+ add x5,x8,x8, lsl #1 //
+ smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+
+ add x0,x3,x7, lsl #1 // x0 points to 3rd row of dest data
+ smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ add x10,x7,x7, lsl #1 //
+ smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+
+
+ smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+
+ // swapping v3 and v6
+ mov v31.d[0], v3.d[0]
+ mov v3.d[0], v6.d[0]
+ mov v6.d[0], v31.d[0]
+
+ smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ // swapping v5 and v8
+ mov v31.d[0], v5.d[0]
+ mov v5.d[0], v8.d[0]
+ mov v8.d[0], v31.d[0]
+
+ smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+ sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
+ add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v0.4s, v12.4s , v24.4s
+
+
+ sub v24.4s, v12.4s , v24.4s
+
+
+ add v12.4s, v22.4s , v30.4s
+
+
+ sub v14.4s, v22.4s , v30.4s
+
+ sqrshrn v10.4h, v0.4s,#shift_stage2_idct
+ sqrshrn v17.4h, v24.4s,#shift_stage2_idct
+ sqrshrn v13.4h, v12.4s,#shift_stage2_idct
+ sqrshrn v14.4h, v14.4s,#shift_stage2_idct
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v0.4s, v22.4s , v28.4s
+
+
+ sub v24.4s, v22.4s , v28.4s
+
+
+ add v28.4s, v18.4s , v26.4s
+
+
+ sub v26.4s, v18.4s , v26.4s
+ ld1 {v18.8b},[x2],x8
+
+ sqrshrn v12.4h, v0.4s,#shift_stage2_idct
+ ld1 {v20.8b},[x2],x5
+
+
+ sqrshrn v15.4h, v24.4s,#shift_stage2_idct
+ ld1 {v19.8b},[x2],x8
+
+
+
+
+ sqrshrn v11.4h, v28.4s,#shift_stage2_idct
+ ld1 {v22.8b},[x4],x8
+
+
+
+
+ sqrshrn v16.4h, v26.4s,#shift_stage2_idct
+ ld1 {v21.8b},[x2],x5
+
+
+
+
+pred_buff_addition:
+
+ umov x19,v25.d[0]
+ umov x20,v25.d[1]
+
+ trn1 v27.4h, v10.4h, v11.4h
+ trn2 v29.4h, v10.4h, v11.4h
+ trn1 v25.4h, v12.4h, v13.4h
+ trn2 v31.4h, v12.4h, v13.4h
+
+ trn1 v10.2s, v27.2s, v25.2s
+ trn2 v12.2s, v27.2s, v25.2s
+ trn1 v11.2s, v29.2s, v31.2s
+ trn2 v13.2s, v29.2s, v31.2s
+
+ trn1 v27.4h, v14.4h, v15.4h
+ trn2 v29.4h, v14.4h, v15.4h
+ trn1 v25.4h, v16.4h, v17.4h
+ trn2 v31.4h, v16.4h, v17.4h
+
+ trn1 v14.2s, v27.2s, v25.2s
+ trn2 v16.2s, v27.2s, v25.2s
+ trn1 v15.2s, v29.2s, v31.2s
+ trn2 v17.2s, v29.2s, v31.2s
+
+
+ mov v25.d[0],x19
+ mov v25.d[1],x20
+
+
+ ld1 {v24.8b},[x4],x5
+ ld1 {v23.8b},[x4],x8
+ ld1 {v25.8b},[x4],x5
+ mov v2.d[1], v3.d[0]
+ mov v4.d[1], v5.d[0]
+ mov v6.d[1], v7.d[0]
+ mov v8.d[1], v9.d[0]
+ uaddw v2.8h, v2.8h , v18.8b
+ uaddw v4.8h, v4.8h , v22.8b
+ uaddw v6.8h, v6.8h , v20.8b
+ uaddw v8.8h, v8.8h , v24.8b
+
+ // swapping v11 and v14
+ mov v31.d[0], v11.d[0]
+ mov v11.d[0], v14.d[0]
+ mov v14.d[0], v31.d[0]
+
+ // swapping v13 and v16
+ mov v31.d[0], v13.d[0]
+ mov v13.d[0], v16.d[0]
+ mov v16.d[0], v31.d[0]
+// row values stored in the q registers:
+
+//q1 :x0
+//q3: x1
+//q2: x2
+//q4: x3
+//q5: x4
+//q7: x5
+//q6: x6
+//q8: x7
+
+
+///// adding the prediction buffer: load the prediction data and add it to the
+///// idct output (recon = pred + residue)
+
+ mov v10.d[1], v11.d[0]
+ mov v12.d[1], v13.d[0]
+ mov v14.d[1], v15.d[0]
+ mov v16.d[1], v17.d[0]
+ uaddw v10.8h, v10.8h , v19.8b
+ sqxtun v2.8b, v2.8h
+ uaddw v14.8h, v14.8h , v21.8b
+ sqxtun v4.8b, v4.8h
+ uaddw v12.8h, v12.8h , v23.8b
+ sqxtun v6.8b, v6.8h
+ uaddw v16.8h, v16.8h , v25.8b
+ sqxtun v8.8b, v8.8h
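+
+    // note: each uaddw above widens the 8-bit prediction samples and adds them
+    // to the 16-bit idct output, and the following sqxtun saturates the sums
+    // back to unsigned 8 bits, i.e. recon = clip_u8(residue + pred)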
+
+
+
+
+
+
+
+ st1 {v2.8b},[x3],x7
+ sqxtun v10.8b, v10.8h
+ st1 {v6.8b},[x3],x10
+ sqxtun v14.8b, v14.8h
+ st1 {v4.8b},[x0],x7
+ sqxtun v12.8b, v12.8h
+ st1 {v8.8b},[x0],x10
+ sqxtun v16.8b, v16.8h
+
+
+
+
+
+
+
+ st1 {v10.8b},[x3],x7
+ st1 {v14.8b},[x3],x10
+ st1 {v12.8b},[x0],x7
+ st1 {v16.8b},[x0],x10
+
+
+
+
+ // ldmfd sp!,{x4-x12,pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_mem_fns.s b/common/arm64/ihevc_mem_fns.s
new file mode 100644
index 0000000..6619c6c
--- /dev/null
+++ b/common/arm64/ihevc_mem_fns.s
@@ -0,0 +1,280 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_mem_fns_neon.s
+// *
+// * @brief
+// *  Contains function definitions for memory manipulation
+// *
+// * @author
+// *  Naveen SR
+// *
+// * @par List of Functions:
+// *  - ihevc_memcpy()
+// *  - ihevc_memset_mul_8()
+// *  - ihevc_memset_16bit_mul_8()
+// *
+// * @remarks
+// *  None
+// *
+// *******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  memcpy of a 1d array
+//*
+//* @par Description:
+//*  Copies 8-bit data from source to destination, for a byte count that is a
+//*  multiple of 8 (e.g. 8, 16 or 32 bytes)
+//*
+//* @param[in] pu1_dst
+//*  UWORD8 pointer to the destination
+//*
+//* @param[in] pu1_src
+//*  UWORD8 pointer to the source
+//*
+//* @param[in] num_bytes
+//*  number of bytes to copy
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_memcpy_mul_8(UWORD8 *pu1_dst,
+// UWORD8 *pu1_src,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => *pu1_src
+// x2 => num_bytes
+
+.text
+.p2align 2
+
+
+ .global ihevc_memcpy_mul_8_av8
+.type ihevc_memcpy_mul_8_av8, %function
+
+ihevc_memcpy_mul_8_av8:
+
+LOOP_NEON_MEMCPY_MUL_8:
+ // Memcpy 8 bytes
+ LD1 {v0.8b},[x1],#8
+ ST1 {v0.8b},[x0],#8
+
+ SUBS x2,x2,#8
+ BNE LOOP_NEON_MEMCPY_MUL_8
+ ret
+
+
+
+//*******************************************************************************
+//*/
+//void ihevc_memcpy(UWORD8 *pu1_dst,
+// UWORD8 *pu1_src,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => *pu1_src
+// x2 => num_bytes
+
+
+
+ .global ihevc_memcpy_av8
+.type ihevc_memcpy_av8, %function
+
+ihevc_memcpy_av8:
+ SUBS x2,x2,#8
+ BLT ARM_MEMCPY
+LOOP_NEON_MEMCPY:
+ // Memcpy 8 bytes
+ LD1 {v0.8b},[x1],#8
+ ST1 {v0.8b},[x0],#8
+
+ SUBS x2,x2,#8
+ BGE LOOP_NEON_MEMCPY
+ CMP x2,#-8
+ BEQ MEMCPY_RETURN
+
+ARM_MEMCPY:
+ ADD x2,x2,#8
+
+LOOP_ARM_MEMCPY:
+ LDRB w3,[x1],#1
+ STRB w3,[x0],#1
+ SUBS x2,x2,#1
+ BNE LOOP_ARM_MEMCPY
+MEMCPY_RETURN:
+ ret
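+
+// For reference, a minimal C equivalent of the two copy routines above (a
+// sketch under the same assumptions; ihevc_memcpy_mul_8 additionally assumes
+// num_bytes is a nonzero multiple of 8):
+//
+// void ihevc_memcpy(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD8 num_bytes)
+// {
+//     WORD32 i;
+//     /* the NEON path copies 8 bytes per iteration; LOOP_ARM_MEMCPY covers
+//        the remaining 0..7 bytes one at a time */
+//     for(i = 0; i < num_bytes; i++)
+//         pu1_dst[i] = pu1_src[i];
+// }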
+
+
+
+
+//void ihevc_memset_mul_8(UWORD8 *pu1_dst,
+// UWORD8 value,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => value
+// x2 => num_bytes
+
+.text
+.p2align 2
+
+
+
+ .global ihevc_memset_mul_8_av8
+.type ihevc_memset_mul_8_av8, %function
+
+ihevc_memset_mul_8_av8:
+
+// Assumptions: num_bytes is a nonzero multiple of 8 (typically 8, 16 or 32)
+ dup v0.8b,w1
+LOOP_MEMSET_MUL_8:
+ // Memset 8 bytes
+ ST1 {v0.8b},[x0],#8
+
+ SUBS x2,x2,#8
+ BNE LOOP_MEMSET_MUL_8
+
+ ret
+
+
+
+
+//void ihevc_memset(UWORD8 *pu1_dst,
+// UWORD8 value,
+// UWORD8 num_bytes)
+//**************Variables Vs Registers*************************
+// x0 => *pu1_dst
+// x1 => value
+// x2 => num_bytes
+
+
+
+ .global ihevc_memset_av8
+.type ihevc_memset_av8, %function
+
+ihevc_memset_av8:
+ SUBS x2,x2,#8
+ BLT ARM_MEMSET
+ dup v0.8b,w1
+LOOP_NEON_MEMSET:
+    // Memset 8 bytes
+ ST1 {v0.8b},[x0],#8
+
+ SUBS x2,x2,#8
+ BGE LOOP_NEON_MEMSET
+ CMP x2,#-8
+ BEQ MEMSET_RETURN
+
+ARM_MEMSET:
+ ADD x2,x2,#8
+
+LOOP_ARM_MEMSET:
+ STRB w1,[x0],#1
+ SUBS x2,x2,#1
+ BNE LOOP_ARM_MEMSET
+
+MEMSET_RETURN:
+ ret
+
+
+
+
+//void ihevc_memset_16bit_mul_8(UWORD16 *pu2_dst,
+// UWORD16 value,
+// UWORD8 num_words)
+//**************Variables Vs Registers*************************
+// x0 => *pu2_dst
+// x1 => value
+// x2 => num_words
+
+.text
+.p2align 2
+
+
+
+ .global ihevc_memset_16bit_mul_8_av8
+.type ihevc_memset_16bit_mul_8_av8, %function
+
+ihevc_memset_16bit_mul_8_av8:
+
+// Assumptions: num_words is either 8, 16 or 32
+
+ // Memset 8 words
+ dup v0.8h,w1
+LOOP_MEMSET_16BIT_MUL_8:
+ ST1 {v0.8h},[x0],#16
+
+ SUBS x2,x2,#8
+ BNE LOOP_MEMSET_16BIT_MUL_8
+
+ ret
+
+
+
+
+//void ihevc_memset_16bit(UWORD16 *pu2_dst,
+// UWORD16 value,
+// UWORD8 num_words)
+//**************Variables Vs Registers*************************
+// x0 => *pu2_dst
+// x1 => value
+// x2 => num_words
+
+
+
+ .global ihevc_memset_16bit_av8
+.type ihevc_memset_16bit_av8, %function
+
+ihevc_memset_16bit_av8:
+ SUBS x2,x2,#8
+ BLT ARM_MEMSET_16BIT
+ dup v0.8h,w1
+LOOP_NEON_MEMSET_16BIT:
+ // Memset 8 words
+ ST1 {v0.8h},[x0],#16
+
+ SUBS x2,x2,#8
+ BGE LOOP_NEON_MEMSET_16BIT
+ CMP x2,#-8
+ BEQ MEMSET_16BIT_RETURN
+
+ARM_MEMSET_16BIT:
+ ADD x2,x2,#8
+
+LOOP_ARM_MEMSET_16BIT:
+ STRH w1,[x0],#2
+ SUBS x2,x2,#1
+ BNE LOOP_ARM_MEMSET_16BIT
+
+MEMSET_16BIT_RETURN:
+ ret
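+
+// For reference, a minimal C equivalent of the 16-bit memset above (a sketch;
+// the 8-bit variants earlier in this file follow the same chunk-plus-tail
+// pattern with bytes instead of halfwords):
+//
+// void ihevc_memset_16bit(UWORD16 *pu2_dst, UWORD16 value, UWORD8 num_words)
+// {
+//     WORD32 i;
+//     for(i = 0; i < num_words; i++)
+//         pu2_dst[i] = value;
+// }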
+
+
+
+
+ .section .note.GNU-stack,"",%progbits
+
diff --git a/common/arm64/ihevc_neon_macros.s b/common/arm64/ihevc_neon_macros.s
new file mode 100644
index 0000000..09a1de9
--- /dev/null
+++ b/common/arm64/ihevc_neon_macros.s
@@ -0,0 +1,50 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_neon_macros.s
+//*
+//* @brief
+//* Contains assembly macros
+//*
+//* @author
+//* Naveen SR
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+
+
+.macro push_v_regs
+ stp d8,d9,[sp,#-16]!
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+.endm
+.macro pop_v_regs
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+.endm
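+
+// Note: AAPCS64 requires a callee to preserve only the bottom 64 bits of
+// v8-v15, so saving/restoring d8-d15 (and no other SIMD registers) in the
+// macros above is sufficient.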
+
diff --git a/common/arm64/ihevc_padding.s b/common/arm64/ihevc_padding.s
new file mode 100644
index 0000000..5a33d0a
--- /dev/null
+++ b/common/arm64/ihevc_padding.s
@@ -0,0 +1,523 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+// *******************************************************************************
+// * @file
+// *  ihevc_padding_neon.s
+// *
+// * @brief
+// *  contains function definitions for padding
+// *
+// * @author
+// *  naveen sr
+// *
+// * @par list of functions:
+// *  - ihevc_pad_left_luma()
+// *  - ihevc_pad_left_chroma()
+// *
+// * @remarks
+// *  none
+// *
+// *******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  padding (luma block) at the left of a 2d array
+//*
+//* @par description:
+//*  the left column of a 2d array is replicated pad_size times at the left
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] ht
+//*  integer height of the array
+//*
+//* @param[in] wd
+//*  integer width of the array
+//*
+//* @param[in] pad_size
+//*  integer padding size of the array
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_left_luma == c
+//void ihevc_pad_left_luma(uword8 *pu1_src,
+// word32 src_strd,
+// word32 ht,
+// word32 pad_size)
+//**************variables vs registers*************************
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
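+// For reference, a C sketch of the left padding (mirroring the C reference
+// given with the right-padding routines later in this file):
+//
+// {
+//     word32 row;
+//
+//     for(row = 0; row < ht; row++)
+//     {
+//         memset(pu1_src - pad_size, *pu1_src, pad_size);
+//
+//         pu1_src += src_strd;
+//     }
+// }
+//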
+.text
+.align 4
+
+.globl ihevc_pad_left_luma_av8
+
+.type ihevc_pad_left_luma_av8, %function
+
+ihevc_pad_left_luma_av8:
+
+loop_start_luma_left:
+ // pad size is assumed to be pad_left = 80
+ sub x4,x0,x3
+
+ ldrb w8,[x0]
+ add x0,x0,x1
+ ldrb w9,[x0]
+ add x0,x0,x1
+ ldrb w10,[x0]
+ add x0,x0,x1
+ ldrb w11,[x0]
+ add x0,x0,x1
+
+ dup v0.16b,w8
+ dup v2.16b,w9
+ dup v4.16b,w10
+ dup v6.16b,w11
+
+ add x5,x4,x1
+
+ st1 {v0.16b},[x4],#16 //128/8 = 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4] // 16 bytes store
+
+ add x6,x5,x1
+
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5] //128/8 = 16 bytes store
+
+ add x7,x6,x1
+
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6] //128/8 = 16 bytes store
+
+ subs x2, x2,#4
+
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7] //128/8 = 16 bytes store
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_left store
+
+ bne loop_start_luma_left
+
+ ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  padding (chroma block) at the left of a 2d array
+//*
+//* @par description:
+//*  the left column of a 2d array is replicated pad_size times at the left
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] ht
+//*  integer height of the array
+//*
+//* @param[in] wd
+//*  integer width of the array (each colour component)
+//*
+//* @param[in] pad_size
+//*  integer padding size of the array
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_left_chroma == c
+//void ihevc_pad_left_chroma(uword8 *pu1_src,
+// word32 src_strd,
+// word32 ht,
+// word32 pad_size)
+//{
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+.globl ihevc_pad_left_chroma_av8
+
+.type ihevc_pad_left_chroma_av8, %function
+
+ihevc_pad_left_chroma_av8:
+
+
+loop_start_chroma_left:
+ // pad size is assumed to be pad_left = 80
+ sub x4,x0,x3
+
+ ldrh w8,[x0]
+ add x0,x0,x1
+ ldrh w9,[x0]
+ add x0,x0,x1
+ ldrh w10,[x0]
+ add x0,x0,x1
+ ldrh w11,[x0]
+ add x0,x0,x1
+
+ dup v0.8h,w8
+ dup v2.8h,w9
+ dup v4.8h,w10
+ dup v6.8h,w11
+
+ add x5,x4,x1
+
+ st1 {v0.16b},[x4],#16 //128/8 = 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4] // 16 bytes store
+
+ add x6,x5,x1
+
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5] //128/8 = 16 bytes store
+
+ add x7,x6,x1
+
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6] //128/8 = 16 bytes store
+
+ subs x2, x2,#4
+
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7] //128/8 = 16 bytes store
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_left store
+
+ bne loop_start_chroma_left
+
+ ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  padding (luma block) at the right of a 2d array
+//*
+//* @par description:
+//*  the right column of a 2d array is replicated pad_size times at the right
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] ht
+//*  integer height of the array
+//*
+//* @param[in] wd
+//*  integer width of the array
+//*
+//* @param[in] pad_size
+//*  integer padding size of the array
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_right_luma == c
+//void ihevc_pad_right_luma(uword8 *pu1_src,
+//                   word32 src_strd,
+//                   word32 ht,
+//                   word32 pad_size)
+//{
+//    word32 row;
+//
+//    for(row = 0; row < ht; row++)
+//    {
+//        memset(pu1_src, *(pu1_src - 1), pad_size);
+//
+//        pu1_src += src_strd;
+//    }
+//}
+//
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+.globl ihevc_pad_right_luma_av8
+
+.type ihevc_pad_right_luma_av8, %function
+
+ihevc_pad_right_luma_av8:
+
+
+loop_start_luma_right:
+    // pad size is assumed to be pad_right = 80
+ mov x4,x0
+
+ ldrb w8,[x0, #-1]
+ add x0,x0,x1
+ ldrb w9,[x0, #-1]
+ add x0,x0,x1
+ ldrb w10,[x0, #-1]
+ add x0,x0,x1
+ ldrb w11,[x0, #-1]
+ add x0,x0,x1
+
+ add x5,x4,x1
+ add x6,x5,x1
+ add x7,x6,x1
+
+ dup v0.16b,w8
+ dup v2.16b,w9
+ dup v4.16b,w10
+ dup v6.16b,w11
+
+ st1 {v0.16b},[x4],#16 //128/8 = 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4] // 16 bytes store
+
+
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5] //128/8 = 16 bytes store
+
+ subs x2, x2,#4
+
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6] //128/8 = 16 bytes store
+
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7] //128/8 = 16 bytes store
+
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_right store
+
+
+ bne loop_start_luma_right
+
+ ret
+
+
+
+
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  padding (chroma block) at the right of a 2d array
+//*
+//* @par description:
+//*  the right column of a 2d array is replicated pad_size times at the right
+//*
+//* @param[in] pu1_src
+//*  uword8 pointer to the source
+//*
+//* @param[in] src_strd
+//*  integer source stride
+//*
+//* @param[in] ht
+//*  integer height of the array
+//*
+//* @param[in] wd
+//*  integer width of the array (each colour component)
+//*
+//* @param[in] pad_size
+//*  integer padding size of the array
+//*
+//* @returns
+//*
+//* @remarks
+//*  none
+//*
+//*******************************************************************************
+//*/
+//.if pad_right_chroma == c
+//void ihevc_pad_right_chroma(uword8 *pu1_src,
+// word32 src_strd,
+// word32 ht,
+// word32 pad_size)
+// x0 => *pu1_src
+// x1 => src_strd
+// x2 => ht
+// x3 => pad_size
+
+
+
+.globl ihevc_pad_right_chroma_av8
+
+.type ihevc_pad_right_chroma_av8, %function
+
+ihevc_pad_right_chroma_av8:
+
+
+loop_start_chroma_right:
+    // pad size is assumed to be pad_right = 80
+ mov x4,x0
+
+ ldrh w8,[x0, #-2]
+ add x0,x0,x1
+ ldrh w9,[x0, #-2]
+ add x0,x0,x1
+ ldrh w10,[x0, #-2]
+ add x0,x0,x1
+ ldrh w11,[x0, #-2]
+ add x0,x0,x1
+
+ dup v0.8h,w8
+ dup v2.8h,w9
+ dup v4.8h,w10
+ dup v6.8h,w11
+
+ add x5,x4,x1
+
+ st1 {v0.16b},[x4],#16 //128/8 = 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4],#16 // 16 bytes store
+ st1 {v0.16b},[x4] // 16 bytes store
+
+ add x6,x5,x1
+
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5],#16 //128/8 = 16 bytes store
+ st1 {v2.16b},[x5] //128/8 = 16 bytes store
+
+ add x7,x6,x1
+
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6],#16 //128/8 = 16 bytes store
+ st1 {v4.16b},[x6] //128/8 = 16 bytes store
+
+ subs x2, x2,#4
+
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7],#16 //128/8 = 16 bytes store
+ st1 {v6.16b},[x7] //128/8 = 16 bytes store
+
+    // total of 4 rows * (16*5) = 4 * 80 = 4 * pad_right store
+
+ bne loop_start_chroma_right
+
+ ret
+
+
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_band_offset_chroma.s b/common/arm64/ihevc_sao_band_offset_chroma.s
new file mode 100644
index 0000000..f67a3de
--- /dev/null
+++ b/common/arm64/ihevc_sao_band_offset_chroma.s
@@ -0,0 +1,430 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_band_offset_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for SAO band offset (chroma). Functions
+//*  are coded using NEON intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// WORD32 sao_band_pos_u,
+// WORD32 sao_band_pos_v,
+// WORD8 *pi1_sao_offset_u,
+// WORD8 *pi1_sao_offset_v,
+// WORD32 wd,
+// WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left 40
+//x5 => sao_band_pos_u 44
+//x6 => sao_band_pos_v 48
+//x7 => *pi1_sao_offset_u 52
+//x8 => *pi1_sao_offset_v 56
+//x9 => wd 60
+//x10=> ht 64
+
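+// For reference, the per-pixel band-offset operation implemented below can be
+// sketched in C as follows (a hedged sketch for one 8-bit plane; the NEON code
+// realises the band test via a TBX table lookup, and the SAO_BAND_POS_*_28..31
+// cases handle the four-band window wrapping past band 31):
+//
+//     band_idx = pu1_src[i] >> 3;  /* 32 bands of 8 levels each */
+//     if(((band_idx - sao_band_pos) & 31) < 4)
+//         pu1_src[i] = CLIP3(pu1_src[i] + pi1_sao_offset[((band_idx - sao_band_pos) & 31) + 1], 0, 255);
+//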
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gu1_table_band_idx
+.globl ihevc_sao_band_offset_chroma_av8
+
+ihevc_sao_band_offset_chroma_av8:
+ mov x8,#0
+ mov x9,#0
+ mov x10,#0
+
+ ldr x8,[sp,#0]
+ ldr w9,[sp,#8]
+ ldr w10,[sp,#16]
+
+ push_v_regs
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+
+ mov x15,x4 // pu1_src_top_left 40
+ mov x16,x5 // sao_band_pos_u 44
+ mov x17,x6 // sao_band_pos_v 48
+ mov x19,x7 // pi1_sao_offset_u 52
+ mov x20,x8 // pi1_sao_offset_v 56
+ mov x21,x9 // wd 60
+ mov x22,x10 // ht 64
+
+ MOV x4, x15 //Loads pu1_src_top_left
+ MOV x10, x22 //Loads ht
+
+ MOV x9, x21 //Loads wd
+    MOV x11,x10 //Move the ht to x11 for loop counter
+
+ ADD x12,x0,x9 //pu1_src[row * src_strd + (wd)]
+ ADRP x14, :got:gu1_table_band_idx
+ LDR x14, [x14, #:got_lo12:gu1_table_band_idx]
+
+ SUB x12,x12,#2 //wd-2
+
+SRC_LEFT_LOOP:
+ LDRH w5,[x12] //Load the value
+ ADD x12,x12,x1
+ SUBS x11,x11,#1 //Decrement the loop counter
+ STRH w5,[x2],#2 //Store the value in pu1_src_left pointer
+ BNE SRC_LEFT_LOOP
+
+ MOV x5, x16 //Loads sao_band_pos_u
+ LD1 {v1.8b},[x14],#8 //band_table_u.val[0]
+ ADD x12,x3,x9 //pu1_src_top[wd]
+
+ sub x23,x12,#2
+ LDRH w11,[x23]
+ LD1 {v2.8b},[x14],#8 //band_table_u.val[1]
+ LSL x6,x5,#3 //sao_band_pos_u
+
+ STRH w11,[x4] //store to pu1_src_top_left[0]
+ LD1 {v3.8b},[x14],#8 //band_table_u.val[2]
+ MOV x7, x19 //Loads pi1_sao_offset_u
+
+ SUB x4,x10,#1 //ht-1
+ dup v31.8b,w6 //band_pos_u
+ mul x4, x4, x1 //ht-1 * src_strd
+
+ ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd]
+ LD1 {v4.8b},[x14],#8 //band_table_u.val[3]
+    MOV x11,x9 //Move the wd to x11 for loop counter
+
+SRC_TOP_LOOP: //wd is always multiple of 8
+ LD1 {v0.8b},[x4],#8 //Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS x11,x11,#8 //Decrement the loop counter by 8
+ ST1 {v0.8b},[x3],#8 //Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+
+ LD1 {v30.8b},[x7] //pi1_sao_offset_u load
+ ADD v5.8b, v1.8b , v31.8b //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
+
+ dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset_u[1])
+ ADD v6.8b, v2.8b , v31.8b //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
+
+ dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset_u[2])
+ ADD v7.8b, v3.8b , v31.8b //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
+
+ dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset_u[3])
+ ADD v8.8b, v4.8b , v31.8b //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
+
+ CMP x5,#28
+ dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset_u[4])
+ ADRP x14, :got:gu1_table_band_idx
+ LDR x14, [x14, #:got_lo12:gu1_table_band_idx]
+
+ movi v30.8b, #16 //vdup_n_u8(16)
+ ADD v1.8b, v5.8b , v29.8b //band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
+
+ LD1 {v9.8b},[x14],#8 //band_table_v.val[0]
+ ADD v2.8b, v6.8b , v28.8b //band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
+
+ LD1 {v10.8b},[x14],#8 //band_table_v.val[1]
+ ADD v3.8b, v7.8b , v27.8b //band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
+
+ MOV x6, x17 //Loads sao_band_pos_v
+ ADD v4.8b, v8.8b , v26.8b //band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
+ LSL x11,x6,#3 //sao_band_pos_v
+
+ BLT SAO_BAND_POS_U_0
+
+SAO_BAND_POS_U_28: //case 28
+ cmhs v13.8b, v30.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16))
+ BNE SAO_BAND_POS_U_29
+
+ ORR v4.8b, v4.8b , v13.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_U
+
+SAO_BAND_POS_U_29: //case 29
+ CMP x5,#29
+
+ cmhs v14.8b, v30.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16))
+ BNE SAO_BAND_POS_U_30
+ ORR v3.8b, v3.8b , v14.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+ AND v4.8b, v4.8b , v13.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_U
+
+SAO_BAND_POS_U_30: //case 30
+ CMP x5,#30
+
+ cmhs v15.8b, v30.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16))
+ BNE SAO_BAND_POS_U_31
+ ORR v2.8b, v2.8b , v15.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+ AND v3.8b, v3.8b , v14.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+
+SAO_BAND_POS_U_31: //case 31
+ CMP x5,#31
+ BNE SWITCH_BREAK_U
+
+ cmhs v16.8b, v30.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ ORR v1.8b, v1.8b , v16.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+ AND v2.8b, v2.8b , v15.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+ B SWITCH_BREAK_U
+
+SAO_BAND_POS_U_0:
+ CMP x5,#0 //case 0
+ BNE SWITCH_BREAK_U
+
+ cmhs v16.8b, v30.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ AND v1.8b, v1.8b , v16.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_U:
+ dup v30.8b,w11 //band_pos_v
+ MOV x8, x20 //Loads pi1_sao_offset_v
+
+ LD1 {v11.8b},[x14],#8 //band_table_v.val[2]
+ ADD v13.8b, v9.8b , v30.8b //band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
+
+ LD1 {v12.8b},[x14],#8 //band_table_v.val[3]
+ ADD v14.8b, v10.8b , v30.8b //band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
+
+ LD1 {v25.8b},[x8] //pi1_sao_offset_v load
+ ADD v15.8b, v11.8b , v30.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
+
+ dup v29.8b, v25.8b[1] //vdup_n_u8(pi1_sao_offset_v[1])
+ ADD v16.8b, v12.8b , v30.8b //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
+
+ dup v28.8b, v25.8b[2] //vdup_n_u8(pi1_sao_offset_v[2])
+ ADD v9.8b, v13.8b , v29.8b //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
+
+ dup v27.8b, v25.8b[3] //vdup_n_u8(pi1_sao_offset_v[3])
+ ADD v10.8b, v14.8b , v28.8b //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
+
+ dup v26.8b, v25.8b[4] //vdup_n_u8(pi1_sao_offset_v[4])
+ ADD v11.8b, v15.8b , v27.8b //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
+
+ movi v29.8b, #16 //vdup_n_u8(16)
+ ADD v12.8b, v16.8b , v26.8b //band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
+ AND x12,x9,#0xf
+
+ CMP x6,#28
+ BLT SAO_BAND_POS_V_0
+
+SAO_BAND_POS_V_28: //case 28
+ cmhs v17.8b, v29.8b , v12.8b //vcle_u8(band_table.val[3], vdup_n_u8(16))
+ BNE SAO_BAND_POS_V_29
+ ORR v12.8b, v12.8b , v17.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_29: //case 29
+ CMP x6,#29
+
+ cmhs v18.8b, v29.8b , v11.8b //vcle_u8(band_table.val[2], vdup_n_u8(16))
+ BNE SAO_BAND_POS_V_30
+ ORR v11.8b, v11.8b , v18.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+ AND v12.8b, v12.8b , v17.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_30: //case 30
+ CMP x6,#30
+
+ cmhs v19.8b, v29.8b , v10.8b //vcle_u8(band_table.val[1], vdup_n_u8(16))
+ BNE SAO_BAND_POS_V_31
+ ORR v10.8b, v10.8b , v19.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+ AND v11.8b, v11.8b , v18.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_31: //case 31
+ CMP x6,#31
+ BNE SWITCH_BREAK_V
+
+ cmhs v20.8b, v29.8b , v9.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ ORR v9.8b, v9.8b , v20.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+ AND v10.8b, v10.8b , v19.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+ B SWITCH_BREAK_V
+
+SAO_BAND_POS_V_0:
+ CMP x6,#0 //case 0
+ BNE SWITCH_BREAK_V
+
+ cmhs v20.8b, v29.8b , v9.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ AND v9.8b, v9.8b , v20.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK_V:
+ CMP x9,#16
+ MOV x4,x0 //pu1_src_cpy
+ mov v1.d[1],v2.d[0]
+ mov v2.d[0],v3.d[0]
+ mov v2.d[1],v4.d[0]
+ mov v9.d[1],v10.d[0]
+ mov v10.d[0],v11.d[0]
+ mov v10.d[1],v12.d[0]
+ BLT WIDTH_RESIDUE
+
+WIDTH_LOOP: //Width is assumed to be a multiple of 16
+ MOV x4,x0 //pu1_src_cpy
+ MOV x11,x10 //move ht
+ ADD x5,x4,x1
+
+HEIGHT_LOOP: //unrolled for 4 rows
+
+ ADD x6,x5,x1
+ LD2 {v5.8b, v6.8b},[x4] //vld1q_u8(pu1_src_cpy)
+ ADD x7,x6,x1
+
+ LD2 {v13.8b, v14.8b},[x5] //vld1q_u8(pu1_src_cpy)
+ SUB v7.8b, v5.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ LD2 {v17.8b, v18.8b},[x6] //vld1q_u8(pu1_src_cpy)
+ SUB v8.8b, v6.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ LD2 {v21.8b, v22.8b},[x7] //vld1q_u8(pu1_src_cpy)
+ SUB v15.8b, v13.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ TBX v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ SUB v16.8b, v14.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ TBX v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ SUB v19.8b, v17.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ TBX v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ SUB v20.8b, v18.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ TBX v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ SUB v23.8b, v21.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ ST2 {v5.8b, v6.8b},[x4] //vst1q_u8(pu1_src_cpy, au1_cur_row)
+ SUB v24.8b, v22.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ SUBS x11,x11,#4 //Decrement the ht loop count by 4
+ TBX v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+
+ ST2 {v13.8b, v14.8b},[x5] //vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+ TBX v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ TBX v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ TBX v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+
+ ST2 {v17.8b, v18.8b},[x6],x1 //vst1q_u8(pu1_src_cpy, au1_cur_row)
+
+ ADD x4,x6,x1
+ ST2 {v21.8b, v22.8b},[x7] //vst1q_u8(pu1_src_cpy, au1_cur_row)
+ ADD x5,x4,x1
+
+ BNE HEIGHT_LOOP
+
+ SUB x9,x9,#16 //Decrement the width loop by 16
+ ADD x0,x0,#16
+ CMP x9,#8
+ BGT WIDTH_LOOP
+ BLT END_LOOP
+ MOV x4,x0 //pu1_src_cpy
+
+WIDTH_RESIDUE: //If width is not a multiple of 16
+
+ ADD x5,x4,x1
+ LD2 {v5.8b, v6.8b},[x4] //vld1q_u8(pu1_src_cpy)
+ ADD x6,x5,x1
+
+ ADD x7,x6,x1
+ LD2 {v13.8b, v14.8b},[x5] //vld1q_u8(pu1_src_cpy)
+ SUB v7.8b, v5.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ LD2 {v17.8b, v18.8b},[x6] //vld1q_u8(pu1_src_cpy)
+ SUB v8.8b, v6.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ TBX v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ SUB v15.8b, v13.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ TBX v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ SUB v16.8b, v14.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ LD2 {v21.8b, v22.8b},[x7] //vld1q_u8(pu1_src_cpy)
+ SUB v19.8b, v17.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ TBX v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ SUB v20.8b, v18.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ TBX v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ ZIP1 v28.8b, v5.8b, v6.8b
+ ZIP2 v6.8b, v5.8b, v6.8b
+ mov v5.8b, v28.8b
+
+ TBX v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ SUB v23.8b, v21.8b , v31.8b //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
+
+ ST1 {v5.8b},[x4] //vst1q_u8(pu1_src_cpy, au1_cur_row)
+ ZIP1 v28.8b, v13.8b, v14.8b
+ ZIP2 v14.8b, v13.8b, v14.8b
+ mov v13.8b, v28.8b
+
+ TBX v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ SUB v24.8b, v22.8b , v30.8b //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
+
+ ST1 {v13.8b},[x5] //vst1q_u8(pu1_src_cpy, au1_cur_row)
+ SUBS x10,x10,#4 //Decrement the ht loop count by 4
+
+ TBX v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
+ ZIP1 v28.8b, v17.8b, v18.8b
+ ZIP2 v18.8b, v17.8b, v18.8b
+ mov v17.8b, v28.8b
+
+ TBX v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
+ ST1 {v17.8b},[x6],x1 //vst1q_u8(pu1_src_cpy, au1_cur_row)
+ ZIP1 v28.8b, v21.8b, v22.8b
+ ZIP2 v22.8b, v21.8b, v22.8b
+ mov v21.8b, v28.8b
+
+ ADD x4,x6,x1
+ ST1 {v21.8b},[x7] //vst1q_u8(pu1_src_cpy, au1_cur_row)
+ ADD x5,x4,x1
+
+ BNE WIDTH_RESIDUE
+
+END_LOOP:
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_sao_band_offset_luma.s b/common/arm64/ihevc_sao_band_offset_luma.s
new file mode 100644
index 0000000..099d581
--- /dev/null
+++ b/common/arm64/ihevc_sao_band_offset_luma.s
@@ -0,0 +1,245 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_band_offset_luma.s
+//*
+//* @brief
+//*  Contains function definitions for SAO band offset (luma). Functions are
+//*  coded using NEON intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// WORD32 sao_band_pos,
+// WORD8 *pi1_sao_offset,
+// WORD32 wd,
+// WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => sao_band_pos
+//x6 => *pi1_sao_offset
+//x7 => wd
+//x8 => ht
+
+
+.set WIDE_REFERENCE, 0
+.set ARCHITECTURE, 5
+.set DO1STROUNDING, 0
+
+.include "ihevc_neon_macros.s"
+
+.text
+.p2align 2
+
+.globl gu1_table_band_idx
+.globl ihevc_sao_band_offset_luma_av8
+
+ihevc_sao_band_offset_luma_av8:
+
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+
+ LDR w8,[sp] //Loads ht
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ MOV x9,x8 //Move the ht to x9 for loop counter
+ ADD x10,x0,x7 //pu1_src[row * src_strd + (wd)]
+
+ SUB x10,x10,#1 //wd-1
+ ADRP x14, :got:gu1_table_band_idx
+ LDR x14, [x14, #:got_lo12:gu1_table_band_idx]
+
+SRC_LEFT_LOOP:
+ LDRB w11,[x10]
+ add x10, x10, x1 //Load the value
+ SUBS x9,x9,#1 //Decrement the loop counter
+ STRB w11,[x2],#1 //Store the value in pu1_src_left pointer
+ BNE SRC_LEFT_LOOP
+
+ ADD x9,x3,x7 //pu1_src_top[wd]
+ LD1 {v1.8b},[x14],#8 //band_table.val[0]
+
+ LSL x11,x5,#3
+ LD1 {v2.8b},[x14],#8 //band_table.val[1]
+
+ LDRB w10,[x9,#-1]
+ dup v31.8b,w11 //band_pos
+ SUB x12,x8,#1 //ht-1
+
+ STRB w10,[x4] //store to pu1_src_top_left[0]
+ LD1 {v3.8b},[x14],#8 //band_table.val[2]
+ mul x12, x12, x1 //ht-1 * src_strd
+
+ ADD x4,x12,x0 //pu1_src[(ht - 1) * src_strd]
+ LD1 {v4.8b},[x14],#8 //band_table.val[3]
+ MOV x9,x7 //Move the wd to x9 for loop counter
+
+SRC_TOP_LOOP: //wd is always multiple of 8
+ LD1 {v0.8b},[x4],#8 //Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS x9,x9,#8 //Decrement the loop counter by 8
+ ST1 {v0.8b},[x3],#8 //Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+
+ LD1 {v30.8b},[x6] //pi1_sao_offset load
+ ADD v5.8b, v1.8b , v31.8b //band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
+
+ dup v29.8b, v30.8b[1] //vdup_n_u8(pi1_sao_offset[1])
+ ADD v6.8b, v2.8b , v31.8b //band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
+
+ dup v28.8b, v30.8b[2] //vdup_n_u8(pi1_sao_offset[2])
+ ADD v7.8b, v3.8b , v31.8b //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
+
+ dup v27.8b, v30.8b[3] //vdup_n_u8(pi1_sao_offset[3])
+ ADD v8.8b, v4.8b , v31.8b //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
+
+ dup v26.8b, v30.8b[4] //vdup_n_u8(pi1_sao_offset[4])
+ ADD v1.8b, v5.8b , v29.8b //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
+
+ movi v29.8b, #16 //vdup_n_u8(16)
+ ADD v2.8b, v6.8b , v28.8b //band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))
+
+ CMP x5,#28
+ ADD v3.8b, v7.8b , v27.8b //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
+
+ ADD v4.8b, v8.8b , v26.8b //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
+ BLT SAO_BAND_POS_0
+
+SAO_BAND_POS_28: //case 28
+
+ cmhs v12.8b, v29.8b , v4.8b //vcle_u8(band_table.val[3], vdup_n_u8(16))
+
+ BNE SAO_BAND_POS_29
+ ORR v4.8b, v4.8b , v12.8b //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK
+
+SAO_BAND_POS_29: //case 29
+ CMP x5,#29
+ cmhs v11.8b, v29.8b , v3.8b //vcle_u8(band_table.val[2], vdup_n_u8(16))
+
+ BNE SAO_BAND_POS_30
+ ORR v3.8b, v3.8b , v11.8b //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
+
+ AND v4.8b, v4.8b , v12.8b //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
+ B SWITCH_BREAK
+
+SAO_BAND_POS_30: //case 30
+ CMP x5,#30
+ cmhs v10.8b, v29.8b , v2.8b //vcle_u8(band_table.val[1], vdup_n_u8(16))
+
+ BNE SAO_BAND_POS_31
+ ORR v2.8b, v2.8b , v10.8b //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
+
+ AND v3.8b, v3.8b , v11.8b //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
+ B SWITCH_BREAK
+
+SAO_BAND_POS_31: //case 31
+ CMP x5,#31
+ BNE SWITCH_BREAK
+
+ cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ ORR v1.8b, v1.8b , v9.8b //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
+
+ AND v2.8b, v2.8b , v10.8b //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
+
+SAO_BAND_POS_0:
+ CMP x5,#0 //case 0
+ BNE SWITCH_BREAK
+
+ cmhs v9.8b, v29.8b , v1.8b //vcle_u8(band_table.val[0], vdup_n_u8(16))
+ AND v1.8b, v1.8b , v9.8b //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
+
+SWITCH_BREAK:
+
+ mov v1.d[1],v2.d[0]
+ mov v2.d[0],v3.d[0]
+ mov v2.d[1],v4.d[0]
+
+SWITCH_BREAK_1:
+
+ MOV x4,x0 //pu1_src_cpy
+ MOV x11,x8 //move ht
+ ADD x5,x4,x1
+
+HEIGHT_LOOP:
+ ADD x6,x5,x1
+ LD1 {v13.8b},[x4] //au1_cur_row = vld1_u8(pu1_src_cpy)
+
+ ADD x10,x6,x1
+ LD1 {v15.8b},[x5] //au1_cur_row = vld1_u8(pu1_src_cpy)
+
+ LD1 {v17.8b},[x6] //au1_cur_row = vld1_u8(pu1_src_cpy)
+
+ LD1 {v19.8b},[x10] //au1_cur_row = vld1_u8(pu1_src_cpy)
+ SUB v14.8b, v13.8b , v31.8b //vsub_u8(au1_cur_row, band_pos)
+
+ TBX v13.8b, {v1.16b- v2.16b},v14.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ SUB v16.8b, v15.8b , v31.8b //vsub_u8(au1_cur_row, band_pos)
+
+ TBX v15.8b, {v1.16b- v2.16b},v16.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ SUB v18.8b, v17.8b , v31.8b //vsub_u8(au1_cur_row, band_pos)
+
+ TBX v17.8b, {v1.16b- v2.16b},v18.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ SUB v20.8b, v19.8b , v31.8b //vsub_u8(au1_cur_row, band_pos)
+
+ TBX v19.8b, {v1.16b- v2.16b},v20.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
+ ST1 {v13.8b},[x4],x1 //vst1_u8(pu1_src_cpy, au1_cur_row)
+
+ ST1 {v15.8b},[x5] //vst1_u8(pu1_src_cpy, au1_cur_row)
+ SUBS x11,x11,#4 //Decrement the ht loop count by 4
+
+ ST1 {v17.8b},[x6],x1 //vst1_u8(pu1_src_cpy, au1_cur_row)
+
+ ADD x4,x6,x1
+ ST1 {v19.8b},[x10] //vst1_u8(pu1_src_cpy, au1_cur_row)
+ ADD x5,x4,x1
+
+ BNE HEIGHT_LOOP
+
+ SUBS x7,x7,#8 //Decrement the width loop by 8
+ ADD x0,x0,#8
+ BNE SWITCH_BREAK_1
+
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class0.s b/common/arm64/ihevc_sao_edge_offset_class0.s
new file mode 100644
index 0000000..f7d6621
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class0.s
@@ -0,0 +1,345 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_sao_edge_offset_class0.s
+//*
+//* @brief
+//* Contains function definitions for sample adaptive offset (SAO) edge
+//* offset, class 0. Functions are coded using NEON intrinsics and can be
+//* compiled using ARM RVCT.
+//*
+//* @author
+//* Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset,
+// WORD32 wd,
+// WORD32 ht)
+//
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x7 => *pu1_avail
+//x8 => *pi1_sao_offset
+//x9 => wd
+//x10=> ht
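+//
+//A minimal reference sketch (in C) of the class-0, horizontal-neighbour
+//kernel that the assembly below vectorises, reconstructed from the intrinsic
+//comments in this file; the SIGN and CLIP3 macros are assumptions, not code
+//from this file:
+//
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < wd; col++)
+//     {
+//         WORD32 sign_left  = SIGN(pu1_src[col] - pu1_src[col - 1]);
+//         WORD32 sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
+//         WORD32 edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
+//         pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
+//     }
+//
+//pu1_avail gates the picture-edge columns via au1_mask, and pu1_src_left /
+//pu1_src_top are refreshed for the neighbouring blocks.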
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_av8
+
+ihevc_sao_edge_offset_class0_av8:
+
+
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+
+ LDR x8, [sp] // pi1_sao_offset
+ LDR x9,[sp,#8] //Loads wd
+ AND x9,x9,0xFFFFFFFF // Since argument is passed as WORD32, Using only lower half of x9
+ LDR x10,[sp,#16] //Loads ht
+ AND x10,x10,0xFFFFFFFF // Since argument is passed as WORD32, Using only lower half of x10
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ movi v2.16b, #2 //const_2 = vdupq_n_s8(2)
+ ADD x11,x3,x9 //pu1_src_top[wd]
+ SUB x11,x11,#1
+
+ movi v4.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ LDRB w12,[x11] //pu1_src_top[wd - 1]
+ ADD x11,x11,#1
+
+ movi v6.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ STRB w12,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1]
+
+ MOV x6,x0 //pu1_src_org
+ LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SUB x4,x10,#1 //(ht - 1)
+
+ MOV x12,x9 //Move wd to x12 for loop count
+ LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset)
+ mul x4, x4, x1 //(ht - 1) * src_strd
+
+ ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd]
+
+SRC_TOP_LOOP: //wd is always multiple of 8
+ LD1 {v0.8b},[x4],#8 //Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS x12,x12,#8 //Decrement the loop counter by 8
+ ST1 {v0.8b},[x3],#8 //Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+ ADD x6,x6,#15 //pu1_src_org[16 - 1]
+
+ CMP x9,#16 //Compare wd with 16
+ MOV x3,x2 //pu1_src_left backup to reload later
+    BLT WIDTH_RESIDUE //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+ MOV x8,x9 //move wd to x8 for loop count
+
+WIDTH_LOOP_16:
+ CMP x8,x9 //if(col == wd)
+ BNE AU1_MASK_FF //jump to else part
+ LDRB w12,[x7] //pu1_avail[0]
+ mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ B SKIP_AU1_MASK_FF //Skip the else part
+
+AU1_MASK_FF:
+ MOV x12,#0xFF //move -1 to x12
+ mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+ CMP x8,#16 //If col == 16
+ BNE SKIP_MASKING_IF_NOT16 //If not skip masking
+ LDRB w12,[x7,#1] //pu1_avail[1]
+ mov v8.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+ MOV x12,x0 //pu1_src_cpy = pu1_src
+ MOV x4,x10 //move ht to x4 for loop count
+
+PU1_SRC_LOOP:
+    LDRB w11,[x2] //load pu1_src_left; (ht - row) = 0 on the first iteration, pu1_src_left is incremented later
+ LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ SUB x5,x9,x8 //wd - col
+
+ SUB x14,x10,x4 //ht - row
+ mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ mul x14, x14, x1 //(ht - row) * src_strd
+
+ LD1 {v26.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col)
+
+ LDRB w11,[x2, #1] //II Iteration load pu1_src_left since ht - row + 1 =1
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ LDRB w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+
+ SUB x4,x4,#1
+ mov v28.8b[15], w11 //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd
+ SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ STRB w14,[x2],#1 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ LDRB w11,[x12,#16] //pu1_src_cpy[16]
+ EXT v28.16b, v28.16b , v26.16b,#15 //II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+ SUB x5,x9,x8 //II wd - col
+
+ ADD x12,x12,x1 //Increment the pu1_src pointer by src_strd
+ mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ LDRB w11,[x12,#16] //II pu1_src_cpy[16]
+ EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+ SUB x14,x10,x4 //II ht - row
+
+ cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ SUB x12,x12,x1 //Decrement the pu1_src pointer by src_strd
+
+ mul x14, x14, x1 //II (ht - row) * src_strd
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ ADD x5,x14,x5 //II (ht - row) * src_strd + (wd - col)
+
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ EXT v28.16b, v26.16b , v28.16b,#1 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+ LDRB w14,[x6,x5] //II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
+ SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUBS x4,x4,#1 //Decrement row by 1
+
+ ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+ STRB w14,[x2],#1 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+ Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ SUB v20.16b, v0.16b , v30.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ cmhi v30.16b, v26.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ cmhi v0.16b, v28.16b , v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+// TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ SUB v22.16b, v0.16b , v30.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ TBL v16.16b, {v11.16b},v14.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v0.8h, v26.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
+ ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ SADDW v18.8h, v18.8h , v16.8b
+ TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+// TBL v29.8b, {v10.16b},v29.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+// TBL v17.8b, {v11.16b},v15.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+ Uxtl2 v14.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ TBL v30.16b, {v11.16b},v28.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SADDW2 v14.8h, v14.8h , v16.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v14.8h, v14.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+// TBL v31.8b, {v11.16b},v29.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ UMIN v14.8h, v14.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ SADDW v0.8h, v0.8h , v30.8b
+
+ xtn v19.8b, v14.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ SMAX v0.8h, v0.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ Uxtl2 v28.8h, v26.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ UMIN v0.8h, v0.8h , v6.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v0.8b, v0.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+ SADDW2 v28.8h, v28.8h , v30.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ ST1 {v18.8b, v19.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ UMIN v28.8h, v28.8h , v6.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v1.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 {v0.8b, v1.8b},[x12],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP //If not equal jump to the inner loop
+
+ ADD x0,x0,#16 //pu1_src += 16
+
+ SUBS x8,x8,#16 //Decrement column by 16
+ CMP x8,#8 //Check whether residue remains
+ MOV x2,x3 //Reload pu1_src_left
+ BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 //If not equal jump to width_loop
+ BLT END_LOOPS //Jump to end function
+
+WIDTH_RESIDUE:
+ SUB x6,x6,#15
+ AND x8,x9,#0xF //wd_rem = wd & 0xF
+ CMP x8,#0 //Residue check
+ BEQ END_LOOPS //No Residue jump to end function
+
+ CMP x8,x9 //if(wd_rem == wd)
+ BNE AU1_MASK_FF_RESIDUE //jump to else part
+ LDRB w12,[x7] //pu1_avail[0]
+ mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+    MOV x12,#0xFF //move 0xFF (-1 as a byte) to x12
+ mov v8.8b[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+ LDRB w11,[x7,#1] //pu1_avail[1]
+ SUB x5,x9,#1 //wd - 1
+
+ MOV x4,x10 //move ht to x4 for loop count
+    mov v8.8b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+ MOV x12,x0 //pu1_src_cpy = pu1_src
+
+PU1_SRC_LOOP_RESIDUE:
+ LD1 {v12.16b},[x12] //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ LDRB w11,[x2] //load pu1_src_left
+ mov v14.8b[15], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
+ EXT v14.16b, v14.16b , v12.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
+
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ LDRB w11,[x12,#16] //pu1_src_cpy[16]
+ mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ EXT v14.16b, v12.16b , v14.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
+
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v24.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+ ADD v24.16b, v24.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ TBL v24.16b, {v10.16b},v24.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v25.8b, {v10.16b},v25.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v24.16b, v24.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ NEG v20.16b, v22.16b //sign_left = vnegq_s8(sign_right)
+ EXT v20.16b, v20.16b , v22.16b,#15 //sign_left = vextq_s8(sign_left, sign_left, 15)
+
+ TBL v26.8b, {v11.16b},v24.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v26.8b
+ SMAX v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SUB x14,x10,x4 //ht - row
+ mul x14, x14, x1 //(ht - row) * src_strd
+ ADD x11,x14,x5 //(ht - row) * src_strd + (wd - 1)
+ LDRB w14,[x6, x11] //pu1_src_org[(ht - row) * src_strd + (wd - 1)]
+ STRB w14,[x2],#1 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ ST1 {v28.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ SUBS x4,x4,#1 //Decrement row by 1
+ BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to the pu1_src loop
+
+END_LOOPS:
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
new file mode 100644
index 0000000..d854c62
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
@@ -0,0 +1,483 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_sao_edge_offset_class0_chroma.s
+//*
+//* @brief
+//* Contains function definitions for sample adaptive offset (SAO) edge
+//* offset, class 0, for interleaved CbCr. Functions are coded using NEON
+//* intrinsics and can be compiled using ARM RVCT.
+//*
+//* @author
+//* Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset_u,
+// WORD8 *pi1_sao_offset_v,
+// WORD32 wd,
+//                           WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x7 => *pu1_avail
+//x8 => *pi1_sao_offset_u
+//x5 => *pi1_sao_offset_v
+//x9 => wd
+//x10=> ht
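+//
+//Chroma is interleaved CbCr, so this variant steps two bytes at a time
+//(EXT #14 / #2, LDRH/STRH for the left and top-left columns) and carries
+//separate U and V offset tables. A hedged per-pixel sketch (SIGN and CLIP3
+//are assumed macros; neighbours are the same component, i.e. col +/- 2):
+//
+// edge_idx     = gi1_table_edge_idx[2 + sign_left + sign_right];
+// offset       = (col & 1) ? pi1_sao_offset_v[edge_idx]
+//                          : pi1_sao_offset_u[edge_idx];
+// pu1_src[col] = CLIP3(pu1_src[col] + offset, 0, 255);
+//
+//In the vector code, UZP1/UZP2 split the interleaved edge indices into U and
+//V lanes for the two table lookups, and ZIP1/ZIP2 re-interleave the offsets.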
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class0_chroma_av8
+
+ihevc_sao_edge_offset_class0_chroma_av8:
+
+ ldr x8,[sp,#0]
+ ldr x9,[sp,#8]
+ ldr w10,[sp,#16]
+ ldr w11,[sp,#24]
+
+ push_v_regs
+
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+ stp x25, x26,[sp,#-16]!
+
+ mov x15,x4 // *pu1_src_top_left 40
+ mov x16,x5 // *pu1_src_top_right 44
+ mov x17,x6 // *pu1_src_bot_left 48
+ mov x21,x7 // *pu1_avail 52
+ mov x22,x8 // *pi1_sao_offset_u 56
+ mov x23,x9 // *pi1_sao_offset_v 60
+ mov x24,x10 // wd 64
+ mov x25,x11 // ht 68
+
+ MOV x9, x24 //Loads wd
+
+ MOV x4, x15 //Loads pu1_src_top_left
+ ADD x11,x3,x9 //pu1_src_top[wd]
+
+ MOV x10, x25 //Loads ht
+ movi v2.16b, #2 //const_2 = vdupq_n_s8(2)
+ SUB x20,x11,#2
+ LDRH w12,[x20] //pu1_src_top[wd - 1]
+
+ MOV x7, x21 //Loads pu1_avail
+ movi v4.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ STRH w12,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1]
+
+ MOV x8, x22 //Loads pi1_sao_offset_u
+ movi v6.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ SUB x4,x10,#1 //(ht - 1)
+
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ mul x4, x4, x1 //(ht - 1) * src_strd
+
+ MOV x5, x23 //Loads pi1_sao_offset_v
+ LD1 {v11.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u)
+ ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd]
+
+ MOV x6,x0 //pu1_src_org
+ LD1 {v10.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOV x12,x9 //Move wd to x12 for loop count
+
+SRC_TOP_LOOP: //wd is always multiple of 8
+ LD1 {v0.8b},[x4],#8 //Load pu1_src[(ht - 1) * src_strd + col]
+ SUBS x12,x12,#8 //Decrement the loop counter by 8
+ ST1 {v0.8b},[x3],#8 //Store to pu1_src_top[col]
+ BNE SRC_TOP_LOOP
+ ADD x6,x6,#14 //pu1_src_org[14]
+
+ MOV x3,x2 //pu1_src_left backup to reload later
+ LD1 {v0.8b},[x5] //offset_tbl = vld1_s8(pi1_sao_offset_v)
+ CMP x9,#16 //Compare wd with 16
+
+    BLT WIDTH_RESIDUE //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+ MOV x8,x9 //move wd to x8 for loop count
+
+WIDTH_LOOP_16:
+ CMP x8,x9 //if(col == wd)
+ BNE AU1_MASK_FF //jump to else part
+ LDRB w12,[x7] //pu1_avail[0]
+ mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+ mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+ B SKIP_AU1_MASK_FF //Skip the else part
+
+AU1_MASK_FF:
+ MOV x12,#-1 //move -1 to x12
+ mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF:
+ CMP x8,#16 //If col == 16
+ BNE SKIP_MASKING_IF_NOT16 //If not skip masking
+ LDRB w12,[x7,#1] //pu1_avail[1]
+ mov v8.8b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v8.8b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_MASKING_IF_NOT16:
+ MOV x12,x0 //pu1_src_cpy = pu1_src
+ MOV x4,x10 //move ht to x4 for loop count
+
+PU1_SRC_LOOP:
+    LDRH w11,[x2] //load pu1_src_left; (ht - row) = 0 on the first iteration, pu1_src_left is incremented later
+ LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //SUB x12, x12,#8
+ SUB x5,x9,x8 //wd - col
+
+ SUB x14,x10,x4 //ht - row
+    mov v14.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 7)
+ mul x14, x14, x1 //(ht - row) * src_strd
+
+ LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //LD1 {v31.8b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //SUB x12, x12,#8
+ EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+ SUB x12,x12,x1
+
+ LDRH w11,[x2,#2] //II load pu1_src_left since ht - row =0
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col)
+
+    mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 7)
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ LDRH w14,[x6,x5] //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+ SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB x4,x4,#1
+
+ LDRB w11,[x12,#16] //pu1_src_cpy[16]
+ EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+ mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+ cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ LDRB w11,[x12,#17] //pu1_src_cpy[17]
+ cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+
+ ADD x12,x12,x1
+ mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ LDRB w11,[x12,#16] //II pu1_src_cpy[16]
+
+ EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+ mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+ LDRB w11,[x12,#17] //II pu1_src_cpy[17]
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ SUB x12,x12,x1
+
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+
+ SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+ ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+
+    mov v10.d[1],v10.d[0] //duplicate edge_idx_tbl into the upper half for the 16-byte TBL
+ ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+ TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+// TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov v15.d[0],v14.d[1] //copy the high half of edge_idx for de-interleaving
+    UZP1 v1.8b, v14.8b, v15.8b //even bytes = U edge indices
+    UZP2 v15.8b, v14.8b, v15.8b //odd bytes = V edge indices
+    mov v14.8b, v1.8b
+
+ //mov v11.d[1],v0.d[0]
+ //mov v14.d[1],v15.d[0]
+ SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ ADD v24.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
+
+ Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+    TBL v17.8b, {v0.16b},v15.8b //offset = vtbl1_s8(offset_tbl_v, V edge indices)
+ ADD v24.16b, v24.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ //mov v17.d[0],v16.d[1]
+    ZIP1 v1.8b, v16.8b, v17.8b //re-interleave U and V offsets (low half)
+    ZIP2 v17.8b, v16.8b, v17.8b //re-interleave U and V offsets (high half)
+    mov v16.8b, v1.8b
+ TBL v24.16b, {v10.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ //mov v16.d[1],v17.d[0]
+ SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ //TBL v25.8b, {v10.16b},v25.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ AND v24.16b, v24.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v25.d[0],v24.d[1]
+ UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ UZP1 v1.8b, v24.8b, v25.8b
+ UZP2 v25.8b, v24.8b, v25.8b //II
+ mov v24.8b, v1.8b
+
+ //mov v24.d[1],v25.d[0]
+ SADDW v12.8h, v12.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ TBL v26.8b, {v11.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ SMAX v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ UMIN v12.8h, v12.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ TBL v27.8b, {v0.16b},v25.8b //II
+ xtn v14.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ //mov v27.d[0],v26.d[1]
+ xtn v15.8b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ ZIP1 v1.8b, v26.8b, v27.8b
+ ZIP2 v27.8b, v26.8b, v27.8b //II
+ mov v26.8b, v1.8b
+
+ //mov v26.d[1],v27.d[0]
+ SUB x5,x9,x8 //II wd - col
+ Uxtl v28.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SUB x14,x10,x4 //II ht - row
+
+ mul x14, x14, x1 //II (ht - row) * src_strd
+ SADDW v28.8h, v28.8h , v26.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ ADD x5,x14,x5 //II (ht - row) * src_strd + (wd - col)
+
+ LDRH w14,[x6,x5] //II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
+ SMAX v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ STRH w14,[x2],#2 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
+ UMIN v28.8h, v28.8h , v6.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ //mov v31.2d[0],v30.2d[1]
+ Uxtl2 v30.8h, v30.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ SADDW v30.8h, v30.8h , v27.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ ST1 {v14.8b, v15.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ SMAX v30.8h, v30.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SUBS x4,x4,#1 //Decrement row by 1
+ UMIN v30.8h, v30.8h , v6.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn v29.8b, v30.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 {v28.8b, v29.8b},[x12],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP //If not equal jump to the inner loop
+
+ ADD x0,x0,#16 //pu1_src += 16
+
+ SUBS x8,x8,#16 //Decrement column by 16
+ CMP x8,#8 //Check whether residue remains
+ MOV x2,x3 //Reload pu1_src_left
+ BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 //If not equal jump to width_loop
+ BLT END_LOOPS //Jump to end function
+
+WIDTH_RESIDUE:
+ SUB x6,x6,#14
+ AND x8,x9,#0xF //wd_rem = wd & 0xF
+ CMP x8,#0 //Residue check
+ BEQ END_LOOPS //No Residue jump to end function
+
+ CMP x8,x9 //if(wd_rem == wd)
+ BNE AU1_MASK_FF_RESIDUE //jump to else part
+ LDRB w12,[x7] //pu1_avail[0]
+ mov v8.8b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
+    mov v8.8b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
+ B SKIP_AU1_MASK_FF_RESIDUE //Skip the else part
+
+AU1_MASK_FF_RESIDUE:
+ MOV x12,#-1 //move -1 to x12
+ mov v8.4h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+SKIP_AU1_MASK_FF_RESIDUE:
+ LDRB w12,[x7,#1] //pu1_avail[1]
+    mov v8.8b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    mov v8.8b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+ MOV x12,x0 //pu1_src_cpy = pu1_src
+ MOV x4,x10 //move ht to x4 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+ LDRH w11,[x2] //load pu1_src_left
+ LD1 {v12.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //SUB x12, x12,#8
+ SUB x5,x9,#2 //wd - 2
+
+ SUB x14,x10,x4 //(ht - row)
+    mov v14.4h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 7)
+ LSL x14,x14,#1 //(ht - row) * 2
+
+ LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //LD1 {v31.8b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
+ //SUB x12, x12,#8
+    EXT v14.16b, v14.16b , v12.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+ SUB x12,x12,x1
+
+ LDRH w11,[x2,#2] //II load pu1_src_left
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ mul x14, x14, x1 //(ht - row) * 2 * src_strd
+
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+    mov v28.4h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 7)
+
+ LDRB w11,[x12,#16] //pu1_src_cpy[16]
+ SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2)
+
+ mov v14.8b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+    EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
+
+ LDRB w11,[x12,#17] //pu1_src_cpy[17]
+ cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)]
+
+ mov v14.8b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ ADD x12,x12,x1
+
+ STRH w14,[x2],#2 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+    EXT v14.16b, v12.16b , v14.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+ LDRB w11,[x12,#16] //II pu1_src_cpy[16]
+
+ cmhi v16.16b, v12.16b , v14.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ mov v28.8b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
+
+ LDRB w11,[x12,#17] //II pu1_src_cpy[17]
+ cmhi v18.16b, v14.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ SUB x4,x4,#1 //II Decrement row by 1
+
+ SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ mov v28.8b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
+ SUB x12,x12,x1
+
+ ADD v14.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
+    EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
+
+ ADD v14.16b, v14.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ TBL v14.16b, {v10.16b},v14.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
+
+ cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
+ //TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ AND v14.16b, v14.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v15.d[0],v14.d[1]
+ UZP1 v1.8b, v14.8b, v15.8b
+ UZP2 v15.8b, v14.8b, v15.8b
+ mov v14.8b, v1.8b
+
+ ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
+ TBL v16.8b, {v11.16b},v14.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
+
+ Uxtl v18.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v17.8b, {v0.16b},v15.8b
+ Uxtl v24.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ ZIP1 v1.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.8b, v1.8b
+ TBL v28.16b, {v10.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ //TBL v29.8b, {v10.16b},v29.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ AND v28.16b, v28.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v29.d[0],v28.d[1]
+ SUB x5,x9,#2 //II wd - 2
+ UZP1 v1.8b, v28.8b, v29.8b
+ UZP2 v29.8b, v28.8b, v29.8b //II
+ mov v28.8b, v1.8b
+ SUB x14,x10,x4 //II (ht - row)
+
+ LSL x14,x14,#1 //II (ht - row) * 2
+ TBL v26.8b, {v11.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
+ mul x14, x14, x1 //II (ht - row) * 2 * src_strd
+
+ ADD x5,x14,x5 //II (ht - row) * 2 * src_strd + (wd - 2)
+ TBL v27.8b, {v0.16b},v29.8b //II
+ LDRH w14,[x6, x5] //II pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)]
+
+ ZIP1 v1.8b, v26.8b, v27.8b
+ ZIP2 v27.8b, v26.8b, v27.8b //II
+ mov v26.8b, v1.8b
+ ST1 {v18.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ STRH w14,[x2],#2 //II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
+ SADDW v24.8h, v24.8h , v26.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SUBS x4,x4,#1 //Decrement row by 1
+
+ SMAX v24.8h, v24.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v24.8h, v24.8h , v6.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v28.8b, v24.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ ST1 {v28.8b},[x12],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to the pu1_src loop
+
+END_LOOPS:
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x25, x26,[sp],#16
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class1.s b/common/arm64/ihevc_sao_edge_offset_class1.s
new file mode 100644
index 0000000..8ed6169
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class1.s
@@ -0,0 +1,364 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_sao_edge_offset_class1.s
+//*
+//* @brief
+//* Contains function definitions for sample adaptive offset (SAO) edge
+//* offset, class 1. Functions are coded using NEON intrinsics and can be
+//* compiled using ARM RVCT.
+//*
+//* @author
+//* Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset,
+// WORD32 wd,
+// WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => *pu1_avail
+//x6 => *pi1_sao_offset
+//x7 => wd
+//x8 => ht
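+//
+//A minimal reference sketch (in C) of the class-1, vertical-neighbour kernel,
+//reconstructed from the intrinsic comments; SIGN and CLIP3 are assumed
+//macros. Note sign_up of a row is the negated sign_down of the row above,
+//which is why the loop below carries sign_up across iterations instead of
+//recomputing it:
+//
+// for(row = 0; row < ht; row++)
+//     for(col = 0; col < wd; col++)
+//     {
+//         WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
+//         WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+//         WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
+//         pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
+//     }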
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_av8
+
+ihevc_sao_edge_offset_class1_av8:
+
+
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+ MOV x5,x7 //Loads pu1_avail
+
+ LDR x6,[sp] //Loads pi1_sao_offset
+ LDR w7,[sp,#8] //Loads wd
+ LDR w8,[sp,#16] //Loads ht
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ SUB x9,x7,#1 //wd - 1
+ LDRB w10,[x3,x9] //pu1_src_top[wd - 1]
+ STRB w10,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1]
+ ADD x10,x0,x9 //pu1_src[row * src_strd + wd - 1]
+ MOV x11,x2 //Move pu1_src_left pointer to x11
+ MOV x12,x8 //Move ht to x12 for loop count
+SRC_LEFT_LOOP:
+ LDRB w14,[x10] //Load pu1_src[row * src_strd + wd - 1]
+ ADD x10,x10,x1
+ STRB w14,[x11],#1 //pu1_src_left[row]
+ SUBS x12, x12,#1 //Decrement the loop count
+ BNE SRC_LEFT_LOOP //If not equal to 0 jump to the src_left_loop
+
+ SUB x12,x8,#1 //ht - 1
+ mul x12, x12, x1 //(ht - 1) * src_strd
+ ADD x12,x12,x0 //pu1_src[(ht - 1) * src_strd]
+
+ LDRB w4,[x5,#2] //pu1_avail[2]
+ CMP x4,#0 //0 == pu1_avail[2]
+ ADD x20,x0,x1 //pu1_src += src_strd
+ csel x0, x20, x0,EQ
+ SUB x20,x8,#1 //ht--
+ csel x8, x20, x8,EQ
+
+ LDRB w4,[x5,#3] //pu1_avail[3]
+ CMP x4,#0 //0 == pu1_avail[3]
+ SUB x20,x8,#1 //ht--
+ csel x8, x20, x8,EQ
+
+ movi v0.16b, #2 //const_2 = vdupq_n_s8(2)
+ movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ LD1 {v6.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ LD1 {v7.8b},[x6] //offset_tbl = vld1_s8(pi1_sao_offset)
+
+ CMP x7,#16 //Compare wd with 16
+    BLT WIDTH_RESIDUE //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+ LDRB w4,[x5,#2] //pu1_avail[2]
+ CMP x4,#0 //0 == pu1_avail[2]
+ SUB x20,x0,x1 //pu1_src -= src_strd
+ csel x9, x20, x9,EQ
+ csel x9, x3, x9,NE //*pu1_src_top
+
+ MOV x10,x0 //*pu1_src
+
+ LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
+
+ LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col])
+ cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x11,x8 //move ht to x11 for loop count
+
+PU1_SRC_LOOP:
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
+
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+ cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB x10,x10,x1
+
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
+ TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ SUB v8.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+// TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+
+
+ NEG v16.16b, v8.16b //II sign_up = vnegq_s8(sign_down)
+ TBL v12.16b, {v7.16b},v12.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ ADD v22.16b, v22.16b , v8.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+
+ Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+// TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+
+ Uxtl2 v8.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+// TBL v13.8b, {v7.16b},v13.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+
+ SADDW2 v8.8h, v8.8h , v12.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SMAX v8.8h, v8.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ UMIN v8.8h, v8.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+// TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ xtn2 v20.16b, v8.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ ST1 { v20.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUBS x11,x11,#2 //II Decrement the ht loop count by 2
+ xtn2 v30.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v30.16b},[x10],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ PU1_SRC_LOOP_END //if 0 == pu1_avail[3] || 0 == pu1_avail[2], ht was decremented (ht--)
+ CMP x11,#1 //checking any residue remains
+ BGT PU1_SRC_LOOP //If not equal jump to PU1_SRC_LOOP
+
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB x10,x10,x1
+
+ ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v22.16b, v22.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v22.16b, {v6.16b},v22.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v23.8b, {v6.16b},v23.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ TBL v24.16b, {v7.16b},v22.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+// TBL v25.8b, {v7.16b},v23.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW2 v28.8h, v28.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v30.8b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn2 v30.16b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+ mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#16 //Decrement the wd loop count by 16
+ CMP x7,#8 //Check whether residue remains
+ BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
+ BGT WIDTH_LOOP_16 //If not equal jump to width_loop
+ BLT END_LOOPS //Jump to end function
+
+
+WIDTH_RESIDUE:
+ LDRB w4,[x5,#2] //pu1_avail[2]
+ CMP x4,#0 //0 == pu1_avail[2]
+ SUB x20,x0,x1 //pu1_src -= src_strd
+ csel x9, x20, x9,EQ
+ csel x9, x3, x9,NE //*pu1_src_top
+ MOV x10,x0
+
+ LD1 {v8.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
+
+ LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd])
+ ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col])
+
+ cmhi v12.16b, v10.16b , v8.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v14.16b, v8.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x11,x8 //move ht to x11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
+
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+ cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB x10,x10,x1
+
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
+ TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down)
+
+ ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+ Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ TBL v24.8b, {v7.16b},v22.8b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUBS x11,x11,#2 //Decrement the ht loop count by 2
+ ST1 {v30.8b},[x10],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BEQ END_LOOPS
+ CMP x11,#1
+ BGT PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP
+
+
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB x10,x10,x1
+
+ ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v22.16b, v22.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v22.8b, {v6.16b},v22.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v30.8b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ ST1 {v30.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
new file mode 100644
index 0000000..4baa5bf
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
@@ -0,0 +1,467 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_sao_edge_offset_class1_chroma.s
+//*
+//* @brief
+//* Contains function definitions for sample adaptive offset (SAO) edge
+//* offset, class 1, for interleaved CbCr. Functions are coded using NEON
+//* intrinsics and can be compiled using ARM RVCT.
+//*
+//* @author
+//* Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset_u,
+// WORD8 *pi1_sao_offset_v,
+// WORD32 wd,
+// WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => *pu1_avail
+//x6 => *pi1_sao_offset_u
+//x7 => *pi1_sao_offset_v
+//x8 => wd
+//x9 => ht
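+//
+//This variant combines the class-1 vertical-neighbour kernel of
+//ihevc_sao_edge_offset_class1.s with the interleaved CbCr handling of
+//ihevc_sao_edge_offset_class0_chroma.s: split pi1_sao_offset_u /
+//pi1_sao_offset_v lookups via the UZP/ZIP pairs in the loops below.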
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class1_chroma_av8
+
+ihevc_sao_edge_offset_class1_chroma_av8:
+
+
+ ldr x8,[sp,#0]
+ ldr x9,[sp,#8]
+ ldr w10,[sp,#16]
+ ldr w11,[sp,#24]
+
+
+ push_v_regs
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+ stp x25, x26,[sp,#-16]!
+
+ mov x15,x4 // *pu1_src_top_left 40
+ mov x16,x5 // *pu1_src_top_right 44
+ mov x17,x6 // *pu1_src_bot_left 48
+ mov x21,x7 // *pu1_avail 52
+ mov x22,x8 // *pi1_sao_offset_u 56
+ mov x23,x9 // *pi1_sao_offset_v 60
+ mov x24,x10 // wd 64
+ mov x25,x11 // ht 68
+
+ mov x4,x15
+ mov x5,x21
+ mov x6,x22
+ mov x7,x23
+ mov x8,x24
+ mov x9,x25
+
+ SUB x10,x8,#2 //wd - 2
+ LDRH w11,[x3,x10] //pu1_src_top[wd - 2]
+ STRH w11,[x4] //*pu1_src_top_left = pu1_src_top[wd - 2]
+ ADD x11,x0,x10 //pu1_src[row * src_strd + wd - 2]
+ MOV x12,x2 //Move pu1_src_left pointer to x11
+ MOV x14,x9 //Move ht to x14 for loop count
+SRC_LEFT_LOOP:
+ LDRH w10,[x11] //Load pu1_src[row * src_strd + wd - 2]
+ ADD x11,x11,x1
+ STRH w10,[x12],#2 //pu1_src_left[row]
+ SUBS x14, x14,#1 //Decrement the loop count
+ BNE SRC_LEFT_LOOP //If not equal to 0 jump to the src_left_loop
+
+ SUB x12,x9,#1 //ht - 1
+ mul x12, x12, x1 //(ht - 1) * src_strd
+ ADD x12,x12,x0 //pu1_src[(ht - 1) * src_strd]
+
+ LDRB w4,[x5,#2] //pu1_avail[2]
+ CMP x4,#0 //0 == pu1_avail[2]
+ ADD x20,x0,x1 //pu1_src += src_strd
+ csel x0, x20, x0,EQ
+ SUB x20,x9,#1 //ht--
+ csel x9, x20, x9,EQ
+
+ LDRB w4,[x5,#3] //pu1_avail[3]
+ CMP x4,#0 //0 == pu1_avail[3]
+ SUB x20,x9,#1 //ht--
+ csel x9, x20, x9,EQ
+
+ movi v0.16b, #2 //const_2 = vdupq_n_s8(2)
+ movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ LD1 {v6.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ LD1 {v7.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+ LD1 {v8.8b},[x7] //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+
+ CMP x8,#16 //Compare wd with 16
+    BLT WIDTH_RESIDUE //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+
+WIDTH_LOOP_16:
+ LDRB w4,[x5,#2] //pu1_avail[2]
+ CMP x4,#0 //0 == pu1_avail[2]
+ SUB x20,x0,x1 //pu1_src -= src_strd
+ csel x11, x20, x11,EQ
+ csel x11, x3, x11,NE //*pu1_src_top
+
+ MOV x10,x0 //*pu1_src
+
+ LD1 {v28.16b},[x11],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ //LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ LD1 {v10.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src)
+
+ LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ //LD1 {v31.8b},[x12],#8 //vld1q_u8(pu1_src[(ht - 1) * src_strd])
+ cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col])
+ cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x11,x9 //move ht to x11 for loop count
+
+PU1_SRC_LOOP:
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x10, x10,#8
+ ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
+
+ //mov v19.d[0],v18.d[1]
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x6, x6,#8
+
+    cmhi v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB x10,x10,x1
+
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+    cmhi v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ mov v16.d[1],v16.d[0]
+ NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
+ TBL v12.16b, {v6.16b},v12.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+    cmhi v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ SUB v28.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ //TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+
+ mov v13.d[0], v12.d[1]
+ UZP1 v27.8b, v12.8b, v13.8b
+ UZP2 v13.8b, v12.8b, v13.8b
+ mov v12.8b,v27.8b
+ NEG v16.16b, v28.16b //II sign_up = vnegq_s8(sign_down)
+ TBL v12.8b, {v7.16b},v12.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ ADD v22.16b, v22.16b , v28.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v13.8b, {v8.16b},v13.8b
+ ZIP1 v27.8b, v12.8b, v13.8b
+ ZIP2 v13.8b, v12.8b, v13.8b
+ mov v12.8b,v27.8b
+
+ SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ //TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ mov v23.d[0], v22.d[1]
+ UZP1 v27.8b, v22.8b, v23.8b
+ UZP2 v23.8b, v22.8b, v23.8b
+ mov v22.8b,v27.8b
+
+ Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ //VTBL.8 D13,D7,D13 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+
+ SADDW v28.8h, v28.8h , v13.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ TBL v25.8b, {v8.16b},v23.8b
+ ZIP1 v27.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v27.8b
+ //VTBL.8 D24,D7,D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ //VTBL.8 D25,D7,D23 @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ xtn2 v20.16b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ ST1 { v20.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+    SUBS x11,x11,#2             //II Decrement the ht loop count by 2 (two rows per trip)
+ xtn2 v30.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v30.16b},[x10],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+    BEQ PU1_SRC_LOOP_END        //If ht is exhausted, exit (ht was decremented when pu1_avail[2] or pu1_avail[3] is 0)
+    CMP x11,#1                  //Check whether a single residual row remains
+    BGT PU1_SRC_LOOP            //If more than one row remains, jump to PU1_SRC_LOOP
+
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x10, x10,#8
+    cmhi v12.16b,  v10.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+    cmhi v14.16b,  v18.16b ,  v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB x10,x10,x1
+
+ ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v22.16b, v22.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v22.16b, {v6.16b},v22.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ //TBL v23.8b, {v6.16b},v23.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ mov v23.d[0],v22.d[1]
+ UZP1 v27.8b, v22.8b, v23.8b
+ UZP2 v23.8b, v22.8b, v23.8b
+ mov v22.8b,v27.8b
+ TBL v24.8b, {v7.16b},v22.8b
+ TBL v25.8b, {v8.16b},v23.8b
+ ZIP1 v27.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v27.8b
+
+ //VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ //VTBL.8 D25,D7,D23 @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ Uxtl2 v28.8h, v10.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v30.8b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn2 v30.16b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+PU1_SRC_LOOP_END:
+ mov v10.16b, v18.16b //pu1_cur_row = pu1_next_row
+ SUBS x8,x8,#16 //Decrement the wd loop count by 16
+    CMP x8,#8                   //Check how many columns remain
+    BEQ WIDTH_RESIDUE           //If exactly 8 remain, jump to the residue loop
+    BGT WIDTH_LOOP_16           //If more than 8 remain, process another 16-wide block
+    BLT END_LOOPS               //If none remain, jump to the function exit
+
+
+WIDTH_RESIDUE:
+ LDRB w4,[x5,#2] //pu1_avail[2]
+ CMP x4,#0 //0 == pu1_avail[2]
+ SUB x20,x0,x1 //pu1_src -= src_strd
+ csel x11, x20, x11,EQ
+ csel x11, x3, x11,NE //*pu1_src_top
+ MOV x10,x0
+
+ LD1 {v28.16b},[x11] //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ //LD1 {v29.8b},[x11],#8 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
+ LD1 {v10.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v11.8b},[x0],#8 //pu1_cur_row = vld1q_u8(pu1_src)
+
+ LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd])
+ ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col])
+
+ cmhi v12.16b, v10.16b , v28.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v14.16b, v28.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v16.16b, v14.16b , v12.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x11,x9 //move ht to x11 for loop count
+
+PU1_SRC_LOOP_RESIDUE:
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x10, x10,#8
+ ADD x6,x10,x1 //II Iteration *pu1_src + src_strd
+
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v31.8b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x6, x6,#8
+
+ cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB x10,x10,x1
+
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ ADD v12.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
+
+ ADD v12.16b, v12.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
+
+ NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down)
+ TBL v12.8b, {v6.16b},v12.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ UZP1 v27.8b, v12.8b, v13.8b
+ UZP2 v13.8b, v12.8b, v13.8b
+ mov v12.8b,v27.8b
+
+ ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ TBL v12.8b, {v7.16b},v12.8b
+ NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down)
+
+ TBL v13.8b, {v8.16b},v13.8b
+ ZIP1 v27.8b, v12.8b, v13.8b
+ ZIP2 v13.8b, v12.8b, v13.8b
+ mov v12.8b,v27.8b
+
+ //VTBL.8 D12,D7,D12 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+ Uxtl v20.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ SADDW v20.8h, v20.8h , v12.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ UZP1 v27.8b, v22.8b, v23.8b
+ UZP2 v23.8b, v22.8b, v23.8b
+ mov v22.8b,v27.8b
+
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ TBL v24.8b, {v7.16b},v22.8b
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ TBL v25.8b, {v8.16b},v23.8b
+ ZIP1 v27.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v27.8b
+ //VTBL.8 D24,D7,D22 @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ mov v10.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    SUBS x11,x11,#2             //Decrement the ht loop count by 2 (two rows per trip)
+ ST1 {v30.8b},[x10],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BEQ END_LOOPS
+    CMP x11,#1                  //Check whether a single residual row remains
+    BGT PU1_SRC_LOOP_RESIDUE    //If more than one row remains, jump to PU1_SRC_LOOP_RESIDUE
+
+
+ ADD x10,x10,x1 //*pu1_src + src_strd
+ LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v19.8b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x10, x10,#8
+ cmhi v12.16b, v10.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row)
+ cmhi v14.16b, v18.16b , v10.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
+ SUB v20.16b, v14.16b , v12.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB x10,x10,x1
+
+ ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v22.16b, v22.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v22.8b, {v6.16b},v22.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ UZP1 v27.8b, v22.8b, v23.8b
+ UZP2 v23.8b, v22.8b, v23.8b
+ mov v22.8b,v27.8b
+
+ TBL v24.8b, {v7.16b},v22.8b
+ TBL v25.8b, {v8.16b},v23.8b
+ ZIP1 v27.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v27.8b
+
+ //VTBL.8 D24,D7,D22 @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v26.8h, v10.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v30.8b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ ST1 {v30.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+END_LOOPS:
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x25, x26,[sp],#16
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class2.s b/common/arm64/ihevc_sao_edge_offset_class2.s
new file mode 100644
index 0000000..3350e5c
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@@ -0,0 +1,846 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class2.s
+//*
+//* @brief
+//*  Contains function definitions for SAO (sample adaptive offset) edge
+//*  offset class 2. Functions are coded using NEON intrinsics and can be
+//*  compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset,
+// WORD32 wd,
+// WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => *pu1_avail
+//x6 => *pi1_sao_offset
+//x7 => wd
+//x8 => ht
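+
+//A scalar sketch of what this kernel vectorizes - class 2 pairs each pixel
+//with its top-left and bottom-right neighbours (the 135 degree diagonal);
+//the loop structure is illustrative and border handling via pu1_avail[]
+//is omitted:
+//  for(row = 0; row < ht; row++, pu1_src += src_strd)
+//      for(col = 0; col < wd; col++)
+//      {
+//          WORD32 e = 2 + SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd])
+//                       + SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
+//          e = gi1_table_edge_idx[e];
+//          if(e)
+//              pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[e], 0, 255);
+//      }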
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_av8
+
+ihevc_sao_edge_offset_class2_av8:
+
+
+ // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments
+ MOV x5,x7 //Loads pu1_avail
+
+ LDR x6,[sp] //Loads pi1_sao_offset
+ LDR w7,[sp,#8] //Loads wd
+ LDR w8,[sp,#16] //Loads ht
+
+ MOV x16,x7 // wd
+ MOV x17,x8 // ht
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+
+ SUB x9,x7,#1 //wd - 1
+
+ LDRB w10,[x3,x9] //pu1_src_top[wd - 1]
+
+ MOV x19,x0 //Store pu1_src in sp
+ MOV x21,x2 //Store pu1_src_left in sp
+ MOV x22,x3 //Store pu1_src_top in sp
+ MOV x23,x5 //Store pu1_avail in sp
+ MOV x24,x4 //Store pu1_src_top_left in sp
+
+
+ MOV x9,x7 //Move width to x9 for loop count
+
+ SUB sp,sp,#0xA0 //Decrement the stack pointer to store some temp arr values
+
+ STRB w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 1]
+ SUB x10,x8,#1 //ht-1
+ madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col]
+ ADD x12,sp,#0x02 //temp array
+
+AU1_SRC_TOP_LOOP:
+ LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col]
+ SUBS x9,x9,#8 //Decrement the loop count by 8
+ ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP:
+ LDRB w10,[x5,#4] //pu1_avail[4]
+ CMP x10,#0
+ LDRB w9,[x0] //u1_pos_0_0_tmp = pu1_src[0]
+ BEQ PU1_AVAIL_7_LOOP
+
+ LDRB w11,[x4] //pu1_src_top_left[0]
+ ADD x14,x0,x1 //pu1_src + src_strd
+
+ SUBS x12,x9,x11 //pu1_src[0] - pu1_src_top_left[0]
+ LDRB w4,[x14,#1] //pu1_src[1 + src_strd]
+
+ movn x20,#0
+ csel x12, x20, x12,LT
+ MOV x20,#1
+ csel x12, x20, x12,GT //SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ SUBS x11,x9,x4 //pu1_src[0] - pu1_src[1 + src_strd]
+
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+ ADD x4,x12,x11 //SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[1 + src_strd])
+ ADD x4,x4,#2 //edge_idx
+
+ LDRSB x12,[x14,x4] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0 //0 != edge_idx
+ BEQ PU1_AVAIL_7_LOOP
+ LDRSB x10,[x6,x12] //pi1_sao_offset[edge_idx]
+ ADD x9,x9,x10 //pu1_src[0] + pi1_sao_offset[edge_idx]
+ mov x20,#255
+ cmp x9,x20
+ csel x9, x20, x9, ge //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP:
+ LDRB w14,[x5,#7] //pu1_avail[7]
+ CMP x14,#0
+ SUB x10,x7,#1 //wd - 1
+ SUB x11,x8,#1 //ht - 1
+ madd x12, x11, x1, x10 //wd - 1 + (ht - 1) * src_strd
+ ADD x12,x12,x0 //pu1_src[wd - 1 + (ht - 1) * src_strd]
+ LDRB w10,[x12] //u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
+ BEQ PU1_AVAIL
+
+ SUB x4,x12,x1 //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+ SUB x4,x4,#1
+ LDRB w11,[x4] //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
+ ADD x4,x4,#1
+ ADD x14,x12,x1 //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+
+ SUBS x11,x10,x11 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
+ LDRB w4,[x14,#1] //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
+
+ SUBS x4,x10,x4 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
+ movn x20,#0
+ csel x4, x20, x4,LT
+ MOV x20,#1
+ csel x4, x20, x4,GT //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
+
+ ADD x11,x11,x4 //Add 2 sign value
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0
+ BEQ PU1_AVAIL
+ LDRSB x11,[x6,x12] //pi1_sao_offset[edge_idx]
+ ADD x10,x10,x11 //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ mov x20,#255
+ cmp x10,x20
+ csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL:
+ MOV x12,x8 //Move ht
+ movi v0.16b, #2 //const_2 = vdupq_n_s8(2)
+ LDRB w11,[x5,#3] //pu1_avail[3]
+
+ MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy
+ movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ CMP x11,#0
+
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ SUB x20,x12,#1 //ht_tmp--
+ csel x12, x20, x12,EQ
+
+ CMP x5,#0
+ LD1 {v7.8b},[x6] //offset_tbl = vld1_s8(pi1_sao_offset)
+ ADRP x11, :got:gi1_table_edge_idx //table pointer
+ LDR x11, [x11, #:got_lo12:gi1_table_edge_idx]
+
+
+ ADD x20,x0,x1 //pu1_src += src_strd
+ csel x0, x20, x0,EQ
+ LD1 {v6.8b},[x11] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SUB x20,x12,#1 //ht_tmp--
+ csel x12, x20, x12,EQ
+
+ MOV x6,x7 //move wd to x6 loop_count
+ movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ ADD x20,x14,#1 //pu1_src_left_cpy += 1
+ csel x14, x20, x14,EQ
+
+ MOV x15,x0
+ CMP x7,#16 //Compare wd with 16
+
+    BLT WIDTH_RESIDUE           //If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
+ CMP x8,#4 //Compare ht with 4
+    BLE WD_16_HT_4_LOOP         //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ MOV x7,x16 //Loads wd
+
+ MOV x5,x23 //Loads pu1_avail
+ CMP x6,x7 //col == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+ MOV x20,#-1
+ csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ CMP x6,#16 //if(col == 16)
+ BNE SKIP_AU1_MASK_VAL
+ LDRB w8,[x5,#1] //pu1_avail[1]
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
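+
+//au1_mask zeroes the edge_idx of boundary lanes when pu1_avail[0] or
+//pu1_avail[1] is 0: after the AND with au1_mask further down, those lanes
+//index entry 0 of the offset table, so the pixels pass through unchanged
+//(assuming entry 0 holds the no-offset case, as the edge-index remap
+//implies).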
+
+SKIP_AU1_MASK_VAL:
+ LDRB w11,[x5,#2] //pu1_avail[2]
+ CMP x11,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ csel x8, x3, x8,NE //pu1_src_top_cpy
+ SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+ MOV x7,x16 //Loads wd
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ ADD x3,x3,#16
+
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ MOV x4,x17 //Loads ht
+
+ SUB x7,x7,x6 //(wd - col)
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ MOV x8,x19 //Loads *pu1_src
+
+ ADD x7,x7,#15 //15 + (wd - col)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+ SUB x5,x5,#1
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP:
+ LDRB w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+ STRB w8,[x5,#1]! //store it in the stack pointer
+ SUBS x4,x4,#1 //decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP
+
+ ADD x8,x0,x1 //I Iteration *pu1_src + src_strd
+ movi v18.16b, #0
+ MOV x4,x23 //I Loads pu1_avail
+
+ MOV x7,x12 //row count, move ht_tmp to x7
+ LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ LDRB w4,[x4,#2] //I pu1_avail[2]
+
+ LDRB w5,[x8,#16] //I pu1_src_cpy[src_strd + 16]
+ mov v18.8b[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+ EXT v18.16b, v16.16b , v18.16b,#1 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ CMP x4,#0 //I
+ BNE SIGN_UP_CHANGE_DONE //I
+
+SIGN_UP_CHANGE:
+ SUB x2,x12,x7 //I ht_tmp - row
+ LDRB w11,[x0] //I pu1_src_cpy[0]
+ ADD x2,x14,x2 //I pu1_src_left_cpy[ht_tmp - row]
+ SUB x2,x2,#1
+ LDRB w5,[x2] //I load the value
+ ADD x2,x2,#1
+ SUBS x4,x11,x5 //I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ movn x20,#0
+ csel x4, x20, x4,LT //I
+ MOV x20,#1
+ csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    mov v14.8b[0], w4           //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE:
+ cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD v24.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+
+ cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v24.16b, v24.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v18.16b, {v6.16b},v24.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v19.8b, {v6.16b},v25.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down)
+ TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+// TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row
+
+ SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1
+
+ UMIN v22.8h, v22.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn2 v20.16b, v22.8h //I vmovn_s16(pi2_tmp_cur_row.val[1])
+
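+//The loop below is software pipelined: the rows tagged I, II and III in the
+//comments are kept in flight together (the I-tagged setup above feeds the
+//first store inside PU1_SRC_LOOP), so the load/TBL latency of one row hides
+//behind the arithmetic of its neighbours; two rows are retired per trip.
+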
+PU1_SRC_LOOP:
+
+ ST1 { v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ ADD x8,x0,x1 //II iteration *pu1_src + src_strd
+
+ LD1 {v16.16b},[x8] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ ADD x11,x8,x1 //III iteration *pu1_src + src_strd
+
+ LDRB w5,[x8,#16] //II pu1_src_cpy[src_strd + 16]
+ LD1 {v30.16b},[x11] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ LDRB w4,[x0] //II pu1_src_cpy[0]
+
+ LDRB w8,[x11,#16] //III pu1_src_cpy[src_strd + 16]
+ mov v28.8b[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+
+ SUB x5,x12,x7 //II ht_tmp - row
+ EXT v22.16b, v16.16b , v28.16b,#1 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ ADD x5,x14,x5 //II pu1_src_left_cpy[ht_tmp - row]
+
+ SUB x5,x5,#1
+ LDRB w5,[x5] //II load the value
+ mov v18.8b[0], w8 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
+
+ SUBS x4,x4,x5 //II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ EXT v18.16b, v30.16b , v18.16b,#1 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ LDRB w2,[x0,x1] //III pu1_src_cpy[0]
+
+ cmhi v24.16b, v12.16b , v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB x5,x12,x7 //III ht_tmp - row
+
+ movn x20,#0
+ csel x4, x20, x4,LT //II
+ cmhi v22.16b, v22.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD x5,x14,x5 //III pu1_src_left_cpy[ht_tmp - row]
+
+ MOV x20,#1
+ csel x4, x20, x4,GT //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+ SUB v24.16b, v22.16b , v24.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ SUB x5,x5,#1
+ LDRB w5,[x5] //III load the value
+
+ SUBS x2,x2,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+    mov v14.8b[0], w4           //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+ movn x20,#0
+ csel x2, x20, x2,LT //III
+ cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ MOV x20,#1
+ csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+
+ ADD v22.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v22.16b, v22.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ TBL v22.16b, {v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+
+ SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+// TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ AND v22.16b, v22.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+    mov v14.8b[0], w2           //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+ ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+ TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ Uxtl v26.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down)
+
+ SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+// TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+ Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+// TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ Uxtl2 v28.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+// TBL v11.8b, {v7.16b},v19.8b //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row
+ xtn v26.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ xtn2 v26.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+ SADDW2 v18.8h, v18.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
+ UMIN v18.8h, v18.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ CMP x7,#1 //III
+
+ ST1 { v26.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
+
+    BGT PU1_SRC_LOOP            //III If more than one row remains, jump back to PU1_SRC_LOOP
+    BLT INNER_LOOP_DONE         //If no residual row remains, skip the single-row epilogue
+
+ ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ ADD x8,x0,x1 //*pu1_src + src_strd
+
+ LDRB w2,[x0] //pu1_src_cpy[0]
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ LDRB w5,[x8,#16] //pu1_src_cpy[src_strd + 16]
+
+ SUB x11,x12,x7 //ht_tmp - row
+ mov v18.8b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ ADD x11,x14,x11 //pu1_src_left_cpy[ht_tmp - row]
+
+ SUB x11,x11,#1
+ LDRB w5,[x11] //load the value
+ ADD x11,x11,#1
+ EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+ SUBS x4,x2,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+
+ cmhi v10.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ movn x20,#0
+ csel x4, x20, x4,LT
+
+ MOV x20,#1
+ csel x4, x20, x4,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+ cmhi v18.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    mov v14.8b[0], w4           //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+ SUB v10.16b, v18.16b , v10.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v10.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ TBL v18.16b, {v6.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v10.16b //sign_up = vnegq_s8(sign_down)
+
+// TBL v19.8b, {v6.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ TBL v10.16b, {v7.16b},v18.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+// TBL v11.8b, {v7.16b},v19.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ SADDW v20.8h, v20.8h , v10.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ Uxtl2 v12.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ SADDW2 v12.8h, v12.8h , v10.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v12.8h, v12.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ UMIN v12.8h, v12.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ xtn2 v20.16b, v12.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+INNER_LOOP_DONE:
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ ST1 { v20.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ MOV x2,x21 //Loads *pu1_src_left
+
+ MOV x8,x17 //Loads ht
+ SUB x5,x5,#1
+
+ SUB x2,x2,#1
+SRC_LEFT_LOOP:
+ LDRB w7,[x5,#1]! //au1_src_left_tmp[row]
+ SUBS x8,x8,#1
+ STRB w7,[x2,#1]! //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+    SUB x6,x6,#16               //Decrement the wd loop count by 16
+    CMP x6,#8                   //Check whether residue remains
+    BLT RE_ASSIGNING_LOOP       //If fewer than 8 columns remain, jump to the re-assigning loop
+    MOV x7,x16                  //Loads wd
+    MOV x0,x15                  //Loads *pu1_src
+    SUB x7,x7,x6
+    ADD x0,x0,x7
+    BGT WIDTH_LOOP_16           //If more than 8 columns remain, process another 16-wide block
+    BEQ WIDTH_RESIDUE           //If exactly 8 columns remain, jump to the residue loop
+
+
+WD_16_HT_4_LOOP:
+ MOV x7,x16 //Loads wd
+ MOV x5,x23 //Loads pu1_avail
+ CMP x6,x7 //col == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+ MOV x20,#-1
+ csel x8, x20, x8,NE //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
+ CMP x6,#16 //if(col == 16)
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB w8,[x5,#1] //pu1_avail[1]
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB w8,[x5,#2] //pu1_avail[2]
+ CMP x8,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ csel x8, x3, x8,NE
+ SUB x8,x8,#1 //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
+
+ MOV x7,x16 //Loads wd
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
+ ADD x3,x3,#16
+
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ MOV x4,x17 //Loads ht
+
+ SUB x7,x7,x6 //(wd - col)
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ MOV x8,x19 //Loads *pu1_src
+
+ ADD x7,x7,#15 //15 + (wd - col)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+ SUB x5,x5,#1
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRB w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+ SUBS x4,x4,#1 //decrement the loop count
+ STRB w8,[x5,#1]! //store it in the stack pointer
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ movi v18.16b, #0
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ ADD x8,x0,x1 //*pu1_src + src_strd
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+ LDRB w5,[x8,#16] //pu1_src_cpy[src_strd + 16]
+ mov v18.8b[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+ CMP x7,x12
+ BLT SIGN_UP_CHANGE_WD_16_HT_4
+ MOV x5,x23 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB w8,[x0] //pu1_src_cpy[0]
+ SUB x5,x12,x7 //ht_tmp - row
+ ADD x5,x14,x5 //pu1_src_left_cpy[ht_tmp - row]
+ SUB x5,x5,#1
+ LDRB w5,[x5] //load the value
+ SUBS x8,x8,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    mov v14.8b[0], w8           //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+// TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn2 v28.16b, v30.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ MOV x8,x17 //Loads ht
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ MOV x2,x21 //Loads *pu1_src_left
+ SUB x5,x5,#1
+ SUB x2,x2,#1
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRB w7,[x5,#1]! //au1_src_left_tmp[row]
+ STRB w7,[x2,#1]! //pu1_src_left[row] = au1_src_left_tmp[row]
+ SUBS x8,x8,#1
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+    BLE RE_ASSIGNING_LOOP       //If no columns remain, jump to the re-assigning loop
+
+
+WIDTH_RESIDUE:
+ MOV x7,x16 //Loads wd
+ MOV x5,x23 //Loads pu1_avail
+ CMP x6,x7 //wd_residue == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov v8.8b[7], w8            //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+PU1_AVAIL_2_RESIDUE:
+ LDRB w11,[x5,#2] //pu1_avail[2]
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ CMP x11,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ csel x8, x3, x8,NE
+
+ SUB x8,x8,#1
+
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ LD1 {v10.16b},[x8],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
+ MOV x7,x16 //Loads wd
+
+ MOV x4,x17 //Loads ht
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ SUB x7,x7,#1 //(wd - 1)
+
+ MOV x8,x19 //Loads *pu1_src
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB x5,x5,#1
+
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 1)]
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRB w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+ SUBS x4,x4,#1 //decrement the loop count
+ STRB w8,[x5,#1]! //store it in the stack pointer
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+ movi v18.16b, #0
+ ADD x8,x0,x1 //*pu1_src + src_strd
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+
+ LDRB w8,[x8,#16] //pu1_src_cpy[src_strd + 16]
+ mov v18.8b[0], w8 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ EXT v18.16b, v16.16b , v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
+
+ CMP x7,x12
+ BLT SIGN_UP_CHANGE_RESIDUE
+ MOV x5,x23 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB w8,[x0] //pu1_src_cpy[0]
+ SUB x5,x12,x7 //ht_tmp - row
+
+ ADD x5,x14,x5
+ SUB x5,x5,#1
+ LDRB w5,[x5] //load the value
+ SUBS x8,x8,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
+    mov v14.8b[0], w8           //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v14.16b, v14.16b , v14.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)
+
+ TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#1
+ BNE PU1_SRC_LOOP_RESIDUE
+
+ MOV x8,x17 //Loads ht
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+
+ MOV x2,x21 //Loads *pu1_src_left
+ SUB x5,x5,#1
+
+ SUB x2,x2,#1
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDRB w7,[x5,#1]! //au1_src_left_tmp[row]
+ SUBS x8,x8,#1
+ STRB w7,[x2,#1]! //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSIGNING_LOOP:
+ MOV x8,x17 //Loads ht
+ MOV x7,x16 //Loads wd
+
+ MOV x0,x19 //Loads *pu1_src
+ SUB x8,x8,#1 //ht - 1
+
+ madd x6, x8, x1, x7 //wd - 1 + (ht - 1) * src_strd
+ STRB w9,[x0] //pu1_src_org[0] = u1_pos_0_0_tmp
+
+ MOV x4,x24 //Loads pu1_src_top_left
+ ADD x6,x0,x6 //pu1_src[wd - 1 + (ht - 1) * src_strd]
+
+ ADD x12,sp,#0x02
+ SUB x6,x6,#1
+ STRB w10,[x6] //pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+ ADD x6,x6,#1
+
+ LDRB w11,[sp] //load u1_src_top_left_tmp from stack pointer
+ MOV x3,x22 //Loads pu1_src_top
+
+ STRB w11,[x4] //*pu1_src_top_left = u1_src_top_left_tmp
+
+SRC_TOP_LOOP:
+ LD1 {v0.8b},[x12],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS x7,x7,#8 //Decrement the width
+ ST1 {v0.8b},[x3],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0xA0
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
new file mode 100644
index 0000000..2fa7c22
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@@ -0,0 +1,1120 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class2_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for SAO (sample adaptive offset) edge
+//*  offset class 2 for interleaved chroma. Functions are coded using NEON
+//*  intrinsics and can be compiled using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset_u,
+// WORD8 *pi1_sao_offset_v,
+// WORD32 wd,
+// WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => *pu1_avail
+//x6 => *pi1_sao_offset_u
+//x9 => *pi1_sao_offset_v
+//x7 => wd
+//x8 => ht
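+
+//Same 135 degree classification as the luma kernel above, applied to
+//interleaved CbCr: horizontal neighbours sit 2 bytes away (hence the
+//"wd - 2" and "+/- 2" addressing below), the corner pixels are handled
+//once per plane, and two offset tables are kept in separate vector
+//registers - per pixel pair, roughly:
+//  u = CLIP3(u + pi1_sao_offset_u[edge_idx_u], 0, 255);
+//  v = CLIP3(v + pi1_sao_offset_v[edge_idx_v], 0, 255);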
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class2_chroma_av8
+
+ihevc_sao_edge_offset_class2_chroma_av8:
+
+
+ // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments
+
+ ldr x8,[sp,#0]
+ ldr x9,[sp,#8]
+ ldr w10,[sp,#16]
+ ldr w11,[sp,#24]
+ push_v_regs
+
+
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+ stp x25, x26,[sp,#-16]!
+ stp x27, x28,[sp,#-16]!
+
+ mov x15,x4 // *pu1_src_top_left 0x28
+ //mov x16,x5 // *pu1_src_top_right 0x2c
+ mov x17,x6 // *pu1_src_bot_left 0x30
+ mov x21,x7 // *pu1_avail 0x34
+ mov x22,x8 // *pi1_sao_offset_u 0x38
+ mov x23,x9 // *pi1_sao_offset_v 0x3c
+ mov x24,x10 // wd 0x40
+ mov x25,x11 // ht 0x44
+
+
+ mov w7, w24 //Loads wd
+ mov w8, w25 //Loads ht
+ SUB x9,x7,#2 //wd - 2
+
+ mov x4, x15 //Loads pu1_src_top_left
+ LDRH w10,[x3,x9] //pu1_src_top[wd - 2]
+
+ mov x26, x0 //Store pu1_src in sp
+ MOV x9,x7 //Move width to x9 for loop count
+
+    mov x17, x2                 //Store pu1_src_left in sp
+ mov x5, x21 //Loads pu1_avail
+ mov x6, x22 //Loads pi1_sao_offset_u
+
+ mov x22, x3 //Store pu1_src_top in sp
+ SUB sp,sp,#0xE0 //Decrement the stack pointer to store some temp arr values
+
+ STRH w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 2]
+ SUB x10,x8,#1 //ht-1
+ madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col]
+ ADD x12,sp,#10 //temp array
+
+AU1_SRC_TOP_LOOP:
+ LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col]
+ SUBS x9,x9,#8 //Decrement the loop count by 8
+ ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_4_LOOP_U:
+ LDRB w9,[x5,#4] //pu1_avail[4]
+ CMP x9,#0
+ LDRB w9,[x0] //u1_pos_0_0_tmp_u = pu1_src[0]
+ LDRB w10,[x0,#1] //u1_pos_0_0_tmp_v = pu1_src[1]
+ BEQ PU1_AVAIL_7_LOOP_U
+
+ LDRB w11,[x4] //pu1_src_top_left[0]
+ ADD x14,x0,x1 //pu1_src + src_strd
+
+ SUB x12,x9,x11 //pu1_src[0] - pu1_src_top_left[0]
+
+ LDRB w14,[x14,#2] //pu1_src[2 + src_strd]
+ CMP x12,#0
+
+ movn x20,#0
+ csel x12, x20, x12,LT
+ SUB x11,x9,x14 //pu1_src[0] - pu1_src[2 + src_strd]
+
+ MOV x20,#1
+ csel x12, x20, x12,GT //SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+
+ ADD x11,x12,x11 //SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[2 + src_strd])
+ ADD x11,x11,#2 //edge_idx
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0 //0 != edge_idx
+ BEQ PU1_AVAIL_4_LOOP_V
+ LDRSB x11,[x6,x12] //pi1_sao_offset_u[edge_idx]
+ ADD x9,x9,x11 //pu1_src[0] + pi1_sao_offset_u[edge_idx]
+ mov x20,#255
+ cmp x9,x20
+ csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_4_LOOP_V:
+
+ LDRB w11,[x4,#1] //pu1_src_top_left[1]
+ ADD x14,x0,x1 //pu1_src + src_strd
+
+ SUB x12,x10,x11 //pu1_src[1] - pu1_src_top_left[1]
+ LDRB w14,[x14,#3] //pu1_src[3 + src_strd]
+
+ CMP x12,#0
+ movn x20,#0
+ csel x12, x20, x12,LT
+ SUB x11,x10,x14 //pu1_src[1] - pu1_src[3 + src_strd]
+ MOV x20,#1
+ csel x12, x20, x12,GT //SIGN(pu1_src[0] - pu1_src_top_left[0])
+
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[0] - pu1_src[3 + src_strd])
+
+ ADD x11,x12,x11 //SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[3 + src_strd])
+ ADD x11,x11,#2 //edge_idx
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0 //0 != edge_idx
+ BEQ PU1_AVAIL_7_LOOP_U
+ mov x11, x23 //Loads pi1_sao_offset_v
+ LDRSB x11,[x11,x12] //pi1_sao_offset_v[edge_idx]
+ ADD x10,x10,x11 //pu1_src[0] + pi1_sao_offset_v[edge_idx]
+ mov x20,#255
+ cmp x10,x20
+ csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_U:
+ STRB w10,[sp,#7]
+ STRB w9,[sp,#6]
+
+ LDRB w10,[x5,#7] //pu1_avail[7]
+ CMP x10,#0
+ SUB x10,x7,#2 //wd - 2
+ SUB x11,x8,#1 //ht - 1
+ madd x12, x11, x1, x10 //wd - 2 + (ht - 1) * src_strd
+ ADD x12,x12,x0 //pu1_src[wd - 2 + (ht - 1) * src_strd]
+ LDRB w10,[x12] //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
+ LDRB w9,[x12,#1] //u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
+ BEQ PU1_AVAIL_3_LOOP
+
+ SUB x11,x12,x1 //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
+ SUB x11,x11,#2 //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+ LDRB w11,[x11] //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
+ SUB x11,x10,x11 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
+
+ ADD x14,x12,x1 //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
+ ADD x14,x14,#2 //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+ LDRB w14,[x14] //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+    SUB x14,x10,x14             //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
+ CMP x14,#0
+ movn x20,#0
+ csel x14, x20, x14,LT
+ MOV x20,#1
+ csel x14, x20, x14,GT //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
+
+ ADD x11,x11,x14 //Add 2 sign value
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x14,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x14,#0
+ BEQ PU1_AVAIL_7_LOOP_V
+ LDRSB x11,[x6,x14] //pi1_sao_offset_u[edge_idx]
+ ADD x10,x10,x11 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ mov x20,#255
+ cmp x10,x20
+ csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_7_LOOP_V:
+ ADD x12,x12,#1
+ SUB x11,x12,x1 //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
+ SUB x11,x11,#2 //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+ LDRB w11,[x11] //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
+ SUB x11,x9,x11 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
+
+ ADD x14,x12,x1 //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
+ ADD x14,x14,#2 //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+ LDRB w14,[x14] //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+ SUB x14,x9,x14 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
+ CMP x14,#0
+ movn x20,#0
+ csel x14, x20, x14,LT
+ MOV x20,#1
+    csel x14, x20, x14,GT       //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])
+
+ ADD x11,x11,x14 //Add 2 sign value
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0
+ BEQ PU1_AVAIL_3_LOOP
+ mov x14, x23 //Loads pi1_sao_offset_v
+ LDRSB x11,[x14,x12] //pi1_sao_offset_v[edge_idx]
+    ADD    x9,x9,x11                    //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx]
+    mov    x20,#255
+    cmp    x9,x20
+    csel    x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
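+// The two blocks above are the scalar special case for the bottom-right
+// (U,V) pair. A minimal C sketch of the same flow (sign3() returning
+// -1/0/+1 is what each CMP/movn/csel triple computes):
+//
+//     pos = wd - 2 + (ht - 1) * src_strd;                         /* U sample */
+//     edge_idx = 2 + sign3(pu1_src[pos] - pu1_src[pos - 2 - src_strd])
+//                  + sign3(pu1_src[pos] - pu1_src[pos + 2 + src_strd]);
+//     edge_idx = gi1_table_edge_idx[edge_idx];
+//     if (0 != edge_idx)
+//         u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[pos] + pi1_sao_offset_u[edge_idx],
+//                                    0, (1 << bit_depth) - 1);
+//     /* same again at pos + 1 with pi1_sao_offset_v for the V sample */
+//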
+PU1_AVAIL_3_LOOP:
+    STRB    w10,[sp,#8]                 //store u1_pos_wd_ht_tmp_u at [sp, #8]
+    movi    v0.16b, #2                  //const_2 = vdupq_n_s8(2)
+    STRB    w9,[sp,#9]                  //store u1_pos_wd_ht_tmp_v at [sp, #9]
+
+ MOV x12,x8 //Move ht
+ movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy
+
+ LDRB w11,[x5,#3] //pu1_avail[3]
+ movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ CMP x11,#0
+
+ SUB x20,x12,#1 //ht_tmp--
+ csel x12, x20, x12,EQ
+ LDRB w5,[x5,#2] //pu1_avail[2]
+
+ CMP x5,#0
+
+ ADD x20,x0,x1 //pu1_src += src_strd
+ csel x0, x20, x0,EQ
+ LD1 {v6.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+ SUB x20,x12,#1 //ht_tmp--
+ csel x12, x20, x12,EQ
+
+ mov x6, x23 //Loads pi1_sao_offset_v
+ ADD x20,x14,#2 //pu1_src_left_cpy += 2
+ csel x14, x20, x14,EQ
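+
+// Row-count trimming: if the top neighbours are unavailable
+// (0 == pu1_avail[2]) the first row is skipped (pu1_src and
+// pu1_src_left_cpy advance, ht_tmp drops by one); if the bottom
+// neighbours are unavailable (0 == pu1_avail[3]) the last row is
+// dropped as well.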
+
+    mov    x27, x0                      //Store pu1_src in x27
+ LD1 {v7.8b},[x6] //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+ ADRP x2, :got:gi1_table_edge_idx //table pointer
+ LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]
+
+ MOV x6,x7 //move wd to x6 loop_count
+ movi v8.16b, #0XFF //au1_mask = vdupq_n_s8(-1)
+ CMP x7,#16 //Compare wd with 16
+
+    BLT    WIDTH_RESIDUE                //If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+ CMP x8,#4 //Compare ht with 4
+    BLE    WD_16_HT_4_LOOP              //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ mov x5, x21 //Loads pu1_avail
+ mov w7, w24 //Loads wd
+ CMP x6,x7 //col == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP x6,#16 //if(col == 16)
+    mov    v8.8b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ BNE SKIP_AU1_MASK_VAL
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov    v8.16b[14], w8               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
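+// au1_mask zeroes the edge_idx lanes whose neighbours fall outside the
+// current CTB when pu1_avail forbids using them. Roughly, in C (two bytes
+// per column because U/V are interleaved):
+//
+//     memset(au1_mask, 0xFF, 16);
+//     if (0 == pu1_avail[0]) au1_mask[0]  = au1_mask[1]  = 0;  /* no left CTB  */
+//     if (0 == pu1_avail[1]) au1_mask[14] = au1_mask[15] = 0;  /* no right CTB */
+//
+// A zero lane forces edge_idx to 0, which selects the zero offset, so the
+// sample passes through unchanged.
+//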
+SKIP_AU1_MASK_VAL:
+ LDRB w9,[x5,#2] //pu1_avail[2]
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //SUB x0, x0,#8
+ CMP x9,#0
+
+ mov w4, w25 //Loads ht
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+
+ mov w7, w24 //Loads wd
+ csel x8, x3, x8,NE //pu1_src_top_cpy
+
+ SUB x8,x8,#2 //pu1_src - src_strd - 2
+ ADD x3,x3,#16
+
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ //SUB x8, x8,#8
+ SUB x7,x7,x6 //(wd - col)
+
+    ADD    x7,x7,#14                    //14 + (wd - col)
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ mov x8, x26 //Loads *pu1_src
+
+    ADD    x7,x8,x7                     //pu1_src[0 * src_strd + 14 + (wd - col)]
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+AU1_SRC_LEFT_LOOP:
+ LDRH w8,[x7] //load the value and increment by src_strd
+ SUBS x4,x4,#1 //decrement the loop count
+
+ STRH w8,[x5],#2 //store it in the stack pointer
+ ADD x7,x7,x1
+
+ BNE AU1_SRC_LEFT_LOOP
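+
+// The loop above snapshots one (U,V) pair per row into au1_src_left_tmp on
+// the stack before the rows are filtered; the final pass over the last
+// 16-wide strip leaves the true right-most pair, which SRC_LEFT_LOOP later
+// copies out as pu1_src_left for the CTB to the right. Effectively:
+//
+//     for (row = 0; row < ht; row++)
+//         memcpy(au1_src_left_tmp + 2 * row,
+//                pu1_src + row * src_strd + wd - 2, 2);
+//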
+
+ ADD x8,x0,x1 //I *pu1_src + src_strd
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+ LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x8, x8,#8
+
+ ADD x8,x8,#16 //I
+ movi v18.16b, #0
+ LDRH w5,[x8] //I pu1_src_cpy[src_strd + 16]
+
+ mov x10, x21 //I Loads pu1_avail
+ mov v18.4h[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ LDRB w10,[x10,#2] //I pu1_avail[2]
+
+ CMP x10,#0 //I
+ EXT v18.16b, v16.16b , v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+ BNE SIGN_UP_CHANGE_DONE //I
+
+ LDRB w11,[x0] //I pu1_src_cpy[0]
+ SUB x4,x12,x7 //I ht_tmp - row
+
+    LDRB    w10,[x0,#1]                 //I pu1_src_cpy[1]
+ LSL x4,x4,#1 //I (ht_tmp - row) * 2
+
+ ADD x9,x14,x4 //I pu1_src_left_cpy[(ht_tmp - row) * 2]
+ sub x13,x9,#2
+ LDRB w5,[x13] //I load the value
+
+ SUB x8,x11,x5 //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+ sub x13,x9,#1
+ LDRB w5,[x13] //I load the value
+
+ CMP x8,#0 //I
+ SUB x4,x10,x5 //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+
+ movn x20,#0
+ csel x8, x20, x8,LT //I
+ MOV x20,#1
+ csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+ CMP x4,#0 //I
+    mov    v14.8b[0], w8                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ movn x20,#0
+ csel x4, x20, x4,LT //I
+
+ MOV x20,#1
+    csel    x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ mov v14.8b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE:
+ LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ TBL v18.16b, {v30.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down)
+
+ //TBL v19.8b, {v30.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v22.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v23.d[0],v22.d[1]
+
+ Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ UZP1 v31.8b, v22.8b, v23.8b
+ UZP2 v23.8b, v22.8b, v23.8b //I
+ mov v22.8b,v31.8b
+
+ TBL v22.8b, {v6.16b},v22.8b //I
+ TBL v23.8b, {v7.16b},v23.8b //I
+ ZIP1 v31.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b //I
+ mov v22.8b,v31.8b
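+
+// Chroma is interleaved (U0 V0 U1 V1 ...) but U and V carry separate SAO
+// offset tables. The UZP1/UZP2 pair above de-interleaves the 16 edge_idx
+// bytes into a U half and a V half, each half indexes its own offset table
+// via TBL, and ZIP1/ZIP2 re-interleaves the offsets. In C, roughly:
+//
+//     for (i = 0; i < 8; i++) {
+//         offset[2 * i]     = pi1_sao_offset_u[edge_idx[2 * i]];
+//         offset[2 * i + 1] = pi1_sao_offset_v[edge_idx[2 * i + 1]];
+//     }
+//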
+
+ mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row
+ SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v18.8h, v18.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ UMIN v18.8h, v18.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+ SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1
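+
+// The SADDW/SMAX/UMIN triple above is the vector form of the per-sample
+// offset-and-clip, done on widened 16-bit lanes. Per sample this is just:
+//
+//     WORD32 tmp = pu1_src_cpy[i] + offset[i];       /* offset is signed     */
+//     pu1_src_cpy[i] = (UWORD8)CLIP3(tmp, 0, 255);   /* (1 << bit_depth) - 1 */
+//
+// with the narrowing back to bytes done by the xtn/xtn2 pair at the top of
+// PU1_SRC_LOOP.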
+
+
+PU1_SRC_LOOP:
+ ADD x8,x0,x1,LSL #1 //II *pu1_src + src_strd
+ xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0])
+ ADD x11,x8,x1 //III *pu1_src + src_strd
+
+ LD1 {v16.16b},[x8] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x8] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x8, x8,#8
+ LD1 {v30.16b},[x11] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v31.8b},[x11] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x11, x11,#8
+
+ ADD x8,x8,#16 //II
+ xtn2 v20.16b, v18.8h //I vmovn_s16(pi2_tmp_cur_row.val[1])
+ LDRH w5,[x8] //II pu1_src_cpy[src_strd + 16]
+
+ ADD x11,x11,#16 //III
+ mov v28.4h[0], w5 //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ LDRH w4,[x11] //III pu1_src_cpy[src_strd + 16]
+
+ LDRB w8,[x0,x1] //II pu1_src_cpy[0]
+ EXT v28.16b, v16.16b , v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+ SUB x5,x12,x7 //II ht_tmp - row
+
+ LSL x5,x5,#1 //II (ht_tmp - row) * 2
+ mov v18.4h[0], w4 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ ADD x9,x14,x5 //II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ sub x13,x9,#2
+ LDRB w11,[x13] //II load the value
+ ST1 { v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ SUB x8,x8,x11 //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+ CMP x8,#0 //II
+ EXT v18.16b, v30.16b , v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+    LDRB    w11,[x0,#1]                 //II pu1_src_cpy[1]
+
+ movn x20,#0
+ csel x8, x20, x8,LT //II
+ cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ MOV x20,#1
+ csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+
+ sub x13,x9,#1
+ LDRB w5,[x13] //II load the value
+    mov    v14.8b[0], w8                //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
+
+ SUB x11,x11,x5 //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ CMP x11,#0 //II
+
+ movn x20,#0
+ csel x11, x20, x11,LT //II
+ SUB v24.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x20,#1
+    csel    x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+ LDRB w4,[x0,x1] //III pu1_src_cpy[0]
+ LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SUB x5,x12,x7 //III ht_tmp - row
+
+ ADD x10,x0,x1
+ mov v14.8b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ LSL x5,x5,#1 //III (ht_tmp - row) * 2
+
+ ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
+ ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+    LDRB    w10,[x10,#1]                //III pu1_src_cpy[1]
+
+ sub x13,x9,#2
+ LDRB w5,[x13] //III load the value
+ ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+ SUB x4,x4,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+ mov v22.d[1],v22.d[0]
+ CMP x4,#0 //III
+ sub x13,x9,#1
+ LDRB w9,[x13] //III load the value
+ TBL v26.16b, {v22.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+
+ movn x20,#0
+ csel x4, x20, x4,LT //III
+ SUB x10,x10,x9 //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ //TBL v27.8b, {v22.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ MOV x20,#1
+ csel x4, x20, x4,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+ AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ CMP x10,#0 //III
+
+ mov v27.d[0],v26.d[1]
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b //II
+ mov v26.8b,v31.8b
+    mov    v14.8b[0], w4                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+ movn x20,#0
+ csel x10, x20, x10,LT //III
+ MOV x20,#1
+    csel    x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ TBL v24.8b, {v6.16b},v26.8b //II
+ cmhi v20.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ cmhi v22.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ TBL v25.8b, {v7.16b},v27.8b //II
+ SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ mov v14.8b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b //II
+ mov v24.8b,v31.8b
+
+ Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+
+ LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+ SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v22.16b //III sign_up = vnegq_s8(sign_down)
+
+ //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ mov v19.d[0],v18.d[1]
+ UZP1 v31.8b, v18.8b, v19.8b
+ UZP2 v19.8b, v18.8b, v19.8b //III
+ mov v18.8b,v31.8b
+ TBL v22.8b, {v6.16b},v18.8b //III
+ SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row
+ TBL v23.8b, {v7.16b},v19.8b //III
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ ZIP1 v31.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b //III
+ mov v22.8b,v31.8b
+ xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+ SADDW v20.8h, v20.8h , v22.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+ SADDW v18.8h, v18.8h , v23.8b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
+ SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ CMP x7,#1
+
+ ST1 { v28.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ UMIN v18.8h, v18.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ BGT PU1_SRC_LOOP //If not equal jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+ ADD x8,x0,x1,LSL #1 //*pu1_src + src_strd
+ xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ LDRB w11,[x0,x1] //pu1_src_cpy[0]
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x8, x8,#8
+ SUB x4,x12,x7 //ht_tmp - row
+
+ ADD x8,x8,#16
+ xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
+ LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
+
+ LSL x4,x4,#1 //(ht_tmp - row) * 2
+ mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ ADD x9,x14,x4 //pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ sub x13,x9,#2
+ LDRB w5,[x13] //load the value
+ EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+ SUB x8,x11,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+
+ CMP x8,#0
+ ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ movn x20,#0
+ csel x8, x20, x8,LT
+
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+ LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+
+    LDRB    w11,[x0,#1]                 //pu1_src_cpy[1]
+    mov    v14.8b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+ sub x13,x9,#1
+ LDRB w5,[x13] //load the value
+
+ SUB x4,x11,x5 //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ CMP x4,#0
+
+ movn x20,#0
+ csel x4, x20, x4,LT
+ cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ MOV x20,#1
+    csel    x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+
+ mov v14.8b[1], w4 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+ SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ mov v30.d[1],v30.d[0]
+ TBL v26.16b, {v30.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ //TBL v27.8b, {v30.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v27.d[0],v26.d[1]
+
+ Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b
+ mov v26.8b,v31.8b
+
+ TBL v24.8b, {v6.16b},v26.8b
+ TBL v25.8b, {v7.16b},v27.8b
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v31.8b
+
+ SADDW v20.8h, v20.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW v18.8h, v18.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v18.8h, v18.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+ mov w8, w25 //Loads ht
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+ mov x11, x17 //Loads *pu1_src_left
+ xtn2 v20.16b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+
+SRC_LEFT_LOOP:
+ LDR w7, [x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#2
+ STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+ ST1 { v20.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ CMP x6,#8 //Check whether residue remains
+
+ BLT RE_ASSINING_LOOP //Jump to re-assigning loop
+ mov w7, w24 //Loads wd
+ mov x0, x27 //Loads *pu1_src
+ SUB x7,x7,x6
+ ADD x0,x0,x7
+    BGT    WIDTH_LOOP_16                //If more width remains, jump to WIDTH_LOOP_16
+ BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
+
+
+WD_16_HT_4_LOOP:
+ mov x5, x21 //Loads pu1_avail
+ mov w7, w24 //Loads wd
+ CMP x6,x7 //col == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    mov    v8.8b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ CMP x6,#16 //if(col == 16)
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov    v8.16b[14], w8               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB w8,[x5,#2] //pu1_avail[2]
+ CMP x8,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ csel x8, x3, x8,NE //pu1_src_top_cpy
+ SUB x8,x8,#2 //pu1_src - src_strd - 2
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
+ //SUB x8, x8,#8
+
+ ADD x3,x3,#16
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+ mov w4, w25 //Loads ht
+ mov x7, x24 //Loads wd
+ SUB x7,x7,x6 //(wd - col)
+    ADD    x7,x7,#14                    //14 + (wd - col)
+ mov x8, x26 //Loads *pu1_src
+    ADD    x7,x8,x7                     //pu1_src[0 * src_strd + 14 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRH w8,[x7] //load the value and increment by src_strd
+ STRH w8,[x5],#2 //store it in the stack pointer
+ ADD x7,x7,x1
+
+ SUBS x4,x4,#1 //decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //SUB x0, x0,#8
+
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ movi v18.16b, #0
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ movi v18.16b, #0
+ ADD x8,x0,x1 //*pu1_src + src_strd
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x8, x8,#8
+
+ ADD x8,x8,#16
+ LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
+ mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+ CMP x7,x12
+ BLT SIGN_UP_CHANGE_WD_16_HT_4
+ mov x5, x21 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB w8,[x0] //pu1_src_cpy[0]
+ SUB x5,x12,x7 //ht_tmp - row
+ LSL x5,x5,#1 //(ht_tmp - row) * 2
+ ADD x9,x14,x5 //pu1_src_left_cpy[(ht_tmp - row) * 2]
+ sub x13,x9,#2
+ LDRB w5,[x13] //load the value
+ SUB x8,x8,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+ CMP x8,#0
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+    mov    v14.8b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB    w8,[x0,#1]                  //pu1_src_cpy[1]
+ sub x13,x9,#1
+ LDRB w5,[x13] //load the value
+ SUB x8,x8,x5 //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ CMP x8,#0
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ //TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v27.d[0],v26.d[1]
+
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b
+ mov v26.8b,v31.8b
+ TBL v24.8b, {v6.16b},v26.8b
+ TBL v25.8b, {v7.16b},v27.8b
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v31.8b
+
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ Uxtl2 v26.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v26.8h, v26.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn2 v28.16b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ mov w8, w25 //Loads ht
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+ mov x11, x17 //Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDR w7, [x5],#4 //au1_src_left_tmp[row]
+ STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+
+ SUBS x8,x8,#2
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP //Jump to re-assigning loop
+ BGT WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+ mov w7, w24 //Loads wd
+ mov x5, x21 //Loads pu1_avail
+ CMP x6,x7 //wd_residue == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+    mov    v8.8b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov    v8.8b[6], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+    mov    v8.8b[7], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
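+
+// Only the low 8 mask bytes matter in this residue path (four interleaved
+// (U,V) columns); lanes 6 and 7 guard the right-most pair here, playing
+// the role lanes 14 and 15 play in WIDTH_LOOP_16.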
+
+ LDRB w8,[x5,#2] //pu1_avail[2]
+ CMP x8,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+    csel    x8, x3, x8,NE               //pu1_src_top_cpy
+ SUB x8,x8,#2 //pu1_src - src_strd - 2
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+ //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
+ //SUB x8, x8,#8
+
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+ mov w4, w25 //Loads ht
+ mov w7, w24 //Loads wd
+ mov x8, x26 //Loads *pu1_src
+ SUB x7,x7,#2 //(wd - 2)
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRH w8,[x7] //load the value and increment by src_strd
+ STRH w8,[x5],#2 //store it in the stack pointer
+ ADD x7,x7,x1
+ SUBS x4,x4,#1 //decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //SUB x0, x0,#8
+
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+ movi v18.16b, #0
+ ADD x8,x0,x1 //*pu1_src + src_strd
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x8, x8,#8
+
+ ADD x8,x8,#16
+ LDRH w5,[x8] //pu1_src_cpy[src_strd + 16]
+ mov v18.4h[0], w5 //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
+ EXT v18.16b, v16.16b , v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
+
+ CMP x7,x12
+ BLT SIGN_UP_CHANGE_RESIDUE
+ mov x5, x21 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB w8,[x0] //pu1_src_cpy[0]
+ SUB x5,x12,x7 //ht_tmp - row
+ LSL x5,x5,#1 //(ht_tmp - row) * 2
+    ADD    x9,x14,x5                    //pu1_src_left_cpy[(ht_tmp - row) * 2]
+ sub x13,x9,#2
+ LDRB w5,[x13] //load the value
+ SUB x8,x8,x5 //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
+ CMP x8,#0
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
+ mov v14.8b[0], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
+
+    LDRB    w8,[x0,#1]                  //pu1_src_cpy[1]
+ sub x13,x9,#1
+ LDRB w5,[x13] //load the value
+    SUB    x8,x8,x5                     //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
+ CMP x8,#0
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
+ mov v14.8b[1], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ mov v22.d[1],v22.d[0]
+ TBL v26.16b, {v22.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ //TBL v27.8b, {v22.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v27.d[0],v26.d[1]
+
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v14.16b, v14.16b , v14.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
+
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b
+ mov v26.8b,v31.8b
+ TBL v24.8b, {v6.16b},v26.8b
+ TBL v25.8b, {v7.16b},v27.8b
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v31.8b
+
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+    ST1    {v28.8b},[x0],x1             //vst1_u8(pu1_src_cpy, pu1_cur_row)
+
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
+    BNE    PU1_SRC_LOOP_RESIDUE         //If not equal jump to PU1_SRC_LOOP_RESIDUE
+
+ mov w8, w25 //Loads ht
+ mov x11, x17 //Loads *pu1_src_left
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDR w7, [x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#2
+ STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ mov w8, w25 //Loads ht
+
+ mov x0, x26 //Loads *pu1_src
+ SUB x8,x8,#1 //ht - 1
+
+ mov w7, w24 //Loads wd
+
+    LDRH    w9,[sp,#6]                  //load the (u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v) pair
+    madd    x6, x8, x1, x7              //wd + (ht - 1) * src_strd
+
+    STRH    w9,[x0]                     //pu1_src_org[0], pu1_src_org[1] = u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v
+    ADD    x6,x0,x6                     //pu1_src[wd + (ht - 1) * src_strd]
+
+    LDRH    w9,[sp,#8]                  //load the (u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v) pair
+    ADD    x12,sp,#10                   //*au1_src_top_tmp
+    sub    x13,x6,#2                    //pu1_src[wd - 2 + (ht - 1) * src_strd]
+    STRH    w9,[x13]                    //pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v
+
+ mov x4, x15 //Loads pu1_src_top_left
+ LDRH w10,[sp] //load u1_src_top_left_tmp from stack pointer
+ STRH w10,[x4] //*pu1_src_top_left = u1_src_top_left_tmp
+ mov x3, x22 //Loads pu1_src_top
+
+SRC_TOP_LOOP:
+    LD1    {v0.8b},[x12],#8             //load au1_src_top_tmp[col]
+ SUBS x7,x7,#8 //Decrement the width
+ ST1 {v0.8b},[x3],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
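+
+// Epilogue: the two corner (U,V) pairs saved before filtering were written
+// back above, *pu1_src_top_left is refreshed from the snapshot taken on
+// entry, and SRC_TOP_LOOP publishes the pre-filter bottom row as the top
+// neighbours of the CTB row below. Effectively:
+//
+//     for (col = 0; col < wd; col += 8)
+//         memcpy(pu1_src_top + col, au1_src_top_tmp + col, 8);
+//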
+
+END_LOOPS:
+ ADD sp,sp,#0xE0
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x27, x28,[sp],#16
+ ldp x25, x26,[sp],#16
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class3.s b/common/arm64/ihevc_sao_edge_offset_class3.s
new file mode 100644
index 0000000..6c47abe
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@@ -0,0 +1,887 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class3.s
+//*
+//* @brief
+//*  Contains function definitions for SAO edge offset of class 3 (45-degree
+//*  diagonal). Functions are coded using NEON intrinsics and can be compiled
+//*  using ARM RVCT
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset,
+// WORD32 wd,
+// WORD32 ht)
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => *pu1_avail
+//x6 => *pi1_sao_offset
+//x7 => wd
+//x8 => ht
+
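+// Class 3 filters along the 45-degree diagonal: each sample is compared
+// with its top-right and bottom-left neighbours. Per-sample reference
+// logic (a sketch; sign3() returns -1/0/+1 like the CMP/movn/csel triples
+// below):
+//
+//     edge_idx = 2 + sign3(pu1_src[pos] - pu1_src[pos - src_strd + 1])
+//                  + sign3(pu1_src[pos] - pu1_src[pos + src_strd - 1]);
+//     edge_idx = gi1_table_edge_idx[edge_idx];
+//     if (0 != edge_idx)
+//         pu1_src[pos] = CLIP3(pu1_src[pos] + pi1_sao_offset[edge_idx],
+//                              0, (1 << bit_depth) - 1);
+//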
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_av8
+
+ihevc_sao_edge_offset_class3_av8:
+
+
+ // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+
+    MOV    x19,x0                       //Store pu1_src in x19
+    MOV    x21,x6                       //Store pu1_src_bot_left in x21 (x21 is reloaded with pu1_src_left at PU1_AVAIL_3_LOOP)
+    MOV    x22,x3                       //Store pu1_src_top in x22
+    MOV    x23,x7                       //Store pu1_avail in x23
+    MOV    x24,x4                       //Store pu1_src_top_left in x24
+    MOV    x20,x5                       //Store pu1_src_top_right in x20
+    MOV    x13,x6                       //Store pu1_src_bot_left in x13
+
+ MOV x5,x7 //Loads pu1_avail
+
+ LDR x6,[sp,#112] //Loads pi1_sao_offset
+ LDR w7,[sp,#120] //Loads wd
+ LDR w8,[sp,#128] //Loads ht
+
+ MOV x16,x7 // wd
+ MOV x17,x8 // ht
+
+ SUB x9,x7,#1 //wd - 1
+
+ LDRB w10,[x3,x9] //pu1_src_top[wd - 1]
+
+ MOV x9,x7 //Move width to x9 for loop count
+
+ SUB sp,sp,#0xA0 //Decrement the stack pointer to store some temp arr values
+
+ STRB w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 1]
+ SUB x10,x8,#1 //ht-1
+ madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col]
+    ADD    x12,sp,#0x02                 //*au1_src_top_tmp (temp array)
+
+AU1_SRC_TOP_LOOP:
+ LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col]
+ SUBS x9,x9,#8 //Decrement the loop count by 8
+ ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP:
+ LDRB w9,[x5,#5] //pu1_avail[5]
+ CMP x9,#0
+ SUB x10,x7,#1 //[wd - 1]
+    LDRB    w9,[x0,x10]                 //u1_pos_wd_0_tmp = pu1_src[wd - 1]
+ BEQ PU1_AVAIL_6_LOOP
+
+ MOV x11,x20 //Load pu1_src_top_right from sp
+ SUB x10,x10,#1 //[wd - 1 - 1]
+
+ LDRB w11,[x11] //pu1_src_top_right[0]
+ SUB x12,x9,x11 //pu1_src[wd - 1] - pu1_src_top_right[0]
+
+ ADD x11,x0,x1 //pu1_src + src_strd
+
+ LDRB w14,[x11,x10] //pu1_src[wd - 1 - 1 + src_strd]
+ CMP x12,#0
+ movn x20,#0
+ csel x12, x20, x12,LT
+ SUB x11,x9,x14 //pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
+
+ MOV x20,#1
+ csel x12, x20, x12,GT //SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ ADD x11,x12,x11 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
+ ADD x11,x11,#2 //edge_idx
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0 //0 != edge_idx
+ BEQ PU1_AVAIL_6_LOOP
+ LDRSB x10,[x6,x12] //pi1_sao_offset[edge_idx]
+    ADD    x9,x9,x10                    //pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
+    mov    x20,#255
+    cmp    x9,x20
+    csel    x9, x20, x9, ge             //u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP:
+ LDRB w10,[x5,#6] //pu1_avail[6]
+ SUB x11,x8,#1 //ht - 1
+
+ CMP x10,#0
+ madd x12, x11, x1, x0 //pu1_src[(ht - 1) * src_strd]
+
+    LDRB    w10,[x12]                   //u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd]
+ BEQ PU1_AVAIL_3_LOOP
+
+ MOV x14,x13 //Load pu1_src_bot_left from sp
+    SUB    x11,x12,x1                   //pu1_src[(ht - 1) * src_strd - src_strd]
+
+ LDRB w14,[x14] //Load pu1_src_bot_left[0]
+ ADD x11,x11,#1 //pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+
+ LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+ SUB x14,x10,x14 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+
+ SUB x11,x10,x11 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
+
+ CMP x14,#0
+ movn x20,#0
+ csel x14, x20, x14,LT
+ MOV x20,#1
+ csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+ ADD x11,x11,x14 //Add 2 sign value
+
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+ ADD x11,x11,#2 //edge_idx
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0
+ BEQ PU1_AVAIL_3_LOOP
+ LDRSB x11,[x6,x12] //pi1_sao_offset[edge_idx]
+ ADD x10,x10,x11 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ mov x20,#255
+ cmp x10,x20
+    csel    x10, x20, x10, ge           //u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+    MOV    x21,x2                       //Store pu1_src_left in x21
+ MOV x12,x8 //Move ht
+
+ MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy
+ movi v0.16b, #2 //const_2 = vdupq_n_s8(2)
+ LDRB w11,[x5,#3] //pu1_avail[3]
+
+ CMP x11,#0
+ movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ SUB x20,x12,#1 //ht_tmp--
+ csel x12, x20, x12,EQ
+
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ CMP x5,#0
+
+ ADD x20,x0,x1 //pu1_src += src_strd
+ csel x0, x20, x0,EQ
+ LD1 {v7.8b},[x6] //offset_tbl = vld1_s8(pi1_sao_offset)
+ SUB x20,x12,#1 //ht_tmp--
+ csel x12, x20, x12,EQ
+
+ ADRP x6, :got:gi1_table_edge_idx //table pointer
+ LDR x6, [x6, #:got_lo12:gi1_table_edge_idx]
+
+ movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ ADD x20,x14,#1 //pu1_src_left_cpy += 1
+ csel x14, x20, x14,EQ
+
+    MOV    x15,x0                       //Store pu1_src in x15
+ LD1 {v6.8b},[x6] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOV x6,x7 //move wd to x6 loop_count
+
+ CMP x7,#16 //Compare wd with 16
+    BLT    WIDTH_RESIDUE                //If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
+ CMP x8,#4 //Compare ht with 4
+    BLE    WD_16_HT_4_LOOP              //If ht <= 4, jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ MOV x7,x16 //Loads wd
+
+ MOV x5,x23 //Loads pu1_avail
+ CMP x6,x7 //col == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP x6,#16 //if(col == 16)
+ BNE SKIP_AU1_MASK_VAL
+ LDRB w8,[x5,#1] //pu1_avail[1]
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+ LDRB w8,[x5,#2] //pu1_avail[2]
+ CMP x8,#0
+
+ MOV x4,x17 //Loads ht
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+
+ csel x8, x3, x8,NE
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+
+ MOV x7,x16 //Loads wd
+ ADD x8,x8,#1 //pu1_src - src_strd + 1
+
+ SUB x7,x7,x6 //(wd - col)
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+ ADD x3,x3,#16
+
+ MOV x8,x19 //Loads *pu1_src
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ ADD x7,x7,#15 //15 + (wd - col)
+
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ SUB x5,x5,#1
+
+AU1_SRC_LEFT_LOOP:
+ LDRB w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+ SUBS x4,x4,#1 //decrement the loop count
+ STRB w8,[x5,#1]! //store it in the stack pointer
+ BNE AU1_SRC_LEFT_LOOP
+
+ movi v18.16b, #0
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+
+ ADD x8,x0,x1 //I *pu1_src + src_strd
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+ SUB x5,x12,x7 //I ht_tmp - row
+ LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ ADD x8,x14,x5 //I pu1_src_left_cpy[ht_tmp - row]
+
+ ADD x8,x8,#1 //I pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB w8,[x8]
+
+ MOV x5,x23 //I Loads pu1_avail
+ mov v18.16b[15], w8 //I vsetq_lane_u8
+ LDRB w5,[x5,#2] //I pu1_avail[2]
+
+ EXT v18.16b, v18.16b , v16.16b,#15 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+ CMP x5,#0 //I
+ BNE SIGN_UP_CHANGE_DONE //I
+
+SIGN_UP_CHANGE:
+ LDRB w8,[x0,#15] //I pu1_src_cpy[15]
+ SUB x5,x0,x1 //I pu1_src_cpy[16 - src_strd]
+
+ LDRB w5,[x5,#16] //I load the value
+ SUB x8,x8,x5 //I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ CMP x8,#0 //I
+ movn x20,#0
+ csel x8, x20, x8,LT //I
+ MOV x20,#1
+ csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ mov v14.16b[15], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
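+
+// sign_up is carried from row to row: after each row it is negated (the
+// same comparison seen from the row below flips sign) and rotated by one
+// lane (EXT #1) to follow the diagonal, so only lane 15, whose top-right
+// neighbour comes from the next 16-byte column, has to be recomputed from
+// scalar loads, as done above.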
+
+SIGN_UP_CHANGE_DONE:
+ cmhi v10.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v18.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v10.16b, v18.16b , v10.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v10.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v18.16b, {v6.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v10.16b //I sign_up = vnegq_s8(sign_down)
+
+ EXT v14.16b, v14.16b , v14.16b,#1 //I sign_up = vextq_s8(sign_up, sign_up, 1)
+// TBL v19.8b, {v6.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ TBL v10.16b, {v7.16b},v18.16b //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ Uxtl2 v22.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v20.8h, v20.8h , v10.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+// TBL v11.8b, {v7.16b},v19.8b //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ mov v12.16b, v16.16b
+ SADDW2 v22.8h, v22.8h , v10.16b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v22.8h, v22.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v22.8h, v22.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1
+
+PU1_SRC_LOOP:
+ ADD x8,x0,x1,LSL #1 //II *pu1_src + src_strd
+ xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUB x5,x12,x7 //II ht_tmp - row
+
+ ADD x4,x0,x1 //II pu1_src_cpy[16 - src_strd]
+ xtn2 v20.16b, v22.8h //I vmovn_s16(pi2_tmp_cur_row.val[1])
+ ADD x2,x8,x1 //III *pu1_src + src_strd
+
+ LDRB w11,[x4,#15] //II pu1_src_cpy[15]
+ LD1 {v16.16b},[x8] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
+
+ ADD x8,x14,x5 //II pu1_src_left_cpy[ht_tmp - row]
+ LD1 {v30.16b},[x2] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ LDRB w8,[x8,#1]
+
+ LDRB w4,[x0,#16] //II load the value
+ mov v18.16b[15], w8 //II vsetq_lane_u8
+ SUB x11,x11,x4 //II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+ CMP x11,#0 //II
+ ST1 { v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ SUB x5,x12,x7 //III ht_tmp - row
+
+ movn x20,#0
+ csel x11, x20, x11,LT //II
+ EXT v18.16b, v18.16b , v16.16b,#15 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+ MOV x20,#1
+ csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+ ADD x8,x14,x5 //III pu1_src_left_cpy[ht_tmp - row]
+    mov    v14.16b[15], w11             //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ CMP x7,#1 //III
+
+ BNE NEXT_ROW_ELSE_2 //III
+ MOV x5,x23 //III Loads pu1_avail
+ LDRB w5,[x5,#3] //III pu1_avail[3]
+ CMP x5,#0 //III
+ SUB x20,x2,#2 //III pu1_src_cpy[src_strd - 1]
+ csel x8, x20, x8,NE
+
+NEXT_ROW_ELSE_2:
+ LDRB w8,[x8,#1] //III
+ cmhi v24.16b, v12.16b , v18.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD x5,x0,x1
+
+ LDRB w2,[x5,#15] //III pu1_src_cpy[15]
+ cmhi v26.16b, v18.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ LDRB w5,[x0,#16] //III load the value
+
+ SUB x2,x2,x5 //III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ SUB v24.16b, v26.16b , v24.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ CMP x2,#0 //III
+
+ movn x20,#0
+ csel x2, x20, x2,LT //III
+ mov v18.16b[15], w8 //III vsetq_lane_u8
+ MOV x20,#1
+ csel x2, x20, x2,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+
+ SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
+ ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+
+ NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+ EXT v18.16b, v18.16b , v30.16b,#15 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+ ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ EXT v14.16b, v14.16b , v14.16b,#1 //II sign_up = vextq_s8(sign_up, sign_up, 1)
+ TBL v26.16b, {v6.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ cmhi v10.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ mov v14.16b[15], w2 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+// TBL v27.8b, {v6.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ cmhi v18.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ SUB v10.16b, v18.16b , v10.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ TBL v24.16b, {v7.16b},v26.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+
+ ADD v18.16b, v18.16b , v10.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+// TBL v25.8b, {v7.16b},v27.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ NEG v14.16b, v10.16b //III sign_up = vnegq_s8(sign_down)
+
+ SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ TBL v18.16b, {v6.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ EXT v14.16b, v14.16b , v14.16b,#1 //III sign_up = vextq_s8(sign_up, sign_up, 1)
+// TBL v19.8b, {v6.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ SADDW2 v26.8h, v26.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ TBL v10.16b, {v7.16b},v18.16b //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ SADDW v20.8h, v20.8h , v10.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+// TBL v11.8b, {v7.16b},v19.8b //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ Uxtl2 v22.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+ SADDW2 v22.8h, v22.8h , v10.16b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+ SMAX v22.8h, v22.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ mov v12.16b, v30.16b //II pu1_cur_row = pu1_next_row
+ UMIN v22.8h, v22.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ CMP x7,#1 //III
+ ST1 { v28.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ BGT PU1_SRC_LOOP //If not equal jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+ ADD x8,x0,x1,LSL #1 //*pu1_src + src_strd
+ xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
+ MOV x5,x23 //Loads pu1_avail
+
+ LDRB w5,[x5,#3] //pu1_avail[3]
+ xtn2 v20.16b, v22.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
+ CMP x5,#0
+
+ ADD x4,x0,x1 //pu1_src_cpy[16 - src_strd]
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ LDRB w5,[x0,#16] //load the value
+
+ BEQ NEXT_ROW_ELSE_3
+ SUB x8,x8,#1
+ LDRB w8,[x8] //pu1_src_cpy[src_strd - 1]
+ B NEXT_ROW_POINTER_ASSIGNED_3
+NEXT_ROW_ELSE_3:
+ SUB x11,x12,x7 //ht_tmp - row
+ ADD x8,x14,x11 //pu1_src_left_cpy[ht_tmp - row]
+ ADD x8,x8,#1 //pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB w8,[x8]
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+ LDRB w11,[x4,#15] //pu1_src_cpy[15]
+ mov v18.16b[15], w8 //vsetq_lane_u8
+ SUB x8,x11,x5 //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+
+ CMP x8,#0
+ EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+ movn x20,#0
+ csel x8, x20, x8,LT
+
+ ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ cmhi v24.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ cmhi v26.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+ SUB v24.16b, v26.16b , v24.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+
+ Uxtl2 v22.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+
+ SADDW v20.8h, v20.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+// TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW2 v22.8h, v22.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v22.8h, v22.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v22.8h, v22.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+INNER_LOOP_DONE:
+ xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ MOV x8,x17 //Loads ht
+
+ xtn2 v20.16b, v22.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+
+ ST1 { v20.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ MOV x2,x21 //Loads *pu1_src_left
+SRC_LEFT_LOOP:
+ LDR w7,[x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#4
+ STR w7,[x2],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+ CMP x6,#8 //Check whether residue remains
+ BLT RE_ASSINING_LOOP //Jump to re-assigning loop
+ MOV x7,x16 //Loads wd
+ MOV x0,x15 //Loads *pu1_src
+ SUB x7,x7,x6
+ ADD x0,x0,x7
+    BGT    WIDTH_LOOP_16                //If more width remains, jump to WIDTH_LOOP_16
+ BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
+
+
+
+WD_16_HT_4_LOOP:
+ MOV x5,x23 //Loads pu1_avail
+ MOV x7,x16 //Loads wd
+ CMP x6,x7 //col == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP x6,#16 //if(col == 16)
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB w8,[x5,#1] //pu1_avail[1]
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB w8,[x5,#2] //pu1_avail[2]
+ CMP x8,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ csel x8, x3, x8,NE
+ ADD x8,x8,#1 //pu1_src - src_strd + 1
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+
+ ADD x3,x3,#16
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ MOV x4,x17 //Loads ht
+ MOV x7,x16 //Loads wd
+ SUB x7,x7,x6 //(wd - col)
+ ADD x7,x7,#15 //15 + (wd - col)
+ MOV x8,x19 //Loads *pu1_src
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
+ SUB x5,x5,#1
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRB w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+ STRB w8,[x5,#1]! //store it in the stack pointer
+ SUBS x4,x4,#1 //decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ movi v18.16b, #0
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ ADD x8,x0,x1 //*pu1_src + src_strd
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ MOV x5,x23 //Loads pu1_avail
+ LDRB w5,[x5,#3] //pu1_avail[3]
+ CMP x5,#0
+ BEQ NEXT_ROW_ELSE_WD_16_HT_4
+ CMP x7,#1
+ SUB x8,x8,#1
+ LDRb w20, [x8] //pu1_src_cpy[src_strd - 1]
+ csel w8,w20,w8,EQ
+ BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+NEXT_ROW_ELSE_WD_16_HT_4:
+ SUB x5,x12,x7 //ht_tmp - row
+ ADD x8,x14,x5 //pu1_src_left_cpy[ht_tmp - row]
+ ADD x8,x8,#1 //pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB w8,[x8]
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+ mov v18.16b[15], w8 //vsetq_lane_u8
+ EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+ CMP x7,x12
+ BNE SIGN_UP_CHANGE_WD_16_HT_4
+ MOV x5,x23 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB w8,[x0,#15] //pu1_src_cpy[15]
+ ADD x5,x0,#16 //pu1_src_cpy[16]
+ SUB x5,x5,x1 //pu1_src_cpy[16 - src_strd]
+ LDRB w5,[x5] //load the value
+ SUB x8,x8,x5 //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ CMP x8,#0
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
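+    // (the CMP/movn/csel sequence above is a branchless SIGN():
+    // x8 = (x8 < 0) ? -1 : ((x8 > 0) ? 1 : 0))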
+ mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
+
+ TBL v24.16b, {v7.16b},v26.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+// TBL v25.8b, {v7.16b},v27.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
+ Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW2 v30.8h, v30.8h , v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn2 v28.16b, v30.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
+ BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ MOV x8,x17 //Loads ht
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ MOV x2,x21 //Loads *pu1_src_left
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDR w7,[x5],#4 //au1_src_left_tmp[row]
+ STR w7,[x2],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ SUBS x8,x8,#4
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP //Jump to re-assigning loop
+    BGT WD_16_HT_4_LOOP                 //If more columns remain, jump to WD_16_HT_4_LOOP
+
+
+WIDTH_RESIDUE:
+ MOV x7,x16 //Loads wd
+ MOV x5,x23 //Loads pu1_avail
+ CMP x6,x7 //wd_residue == wd
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov  v8.8b[7], w8                   //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+
+PU1_AVAIL_2_RESIDUE:
+ LDRB w8,[x5,#2] //pu1_avail[2]
+ CMP x8,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ csel x8, x3, x8,NE
+ ADD x8,x8,#1 //pu1_src - src_strd + 1
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
+
+
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+ MOV x4,x17 //Loads ht
+ MOV x7,x16 //Loads wd
+ MOV x8,x19 //Loads *pu1_src
+ SUB x7,x7,#1 //(wd - 1)
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 1)]
+ SUB x5,x5,#1
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRB w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+    STRB w8,[x5,#1]!                    //store it in au1_src_left_tmp on the stack
+ SUBS x4,x4,#1 //decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+ movi v18.16b, #0
+ ADD x8,x0,x1 //*pu1_src + src_strd
+ LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ MOV x5,x23 //Loads pu1_avail
+ LDRB w5,[x5,#3] //pu1_avail[3]
+ CMP x5,#0
+ BEQ NEXT_ROW_ELSE_RESIDUE
+ CMP x7,#1
+ SUB x8,x8,#1
+ LDRb w20, [x8] //pu1_src_cpy[src_strd - 1]
+ csel w8,w20,w8,EQ
+ BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+NEXT_ROW_ELSE_RESIDUE:
+ SUB x5,x12,x7 //ht_tmp - row
+ ADD x8,x14,x5 //pu1_src_left_cpy[ht_tmp - row]
+ ADD x8,x8,#1 //pu1_src_left_cpy[ht_tmp - row + 1]
+ LDRB w8,[x8]
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+ mov v18.16b[15], w8 //vsetq_lane_u8
+ EXT v18.16b, v18.16b , v16.16b,#15 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
+
+ CMP x7,x12
+ BNE SIGN_UP_CHANGE_RESIDUE
+ MOV x5,x23 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB w8,[x0,#15] //pu1_src_cpy[15]
+ ADD x5,x0,#16 //pu1_src_cpy[16]
+ SUB x5,x5,x1 //pu1_src_cpy[16 - src_strd]
+ LDRB w5,[x5] //load the value
+ SUB x8,x8,x5 //pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
+ CMP x8,#0
+ movn x20,#0
+ csel x8, x20, x8,LT
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
+ mov v14.16b[15], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v26.16b, {v6.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+// TBL v27.8b, {v6.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ EXT v14.16b, v14.16b , v14.16b,#1 //sign_up = vextq_s8(sign_up, sign_up, 1)
+
+ TBL v24.8b, {v7.16b},v26.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SUBS x7,x7,#1
+ BNE PU1_SRC_LOOP_RESIDUE
+
+ MOV x8,x17 //Loads ht
+ MOV x2,x21 //Loads *pu1_src_left
+ ADD x5,sp,#0x42 //*au1_src_left_tmp
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDR w7,[x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#4
+ STR w7,[x2],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ MOV x7,x16 //Loads wd
+ MOV x0,x19 //Loads *pu1_src
+
+ MOV x11,x17 //Loads ht
+ ADD x8,x0,x7 //pu1_src[wd]
+
+ MOV x4,x24 //Loads pu1_src_top_left
+ SUB x11,x11,#1 //ht - 1
+
+ SUB x8,x8,#1
+ STRB w9,[x8] //pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
+ ADD x8,x8,#1
+ madd x6, x11, x1, x0 //pu1_src_org[(ht - 1) * src_strd]
+
+    LDRB w8,[sp]                        //load u1_src_top_left_tmp from the stack
+ ADD x12,sp,#0x02
+
+    STRB w10,[x6]                       //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
+ STRB w8,[x4] //*pu1_src_top_left = u1_src_top_left_tmp
+ MOV x3,x22 //Loads pu1_src_top
+
+SRC_TOP_LOOP:
+ LD1 {v0.8b},[x12],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS x7,x7,#8 //Decrement the width
+ ST1 {v0.8b},[x3],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0xA0
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x23, x24,[sp], #16
+ ldp x21, x22,[sp], #16
+ ldp x19, x20,[sp], #16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
new file mode 100644
index 0000000..cf25102
--- /dev/null
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@@ -0,0 +1,1155 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_sao_edge_offset_class3_chroma.s
+//*
+//* @brief
+//*  Contains function definitions for sample adaptive offset (SAO) edge
+//*  offset of class 3 for chroma. Functions are coded using NEON intrinsics
+//*  and can be compiled using ARM RVCT.
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+//*/
+//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
+// WORD32 src_strd,
+// UWORD8 *pu1_src_left,
+// UWORD8 *pu1_src_top,
+// UWORD8 *pu1_src_top_left,
+// UWORD8 *pu1_src_top_right,
+// UWORD8 *pu1_src_bot_left,
+// UWORD8 *pu1_avail,
+// WORD8 *pi1_sao_offset_u,
+// WORD8 *pi1_sao_offset_v,
+// WORD32 wd,
+// WORD32 ht)
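+//
+// (Under AAPCS64 the first eight arguments are passed in x0-x7; the
+// remaining four -- pi1_sao_offset_u, pi1_sao_offset_v, wd and ht --
+// are loaded from the stack at function entry.)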
+//**************Variables Vs Registers*****************************************
+//x0 => *pu1_src
+//x1 => src_strd
+//x2 => *pu1_src_left
+//x3 => *pu1_src_top
+//x4 => *pu1_src_top_left
+//x5 => *pu1_avail
+//x6 => *pi1_sao_offset_u
+//x9 => *pi1_sao_offset_v
+//x7 => wd
+//x8 => ht
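+//
+// A minimal scalar model of the class 3 edge offset this routine
+// vectorizes (a sketch only: interior pixels of an interleaved UV
+// plane, boundary handling via pu1_avail[] and the corner special
+// cases omitted; SIGN()/CLIP3() as used in the comments below):
+//
+//  for(row = 0; row < ht; row++)
+//      for(col = 0; col < wd; col++)
+//      {
+//          WORD32 c   = pu1_src[row * src_strd + col];
+//          WORD32 tr  = pu1_src[(row - 1) * src_strd + col + 2]; // top-right
+//          WORD32 bl  = pu1_src[(row + 1) * src_strd + col - 2]; // bottom-left
+//          WORD32 idx = gi1_table_edge_idx[2 + SIGN(c - tr) + SIGN(c - bl)];
+//          WORD8 *pi1_off = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
+//          pu1_src[row * src_strd + col] = CLIP3(c + pi1_off[idx], 0, 255);
+//      }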
+
+.text
+.p2align 2
+.include "ihevc_neon_macros.s"
+.globl gi1_table_edge_idx
+.globl ihevc_sao_edge_offset_class3_chroma_av8
+
+ihevc_sao_edge_offset_class3_chroma_av8:
+
+
+ // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments
+
+
+ ldr x8,[sp,#0]
+ ldr x9,[sp,#8]
+ ldr w10,[sp,#16]
+ ldr w11,[sp,#24]
+
+ push_v_regs
+ // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+ stp x25, x26,[sp,#-16]!
+ stp x27, x28,[sp,#-16]!
+
+ mov x15,x4 // *pu1_src_top_left 0x28
+ mov x16,x5 // *pu1_src_top_right 0x2c
+ mov x17,x6 // *pu1_src_bot_left 0x30
+ mov x21,x7 // *pu1_avail 0x34
+ mov x22,x8 // *pi1_sao_offset_u 0x38
+ mov x23,x9 // *pi1_sao_offset_v 0x3c
+ mov x24,x10 // wd 0x40
+ mov x25,x11 // ht 0x44
+
+
+ mov w7, w24 //Loads wd
+ mov w8, w25 //Loads ht
+ SUB x9,x7,#2 //wd - 2
+
+ mov x4, x15 //Loads pu1_src_top_left
+ LDRH w10,[x3,x9] //pu1_src_top[wd - 2]
+
+ MOV x9,x7 //Move width to x9 for loop count
+
+ mov x5, x21 //Loads pu1_avail
+ mov x6, x22 //Loads pi1_sao_offset_u
+
+ mov x22, x3 //Store pu1_src_top in sp
+ SUB sp,sp,#0xE0 //Decrement the stack pointer to store some temp arr values
+
+ STRH w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 2]
+ SUB x10,x8,#1 //ht-1
+ madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col]
+ ADD x12,sp,#10 //temp array
+
+AU1_SRC_TOP_LOOP:
+ LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col]
+ SUBS x9,x9,#8 //Decrement the loop count by 8
+ ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
+ BNE AU1_SRC_TOP_LOOP
+
+PU1_AVAIL_5_LOOP_U:
+ LDRB w9,[x5,#5] //pu1_avail[5]
+ CMP x9,#0
+ SUB x14,x7,#2 //[wd - 2]
+ LDRB w9,[x0,x14] //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
+ SUB x11,x7,#1 //[wd - 1]
+ LDRB w10,[x0,x11] //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
+ BEQ PU1_AVAIL_6_LOOP_U
+
+ mov x11, x16 //Load pu1_src_top_right from sp
+ LDRB w11,[x11] //pu1_src_top_right[0]
+ SUB x12,x9,x11 //pu1_src[wd - 2] - pu1_src_top_right[0]
+ CMP x12,#0
+ movn x20,#0
+ csel x12, x20, x12,LT
+ MOV x20,#1
+ csel x12, x20, x12,GT //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
+ ADD x11,x0,x1 //pu1_src + src_strd
+ SUB x14,x14,#2 //[wd - 2 - 2]
+ LDRB w14,[x11,x14] //pu1_src[wd - 2 - 2 + src_strd]
+ SUB x11,x9,x14 //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+ ADD x11,x12,x11 //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0 //0 != edge_idx
+ BEQ PU1_AVAIL_5_LOOP_V
+ LDRSB x11,[x6,x12] //pi1_sao_offset_u[edge_idx]
+ ADD x9,x9,x11 //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
+ mov x20,#255
+ cmp x9,x20
+ csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
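+    // (the scalar pre-pass above applies the class 3 offset to the corner
+    // U sample pu1_src[wd - 2], comparing it against pu1_src_top_right[0]
+    // and its diagonal neighbour in the next row; the V sample
+    // pu1_src[wd - 1] is handled the same way below)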
+
+PU1_AVAIL_5_LOOP_V:
+
+ mov x11, x16 //Load pu1_src_top_right from sp
+ LDRB w11,[x11,#1] //pu1_src_top_right[1]
+ SUB x12,x10,x11 //pu1_src[wd - 1] - pu1_src_top_right[1]
+ CMP x12,#0
+ movn x20,#0
+ csel x12, x20, x12,LT
+ MOV x20,#1
+ csel x12, x20, x12,GT //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
+ ADD x11,x0,x1 //pu1_src + src_strd
+ SUB x14,x7,#3 //[wd - 1 - 2]
+ LDRB w14,[x11,x14] //pu1_src[wd - 1 - 2 + src_strd]
+ SUB x11,x10,x14 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+ ADD x11,x12,x11 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0 //0 != edge_idx
+ BEQ PU1_AVAIL_6_LOOP_U
+ mov x11, x23 //Loads pi1_sao_offset_v
+ LDRSB x11,[x11,x12] //pi1_sao_offset_v[edge_idx]
+ ADD x10,x10,x11 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
+ mov x20,#255
+ cmp x10,x20
+ csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_U:
+ STRB w9,[sp,#6]
+ STRB w10,[sp,#7]
+ mov x26, x0 //Store pu1_src in sp
+
+ LDRB w10,[x5,#6] //pu1_avail[6]
+ CMP x10,#0
+ SUB x11,x8,#1 //ht - 1
+ madd x12, x11, x1, x0 //pu1_src[(ht - 1) * src_strd]
+ LDRB w10,[x12] //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
+ LDRB w9,[x12,#1] //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
+ BEQ PU1_AVAIL_3_LOOP
+
+ SUB x11,x12,x1 //pu1_src[(ht - 1) * src_strd - src_strd]
+ ADD x11,x11,#2 //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ SUB x11,x10,x11 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])
+
+ mov x14, x17 //Load pu1_src_bot_left from sp
+ LDRB w14,[x14] //Load pu1_src_bot_left[0]
+ SUB x14,x10,x14 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
+ CMP x14,#0
+ movn x20,#0
+ csel x14, x20, x14,LT
+ MOV x20,#1
+ csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
+
+ ADD x11,x11,x14 //Add 2 sign value
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x14,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x14,#0
+ BEQ PU1_AVAIL_6_LOOP_V
+ LDRSB x11,[x6,x14] //pi1_sao_offset_u[edge_idx]
+ ADD x10,x10,x11 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ mov x20,#255
+ cmp x10,x20
+ csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_6_LOOP_V:
+ ADD x12,x12,#1 //pu1_src[(ht - 1) * src_strd + 1]
+    SUB x11,x12,x1                      //pu1_src[(ht - 1) * src_strd + 1 - src_strd]
+ ADD x11,x11,#2 //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
+ SUB x11,x9,x11 //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
+ CMP x11,#0
+ movn x20,#0
+ csel x11, x20, x11,LT
+ MOV x20,#1
+ csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
+
+ mov x14, x17 //Load pu1_src_bot_left from sp
+ LDRB w14,[x14,#1] //Load pu1_src_bot_left[1]
+ SUB x14,x9,x14 //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
+ CMP x14,#0
+ movn x20,#0
+ csel x14, x20, x14,LT
+ MOV x20,#1
+ csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
+
+ ADD x11,x11,x14 //Add 2 sign value
+ ADD x11,x11,#2 //edge_idx
+ ADRP x14, :got:gi1_table_edge_idx //table pointer
+ LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]
+
+ LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
+ CMP x12,#0
+ BEQ PU1_AVAIL_3_LOOP
+ mov x14, x23 //Loads pi1_sao_offset_v
+ LDRSB x11,[x14,x12] //pi1_sao_offset_v[edge_idx]
+ ADD x9,x9,x11 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
+ mov x20,#255
+ cmp x9,x20
+ csel x9, x20, x9, ge //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
+
+PU1_AVAIL_3_LOOP:
+ STRB w10,[sp,#8]
+ STRB w9,[sp,#9]
+ mov x27, x2 //Store pu1_src_left in sp
+
+ MOV x12,x8 //Move ht
+ MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy
+ LDRB w11,[x5,#3] //pu1_avail[3]
+ CMP x11,#0
+ BNE PU1_AVAIL_2_LOOP
+ SUB x12,x12,#1 //ht_tmp--
+
+PU1_AVAIL_2_LOOP:
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE PU1_AVAIL_2_LOOP_END
+
+ ADD x0,x0,x1 //pu1_src += src_strd
+ SUB x12,x12,#1 //ht_tmp--
+ ADD x14,x14,#2 //pu1_src_left_cpy += 2
+
+PU1_AVAIL_2_LOOP_END:
+ mov x28, x0 //Store pu1_src in sp
+ movi v0.16b, #2 //const_2 = vdupq_n_s8(2)
+ movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
+ movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
+ LD1 {v6.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
+ mov x6, x23 //Loads pi1_sao_offset_v
+ LD1 {v7.8b},[x6] //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
+ ADRP x2, :got:gi1_table_edge_idx //table pointer
+ LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]
+
+ //VLD1.8 D6,[x6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ movi v8.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
+ MOV x6,x7 //move wd to x6 loop_count
+
+ CMP x7,#16 //Compare wd with 16
+ BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
+ CMP x8,#4 //Compare ht with 4
+ BLE WD_16_HT_4_LOOP //If jump to WD_16_HT_4_LOOP
+
+WIDTH_LOOP_16:
+ mov w7, w24 //Loads wd
+ CMP x6,x7 //col == wd
+ mov x5, x21 //Loads pu1_avail
+
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ LDRB w11,[x5,#2] //pu1_avail[2]
+
+ CMP x6,#16 //if(col == 16)
+    mov  v8.8b[1], w8                   //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ BNE SKIP_AU1_MASK_VAL
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov  v8.16b[14], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL:
+ CMP x11,#0
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //SUB x0, x0,#8
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+ movi v18.16b, #0
+ csel x8, x3, x8,NE
+
+ ADD x8,x8,#2 //pu1_src - src_strd + 2
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ //SUB x8, x8,#8
+ ADD x3,x3,#16
+
+ mov w4, w25 //Loads ht
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ mov w7, w24 //Loads wd
+
+ SUB x7,x7,x6 //(wd - col)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD x7,x7,#14 //15 + (wd - col)
+
+ mov x8, x26 //Loads *pu1_src
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP:
+ LDRH w8,[x7] //load the value and increment by src_strd
+ SUBS x4,x4,#1 //decrement the loop count
+
+    STRH w8,[x5],#2                     //store it in au1_src_left_tmp on the stack
+ ADD x7,x7,x1
+ BNE AU1_SRC_LEFT_LOOP
+
+
+ MOV x7,x12 //row count, move ht_tmp to x7
+ movi v18.16b, #0 //I
+ ADD x11,x0,x1 //I *pu1_src + src_strd
+
+ SUB x5,x12,x7 //I ht_tmp - row
+ LD1 {v16.16b},[x11] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x11] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x11, x11,#8
+ ADD x8,x14,x5,LSL #1 //I pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ LDRH w5,[x8,#2] //I
+ mov v18.4h[7], w5 //I vsetq_lane_u8
+ mov x11, x21 //I Loads pu1_avail
+
+ LDRB w11,[x11,#2] //I pu1_avail[2]
+ EXT v18.16b, v18.16b , v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ CMP x11,#0 //I
+ BNE SIGN_UP_CHANGE_DONE //I
+
+ LDRB w8,[x0,#14] //I pu1_src_cpy[14]
+ SUB x5,x0,x1 //I
+
+ LDRB w11,[x5,#16] //I load the value pu1_src_cpy[16 - src_strd]
+
+ LDRB w9,[x0,#15] //I pu1_src_cpy[15]
+ SUB x8,x8,x11 //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ LDRB w10,[x5,#17] //I load the value pu1_src_cpy[17 - src_strd]
+ CMP x8,#0 //I
+
+ movn x20,#0
+ csel x8, x20, x8,LT //I
+ SUB x9,x9,x10 //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MOV x20,#1
+ csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+ CMP x9,#0 //I
+
+ movn x20,#0
+ csel x9, x20, x9,LT //I
+    mov  v14.16b[14], w8                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ MOV x20,#1
+    csel x9, x20, x9,GT                 //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+    mov  v14.16b[15], w9                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE:
+ LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ cmhi v20.16b, v12.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ cmhi v22.16b, v18.16b , v12.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v18.16b, v0.16b , v14.16b //I edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v18.16b, {v28.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v22.16b //I sign_up = vnegq_s8(sign_down)
+
+ //TBL v19.8b, {v28.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ Uxtl v20.8h, v12.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v8.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v19.d[0],v18.d[1]
+
+ UZP1 v31.8b, v18.8b, v19.8b
+ UZP2 v19.8b, v18.8b, v19.8b //I
+ mov v18.8b,v31.8b
+ TBL v22.8b, {v6.16b},v18.8b //I
+ TBL v23.8b, {v7.16b},v19.8b //I
+ ZIP1 v31.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b //I
+ mov v22.8b,v31.8b
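+    // (U and V bytes are interleaved: UZP1/UZP2 split the edge indices
+    // per plane, the two TBLs apply the separate U (v6) and V (v7)
+    // offset tables, and ZIP1/ZIP2 re-interleave the offsets)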
+
+ Uxtl2 v18.8h, v12.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ mov v12.16b, v16.16b //I pu1_cur_row = pu1_next_row
+ SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1
+ SMAX v18.8h, v18.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ UMIN v18.8h, v18.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+PU1_SRC_LOOP:
+ ADD x11,x0,x1,LSL #1 //II *pu1_src + src_strd
+ xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUB x5,x12,x7 //II ht_tmp - row
+
+ ADD x4,x0,x1 //III *pu1_src + src_strd
+ xtn2 v20.16b, v18.8h //I vmovn_s16(pi2_tmp_cur_row.val[1])
+ ADD x8,x14,x5,LSL #1 //II pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ LDRH w9,[x8,#2]
+ LD1 {v16.16b},[x11] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x11] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x11, x11,#8
+ LDRB w10,[x4,#14] //II pu1_src_cpy[14]
+
+ LDRB w8,[x4,#15] //II pu1_src_cpy[15]
+ mov v28.4h[7], w9 //II vsetq_lane_u8
+ ADD x4,x11,x1 //III *pu1_src + src_strd
+
+ LDRB w5,[x0,#17] //II load the value pu1_src_cpy[17 - src_strd]
+ LD1 {v30.16b},[x4] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v31.8b},[x4] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x4, x4,#8
+ LDRB w11,[x0,#16] //II load the value pu1_src_cpy[16 - src_strd]
+
+ SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
+ ST1 { v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ SUB x10,x10,x11 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ CMP x10,#0 //II
+ EXT v28.16b, v28.16b , v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ SUB x8,x8,x5 //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ movn x20,#0
+ csel x10, x20, x10,LT //II
+ LD1 {v21.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOV x20,#1
+ csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP x8,#0 //II
+    mov  v14.8b[14], w10                //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ movn x20,#0
+ csel x8, x20, x8,LT //II
+
+ MOV x20,#1
+    csel x8, x20, x8,GT                 //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    SUB x10,x12,x7                      //III ht_tmp - row
+    mov  v14.8b[15], w8                 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+ ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ CMP x7,#1 //III
+ cmhi v22.16b, v12.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+ BNE NEXT_ROW_POINTER_ASSIGNED_2 //III
+
+ mov x5, x21 //III Loads pu1_avail
+ LDRB w5,[x5,#3] //III pu1_avail[3]
+ CMP x5,#0 //III
+ SUB x20,x4,#4 //III pu1_src[src_strd - 2]
+ csel x11, x20, x11,NE
+
+NEXT_ROW_POINTER_ASSIGNED_2:
+ LDRH w5,[x11,#2] //III
+ cmhi v24.16b, v28.16b , v12.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ ADD x11,x0,x1 //III
+
+ LDRB w9,[x11,#14] //III pu1_src_cpy[14]
+ mov v18.4h[7], w5 //III vsetq_lane_u8
+ LDRB w8,[x11,#15] //III pu1_src_cpy[15]
+
+ LDRB w11,[x0,#16] //III load the value pu1_src_cpy[16 - src_strd]
+ SUB v24.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ LDRB w10,[x0,#17] //III load the value pu1_src_cpy[17 - src_strd]
+
+ SUB x9,x9,x11 //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+ EXT v18.16b, v18.16b , v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ SUB x10,x8,x10 //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ CMP x9,#0 //III
+ ADD v26.16b, v0.16b , v14.16b //II edge_idx = vaddq_s8(const_2, sign_up)
+ movn x20,#0
+ csel x9, x20, x9,LT //III
+
+ MOV x20,#1
+ csel x9, x20, x9,GT //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+ ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
+ CMP x10,#0 //III
+
+ NEG v14.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
+ TBL v26.16b, {v21.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ movn x20,#0
+ csel x10, x20, x10,LT //III
+ MOV x20,#1
+    csel x10, x20, x10,GT               //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+
+ EXT v14.16b, v14.16b , v14.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
+ //TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+    mov  v14.16b[14], w9                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ AND v26.16b, v26.16b , v8.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v27.d[0],v26.d[1]
+
+    mov  v14.16b[15], w10               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b //II
+ mov v26.8b,v31.8b
+
+ cmhi v20.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ TBL v24.8b, {v6.16b},v26.8b //II
+ SUB v22.16b, v20.16b , v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v18.16b, v0.16b , v14.16b //III edge_idx = vaddq_s8(const_2, sign_up)
+ TBL v25.8b, {v7.16b},v27.8b //II
+ ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b //II
+ mov v24.8b,v31.8b
+
+ Uxtl v28.8h, v12.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ NEG v14.16b, v22.16b //III sign_up = vnegq_s8(sign_down)
+
+ SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ Uxtl2 v26.8h, v12.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ AND v18.16b, v18.16b , v8.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v19.d[0],v18.d[1]
+
+ Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ UZP1 v31.8b, v18.8b, v19.8b
+ UZP2 v19.8b, v18.8b, v19.8b //III
+ mov v18.8b,v31.8b
+
+ SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ TBL v22.8b, {v6.16b},v18.8b //III
+ UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ TBL v23.8b, {v7.16b},v19.8b //III
+ SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ ZIP1 v31.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b //III
+ mov v22.8b,v31.8b
+
+ xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
+ SADDW v20.8h, v20.8h , v22.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ mov v12.16b, v30.16b //III pu1_cur_row = pu1_next_row
+ UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
+ SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ CMP x7,#1 //III
+
+ xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
+ UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW v18.8h, v18.8h , v23.8b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+
+ ST1 { v28.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ UMIN v18.8h, v18.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+    BGT PU1_SRC_LOOP                    //If more than one row remains, jump to PU1_SRC_LOOP
+ BLT INNER_LOOP_DONE
+
+
+ ADD x11,x0,x1,LSL #1 //*pu1_src + src_strd
+ xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
+ SUB x5,x12,x7 //ht_tmp - row
+
+ ADD x8,x14,x5,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2]
+ xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
+ CMP x7,#1
+
+ LDRB w4,[x0,#16] //load the value pu1_src_cpy[16 - src_strd]
+ LD1 {v16.16b},[x11] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x11] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x11, x11,#8
+ LDRB w9,[x0,#17] //load the value pu1_src_cpy[17 - src_strd]
+
+ BNE NEXT_ROW_POINTER_ASSIGNED_3
+ mov x5, x21 //Loads pu1_avail
+ LDRB w5,[x5,#3] //pu1_avail[3]
+ CMP x5,#0
+ SUB x20,x11,#4 //pu1_src[src_strd - 2]
+ csel x8, x20, x8,NE
+
+NEXT_ROW_POINTER_ASSIGNED_3:
+ LDRH w5,[x8,#2]
+ ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ LDRB w8,[x0,#14] //pu1_src_cpy[14]
+
+ SUB x8,x8,x4 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+ mov v18.4h[7], w5 //vsetq_lane_u8
+ LDRB w10,[x0,#15] //pu1_src_cpy[15]
+
+ CMP x8,#0
+ EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+ SUB x10,x10,x9 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ movn x20,#0
+ csel x8, x20, x8,LT
+ LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP x10,#0
+    mov  v14.16b[14], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ movn x20,#0
+ csel x10, x20, x10,LT
+
+ MOV x20,#1
+    csel x10, x20, x10,GT               //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    mov  v14.16b[15], w10               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+ cmhi v20.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ cmhi v22.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v22.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v18.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v18.16b, v18.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+ TBL v18.16b, {v28.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+ //TBL v19.8b, {v28.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+
+ AND v18.16b, v18.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v19.d[0],v18.d[1]
+
+ Uxtl v20.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ UZP1 v31.8b, v18.8b, v19.8b
+ UZP2 v19.8b, v18.8b, v19.8b
+ mov v18.8b,v31.8b
+
+ TBL v22.8b, {v6.16b},v18.8b
+ TBL v23.8b, {v7.16b},v19.8b
+
+ Uxtl2 v18.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ ZIP1 v31.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.8b,v31.8b
+
+ SADDW v20.8h, v20.8h , v22.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+ SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SADDW v18.8h, v18.8h , v23.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+ SMAX v18.8h, v18.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+
+INNER_LOOP_DONE:
+
+ mov w8, w25 //Loads ht
+ xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+ LSL x8,x8,#1
+ xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
+ mov x11, x27 //Loads *pu1_src_left
+
+SRC_LEFT_LOOP:
+ LDR w7, [x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#4
+ STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+ ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ CMP x6,#8 //Check whether residue remains
+
+ BLT RE_ASSINING_LOOP //Jump to re-assigning loop
+ mov w7, w24 //Loads wd
+ mov x0, x28 //Loads *pu1_src
+ SUB x7,x7,x6
+ ADD x0,x0,x7
+    BGT WIDTH_LOOP_16                   //If more than 8 columns remain, jump to WIDTH_LOOP_16
+ BEQ WIDTH_RESIDUE //If residue remains jump to residue loop
+
+WD_16_HT_4_LOOP:
+ mov w7, w24 //Loads wd
+
+ mov x5, x21 //Loads pu1_avail
+ CMP x6,x7 //col == wd
+
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+
+ CMP x6,#16 //if(col == 16)
+    mov  v8.8b[1], w8                   //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+
+ BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
+ LDRB w8,[x5,#1] //pu1_avail[1]
+    mov  v8.16b[14], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
+ mov v8.16b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
+
+SKIP_AU1_MASK_VAL_WD_16_HT_4:
+ LDRB w11,[x5,#2] //pu1_avail[2]
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x8, x20, x8,EQ
+
+ CMP x11,#0
+ csel x8, x3, x8,NE
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //SUB x0, x0,#8
+ ADD x8,x8,#2 //pu1_src - src_strd + 2
+
+ ADD x3,x3,#16
+ LD1 {v10.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ //SUB x8, x8,#8
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+ mov w4, w25 //Loads ht
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+ mov w7, w24 //Loads wd
+
+ SUB x7,x7,x6 //(wd - col)
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ ADD x7,x7,#14 //15 + (wd - col)
+
+ mov x8, x26 //Loads *pu1_src
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)]
+
+AU1_SRC_LEFT_LOOP_WD_16_HT_4:
+ LDRH w8,[x7] //load the value and increment by src_strd
+ SUBS x4,x4,#1 //decrement the loop count
+
+    STRH w8,[x5],#2                     //store it in au1_src_left_tmp on the stack
+ ADD x7,x7,x1
+ BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4
+
+ movi v18.16b, #0
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_WD_16_HT_4:
+ ADD x9,x0,x1 //*pu1_src + src_strd
+
+ mov x5, x21 //Loads pu1_avail
+ LD1 {v16.16b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x9, x9,#8
+ LDRB w5,[x5,#3] //pu1_avail[3]
+
+ SUB x11,x12,x7 //ht_tmp - row
+ ADD x8,x14,x11,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2]
+ ADD x8,x8,#2 //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+ CMP x5,#0
+ BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
+ CMP x7,#1
+ SUB x20,x9,#2 //pu1_src[src_strd - 2]
+ csel x8, x20, x8,EQ
+
+NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
+ LDRH w5,[x8]
+ mov v18.8h[7], w5 //vsetq_lane_u8
+ EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+ CMP x7,x12
+ BLT SIGN_UP_CHANGE_WD_16_HT_4
+ mov x5, x21 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4
+
+SIGN_UP_CHANGE_WD_16_HT_4:
+ LDRB w8,[x0,#14] //pu1_src_cpy[14]
+ SUB x9,x0,x1
+
+ LDRB w5,[x9,#16] //load the value pu1_src_cpy[16 - src_strd]
+
+ LDRB w10,[x0,#15] //pu1_src_cpy[15]
+ SUB x8,x8,x5 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ LDRB w11,[x9,#17] //load the value pu1_src_cpy[17 - src_strd]
+ CMP x8,#0
+
+ movn x20,#0
+ csel x8, x20, x8,LT
+ SUB x10,x10,x11 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP x10,#0
+    mov  v14.16b[14], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ movn x20,#0
+ csel x10, x20, x10,LT
+
+ MOV x20,#1
+    csel x10, x20, x10,GT               //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    mov  v14.16b[15], w10               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_WD_16_HT_4:
+ LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ mov v20.d[1],v20.d[0]
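+    // (the 8-byte edge_idx table is duplicated into the upper half so
+    // the 16-byte TBL lookup stays in range)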
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+ EXT v14.16b, v14.16b , v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v27.d[0],v26.d[1]
+
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b
+ mov v26.8b,v31.8b
+ TBL v24.8b, {v6.16b},v26.8b
+ TBL v25.8b, {v7.16b},v27.8b
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v31.8b
+
+ Uxtl2 v30.8h, v12.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SADDW v30.8h, v30.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
+
+ SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
+ UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
+
+ xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+ xtn2 v28.16b, v30.8h //vmovn_s16(pi2_tmp_cur_row.val[1])
+
+ SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
+ ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+ BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
+
+ mov w8, w25 //Loads ht
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+ mov x11, x27 //Loads *pu1_src_left
+
+SRC_LEFT_LOOP_WD_16_HT_4:
+ LDR w7, [x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#2
+ STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_WD_16_HT_4
+
+ SUBS x6,x6,#16 //Decrement the wd loop count by 16
+ BLE RE_ASSINING_LOOP //Jump to re-assigning loop
+    BGT WD_16_HT_4_LOOP                 //If more columns remain, jump to WD_16_HT_4_LOOP
+
+WIDTH_RESIDUE:
+ mov w7, w24 //Loads wd
+
+ mov x5, x21 //Loads pu1_avail
+ CMP x6,x7 //wd_residue == wd
+
+ LDRb w20, [x5] //pu1_avail[0]
+ csel w8,w20,w8,EQ
+
+ MOV x20,#-1
+ csel x8, x20, x8,NE
+ LDRB w11,[x5,#1] //pu1_avail[1]
+
+ LDRB w9,[x5,#2] //pu1_avail[2]
+ mov v8.8b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
+ CMP x9,#0
+
+ SUB x20,x0,x1 //pu1_src - src_strd
+ csel x10, x20, x10,EQ
+    mov  v8.8b[1], w8                   //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
+ csel x10, x3, x10,NE
+
+ ADD x10,x10,#2 //pu1_src - src_strd + 2
+    mov  v8.8b[6], w11                  //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+ mov w4, w25 //Loads ht
+    mov  v8.8b[7], w11                  //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
+ mov w7, w24 //Loads wd
+
+ mov x8, x26 //Loads *pu1_src
+ LD1 {v10.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ //LD1 {v11.8b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
+ //SUB x10, x10,#8
+ SUB x7,x7,#2 //(wd - 2)
+
+ ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 2)]
+
+AU1_SRC_LEFT_LOOP_RESIDUE:
+ LDRH w8,[x7] //load the value and increment by src_strd
+ ADD x7,x7,x1
+    STRH w8,[x5],#2                     //store it in au1_src_left_tmp on the stack
+ SUBS x4,x4,#1 //decrement the loop count
+ BNE AU1_SRC_LEFT_LOOP_RESIDUE
+
+ LD1 {v12.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
+ //SUB x0, x0,#8
+
+ movi v18.16b, #0
+ cmhi v14.16b, v12.16b , v10.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
+
+ cmhi v16.16b, v10.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
+ SUB v14.16b, v16.16b , v14.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+ MOV x7,x12 //row count, move ht_tmp to x7
+
+PU1_SRC_LOOP_RESIDUE:
+ ADD x9,x0,x1 //*pu1_src + src_strd
+
+ SUB x11,x12,x7 //ht_tmp - row
+ LD1 {v16.16b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //LD1 {v17.8b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
+ //SUB x9, x9,#8
+ mov x5, x21 //Loads pu1_avail
+
+ LDRB w5,[x5,#3] //pu1_avail[3]
+ ADD x8,x14,x11,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2]
+
+ CMP x5,#0
+ ADD x8,x8,#2 //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
+
+ BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
+ CMP x7,#1
+ SUB x20,x9,#2 //pu1_src[src_strd - 2]
+ csel x8, x20, x8,EQ
+
+NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
+ LDRB w5,[x8]
+
+ LDRB w8,[x8,#1]
+ mov v18.16b[14], w5 //vsetq_lane_u8
+ CMP x7,x12
+
+ mov v18.16b[15], w8 //vsetq_lane_u8
+ EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
+
+ BLT SIGN_UP_CHANGE_RESIDUE
+ mov x5, x21 //Loads pu1_avail
+ LDRB w5,[x5,#2] //pu1_avail[2]
+ CMP x5,#0
+ BNE SIGN_UP_CHANGE_DONE_RESIDUE
+
+SIGN_UP_CHANGE_RESIDUE:
+ LDRB w8,[x0,#14] //pu1_src_cpy[14]
+ SUB x9,x0,x1
+
+ LDRB w5,[x9,#16] //load the value pu1_src_cpy[16 - src_strd]
+
+ LDRB w10,[x0,#15] //pu1_src_cpy[15]
+ SUB x8,x8,x5 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
+
+ LDRB w11,[x9,#17] //load the value pu1_src_cpy[17 - src_strd]
+ CMP x8,#0
+
+ movn x20,#0
+ csel x8, x20, x8,LT
+ SUB x10,x10,x11 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
+
+ MOV x20,#1
+ csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
+
+ CMP x10,#0
+    mov  v14.16b[14], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
+ movn x20,#0
+ csel x10, x20, x10,LT
+
+ MOV x20,#1
+    csel x10, x20, x10,GT               //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
+    mov  v14.16b[15], w10               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
+
+SIGN_UP_CHANGE_DONE_RESIDUE:
+ LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
+ cmhi v22.16b, v12.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
+
+ cmhi v24.16b, v18.16b , v12.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
+ SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
+
+ ADD v26.16b, v0.16b , v14.16b //edge_idx = vaddq_s8(const_2, sign_up)
+ ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
+
+ mov v20.d[1],v20.d[0]
+ NEG v14.16b, v24.16b //sign_up = vnegq_s8(sign_down)
+ TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
+
+ //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
+    EXT v14.16b,  v14.16b ,  v14.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
+
+ Uxtl v28.8h, v12.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
+ AND v26.16b, v26.16b , v8.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
+ mov v27.d[0],v26.d[1]
+
+ UZP1 v31.8b, v26.8b, v27.8b
+ UZP2 v27.8b, v26.8b, v27.8b
+ mov v26.8b,v31.8b
+ TBL v24.8b, {v6.16b},v26.8b
+ TBL v25.8b, {v7.16b},v27.8b
+ ZIP1 v31.8b, v24.8b, v25.8b
+ ZIP2 v25.8b, v24.8b, v25.8b
+ mov v24.8b,v31.8b
+
+ mov v12.16b, v16.16b //pu1_cur_row = pu1_next_row
+ SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
+
+ SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
+ UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
+
+ SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
+ xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
+
+ ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
+
+ BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP
+
+ mov w8, w25 //Loads ht
+ ADD x5,sp,#0x4B //*au1_src_left_tmp
+
+ mov x11, x27 //Loads *pu1_src_left
+
+SRC_LEFT_LOOP_RESIDUE:
+ LDR w7, [x5],#4 //au1_src_left_tmp[row]
+ SUBS x8,x8,#2
+ STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
+ BNE SRC_LEFT_LOOP_RESIDUE
+
+
+RE_ASSINING_LOOP:
+ mov w7, w24 //Loads wd
+ mov w8, w25 //Loads ht
+
+ mov x0, x26 //Loads *pu1_src
+ SUB x10,x7,#2 //wd - 2
+
+ LDRH w9,[sp,#6]
+ SUB x8,x8,#1 //ht - 1
+
+    STRH w9,[x0,x10]                    //pu1_src_org[wd - 2] = u1_pos_0_0_tmp
+ madd x6, x8, x1, x0 //pu1_src[(ht - 1) * src_strd]
+
+ mov x4, x15 //Loads pu1_src_top_left
+
+ LDRH w9,[sp,#8]
+ ADD x12,sp,#10
+
+ STRH w9,[x6] //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
+
+    LDRH w10,[sp]                       //load u1_src_top_left_tmp from the stack
+ STRH w10,[x4] //*pu1_src_top_left = u1_src_top_left_tmp
+ mov x3, x22 //Loads pu1_src_top
+
+SRC_TOP_LOOP:
+ LD1 {v0.8b},[x12],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ SUBS x7,x7,#8 //Decrement the width
+ ST1 {v0.8b},[x3],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
+ BNE SRC_TOP_LOOP
+
+END_LOOPS:
+ ADD sp,sp,#0xE0
+ // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
+ ldp x27, x28,[sp],#16
+ ldp x25, x26,[sp],#16
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
diff --git a/common/arm64/ihevc_weighted_pred_bi.s b/common/arm64/ihevc_weighted_pred_bi.s
new file mode 100644
index 0000000..6851cb4
--- /dev/null
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@@ -0,0 +1,316 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//*  ihevc_weighted_pred_bi.s
+//*
+//* @brief
+//*  Contains function definitions for weighted prediction used in inter
+//*  prediction
+//*
+//* @author
+//*  Parthiban V
+//*
+//* @par List of Functions:
+//*  - ihevc_weighted_pred_bi()
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+//*  pi2_src2 and stores the result at the location pointed to by pu1_dst.
+//*  Assumptions: the function is optimized considering the fact that width
+//*  and height are multiples of 2.
+//*
+//* @par Description:
+//*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + ((off0 +
+//*  off1 + 1) << (shift - 1)) ) >> shift
+//*
+//* @param[in] pi2_src1
+//*  pointer to source 1
+//*
+//* @param[in] pi2_src2
+//*  pointer to source 2
+//*
+//* @param[out] pu1_dst
+//*  pointer to destination
+//*
+//* @param[in] src_strd1
+//*  source stride 1
+//*
+//* @param[in] src_strd2
+//*  source stride 2
+//*
+//* @param[in] dst_strd
+//*  destination stride
+//*
+//* @param[in] wgt0
+//*  weight to be multiplied to source 1
+//*
+//* @param[in] off0
+//*  offset 0
+//*
+//* @param[in] wgt1
+//*  weight to be multiplied to source 2
+//*
+//* @param[in] off1
+//*  offset 1
+//*
+//* @param[in] shift
+//*  (14 bit depth) + log2_weight_denominator
+//*
+//* @param[in] lvl_shift1
+//*  added before shift and offset
+//*
+//* @param[in] lvl_shift2
+//*  added before shift and offset
+//*
+//* @param[in] ht
+//*  height of the source
+//*
+//* @param[in] wd
+//*  width of the source
+//*
+//* @returns
+//*
+//* @remarks
+//*  None
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_weighted_pred_bi(WORD16 *pi2_src1,
+//                            WORD16 *pi2_src2,
+//                            UWORD8 *pu1_dst,
+//                            WORD32 src_strd1,
+//                            WORD32 src_strd2,
+//                            WORD32 dst_strd,
+//                            WORD32 wgt0,
+//                            WORD32 off0,
+//                            WORD32 wgt1,
+//                            WORD32 off1,
+//                            WORD32 shift,
+//                            WORD32 lvl_shift1,
+//                            WORD32 lvl_shift2,
+//                            WORD32 ht,
+//                            WORD32 wd)
+
+//**************Variables Vs Registers*****************************************
+// x0 => *pi2_src1
+// x1 => *pi2_src2
+// x2 => *pu1_dst
+// x3 => src_strd1
+// x4 => src_strd2
+// x5 => dst_strd
+// x6 => wgt0
+// x7 => off0
+// x8 => wgt1
+// x9 => off1
+// x10 => shift
+// x11 => lvl_shift1
+// x12 => lvl_shift2
+// x14 => ht
+// x7 => wd
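+//
+// (Under AAPCS64 the first eight arguments arrive in x0-x7; wgt1, off1,
+// shift, lvl_shift1, lvl_shift2, ht and wd are read from the stack at
+// entry.)
+//
+// A scalar model of the per-pixel computation (a sketch only; the
+// vector code below works on four pixels at a time and pipelines four
+// such groups per core_loop pass):
+//
+//  tmp = (pi2_src1[i] + lvl_shift1) * wgt0
+//      + (pi2_src2[i] + lvl_shift2) * wgt1
+//      + ((off0 + off1 + 1) << (shift - 1));
+//  pu1_dst[i] = CLIP3(tmp >> shift, 0, 255);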
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_weighted_pred_bi_av8
+
+.type ihevc_weighted_pred_bi_av8, %function
+
+ihevc_weighted_pred_bi_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+
+ ldr w8,[sp,#0]
+ ldr w9,[sp,#8]
+ ldr w10,[sp,#16]
+ ldr w11,[sp,#24]
+ ldr w12,[sp,#32]
+ ldr w13,[sp,#40]
+ ldr w14,[sp,#48]
+
+ sxtw x8,w8
+ sxtw x9,w9
+ sxtw x10,w10
+ sxtw x11,w11
+ sxtw x12,w12
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+ stp x23, x24,[sp,#-16]!
+ stp x25, x26,[sp,#-16]!
+
+ mov x15,x4 // src_strd2 40
+ mov x16,x5 // dst_strd 44
+ mov x17,x6 // wgt0 48
+ mov x19,x7 // off0 52
+ mov x20,x8 // wgt1 56
+ mov x21,x9 // off1 60
+ mov x22,x10 // shift 64
+ mov x23,x11 // lvl_shift1 68
+ mov x24,x12 // lvl_shift2 72
+ mov x25,x13 // ht 76
+ mov x26,x14 // wd 80
+
+ mov x6,x17 //load wgt0
+ mov x11,x23 //load lvl_shift1
+ mov x12,x24 //load lvl_shift2
+ mov v7.h[0],w6 //moved for scalar multiplication
+ mul x4, x11 , x6 //lvl_shift1 * wgt0
+ mov x8,x20 //load wgt1
+ mov x7,x19 //load off0
+ mov v7.h[1],w8 //moved for scalar multiplication
+ madd x4,x12,x8,x4 //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
+ mov x9,x21 //load off1
+ add x5,x7,x9 //off0 + off1
+ mov x10,x22 //load shift
+ add x5,x5,#1 //off0 + off1 + 1
+ sub x14,x10,#1 //shift - 1
+ mov x7,x26 //load wd
+ lsl x5,x5,x14 //((off0 + off1 + 1) << (shift - 1))
+ dup v28.4s,w10 //vmovq_n_s32(0-shift)
+ add x4,x4,x5 //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
+ dup v30.4s,w4 //vmovq_n_s32(tmp_lvl_shift)
+ neg v28.4s, v28.4s
+ mov x4,x15 //load src_strd2
+ lsl x9,x7,#1
+ mov x5,x16 //load dst_strd
+ lsl x3,x3,#1
+ mov x14,x25 //load ht
+ lsl x4,x4,#1
+
+ cmp x14,#0 //check ht == 0
+ beq end_loops //if equal, then end the function
+
+outer_loop:
+ cmp x7,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+
+core_loop:
+ add x6,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x8,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ ld1 {v0.4h},[x0],#8 //load and increment the pi2_src1
+ add x10,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
+ ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2
+ smull v4.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
+ ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration
+ smull v8.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
+ ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration
+ add v4.4s, v4.4s , v8.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)
+
+ ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration
+ smull v10.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration
+
+ ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration
+ add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ smull v14.4s, v0.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration
+
+ ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration
+ smull v12.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
+ sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t)
+
+ ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration
+ add v10.4s, v10.4s , v12.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration
+
+ sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
+ smull v16.4s, v1.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration
+
+ add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
+ //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+ add v14.4s, v14.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration
+
+ sshl v10.4s,v10.4s,v28.4s
+ //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
+ smull v18.4s, v2.4h, v7.4h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
+ uqxtn v4.8b,v4.8h
+ //vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3)
+ add v14.4s, v14.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+
+ sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) ii iteration
+ smull v20.4s, v3.4h, v7.4h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration
+
+ sshl v14.4s,v14.4s,v28.4s
+ //vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
+ //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+ add v18.4s, v18.4s , v20.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ sqxtun v14.4h, v14.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+
+ add v18.4s, v18.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
+ st1 {v4.s}[0],[x2],#4 //store pu1_dst i iteration
+
+ uqxtn v10.8b,v10.8h
+ //vqmovn.u16 d10,q5 //vqmovn_u16(sto_res_tmp3) ii iteration
+ sshl v18.4s,v18.4s,v28.4s
+ //vshl.s32 q9,q9,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
+ st1 {v10.s}[0],[x10],x5 //store pu1_dst ii iteration
+
+
+ //mov v15, v14 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+ uqxtn v14.8b,v14.8h
+ //vqmovn.u16 d14,q7 //vqmovn_u16(sto_res_tmp3) iii iteration
+ sqxtun v18.4h, v18.4s //vqmovun_s32(sto_res_tmp1) iv iteration
+ //mov v19, v18 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+ st1 {v14.s}[0],[x10],x5 //store pu1_dst iii iteration
+ uqxtn v18.8b,v18.8h
+ //vqmovn.u16 d18,q9 //vqmovn_u16(sto_res_tmp3) iv iteration
+ subs x7,x7,#4 //decrement wd by 4 and check for 0
+ st1 {v18.s}[0],[x10],x5 //store pu1_dst iv iteration
+
+ bgt core_loop //if greater than 0 repeat the core loop again
+
+end_core_loop:
+    sub x20,x9,x3,lsl #2        //4*src_strd1 - wd, negated below
+    neg x11, x20
+    subs x14,x14,#4             //decrement the ht by 4
+    sub x20,x9,x4,lsl #2        //4*src_strd2 - wd, negated below
+    neg x12, x20
+    add x0,x0,x11               //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+    asr x7,x9,#1                //restore wd (x9 = 2*wd)
+    add x1,x1,x12               //pi2_src2 + 4*src_strd2 - 2*wd
+    sub x20,x7,x5,lsl #2        //4*dst_strd - wd, negated below
+    neg x10, x20
+    add x2,x2,x10               //pu1_dst + 4*dst_strd - wd
+    bgt core_loop               //if ht is greater than 0, process the next four rows
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x25, x26,[sp],#16
+ ldp x23, x24,[sp],#16
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
diff --git a/common/arm64/ihevc_weighted_pred_bi_default.s b/common/arm64/ihevc_weighted_pred_bi_default.s
new file mode 100644
index 0000000..07fb4ce
--- /dev/null
+++ b/common/arm64/ihevc_weighted_pred_bi_default.s
@@ -0,0 +1,541 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_weighted_pred_bi_default.s
+//*
+//* @brief
+//* contains function definitions for weighted prediction used in inter
+//* prediction
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//* - ihevc_weighted_pred_bi_default()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* does default bi-weighted prediction on the arrays pointed by pi2_src1 and
+//* pi2_src2 and stores the result at the location pointed by pu1_dst.
+//* assumptions: the function is optimized considering the fact that width
+//* and height are multiples of 2.
+//*
+//* @par description:
+//* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+//* >> shift, where shift = 15 - bit depth
+//*
+//* @param[in] pi2_src1
+//* pointer to source 1
+//*
+//* @param[in] pi2_src2
+//* pointer to source 2
+//*
+//* @param[out] pu1_dst
+//* pointer to destination
+//*
+//* @param[in] src_strd1
+//* source stride 1
+//*
+//* @param[in] src_strd2
+//* source stride 2
+//*
+//* @param[in] dst_strd
+//* destination stride
+//*
+//* @param[in] lvl_shift1
+//* added before shift and offset
+//*
+//* @param[in] lvl_shift2
+//* added before shift and offset
+//*
+//* @param[in] ht
+//* height of the source
+//*
+//* @param[in] wd
+//* width of the source
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+//void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
+// word16 *pi2_src2,
+// uword8 *pu1_dst,
+// word32 src_strd1,
+// word32 src_strd2,
+// word32 dst_strd,
+// word32 lvl_shift1,
+// word32 lvl_shift2,
+// word32 ht,
+// word32 wd)
+
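+// Scalar reference for the default (equal-weight) case (an illustrative
+// sketch, not part of the original sources). For 8-bit content shift =
+// 15 - 8 = 7, so the rounding term 1 << (shift - 1) is the 0x40 loaded into
+// v0 below; CLIP_U8 is a placeholder for the saturation that sqadd/sqshrun
+// perform in the assembly:
+//
+//     for(row = 0; row < ht; row++)
+//     {
+//         for(col = 0; col < wd; col++)
+//         {
+//             WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1)
+//                           + (pi2_src2[col] + lvl_shift2)
+//                           + (1 << (shift - 1));
+//             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+//         }
+//         pi2_src1 += src_strd1;
+//         pi2_src2 += src_strd2;
+//         pu1_dst  += dst_strd;
+//     }
+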
+//**************variables vs registers*****************************************
+// x0 => *pi2_src1
+// x1 => *pi2_src2
+// x2 => *pu1_dst
+// x3 => src_strd1
+// x4 => src_strd2
+// x5 => dst_strd
+// x6 => lvl_shift1
+// x7 => lvl_shift2
+// x8 => ht
+// x9 => wd
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_weighted_pred_bi_default_av8
+
+.type ihevc_weighted_pred_bi_default_av8, %function
+
+ihevc_weighted_pred_bi_default_av8:
+
+ ldr w8,[sp,#0]
+ ldr w9,[sp,#8]
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+
+ mov x15,x4 // src_strd2 40
+ mov x16,x5 // dst_strd 44
+ mov x17,x6 // lvl_shift1 48
+ mov x19,x7 // lvl_shift2 52
+ mov x20,x8 // ht 56
+ mov x21,x9 // wd 60
+
+ mov x4,x15 //load src_strd2
+ lsl x3,x3,#1
+ mov x5,x16 //load dst_strd
+ mov x6,x17 //load lvl_shift1
+ lsl x4,x4,#1
+ mov x7,x19 //load lvl_shift2
+ mov x8,x20 //load ht
+ mov x9,x21 //load wd
+ dup v4.8h,w6 //lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
+ dup v6.8h,w7 //lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
+ movi v0.8h, #0x40 //tmp_lvl_shift = 1 << (shift - 1)
+ add v4.8h, v4.8h,v6.8h
+ add v0.8h, v0.8h , v4.8h
+// vmvn.i32 v2.8h,#0x6 @vmovq_n_s32(tmp_shift)
+ lsl x6,x9,#1
+ sub x20,x6,x3,lsl #2 //4*src_strd1 - wd
+ neg x7, x20
+ sub x20,x6,x4,lsl #2 //4*src_strd2 - wd
+ neg x10, x20
+ //asr x6,#1
+ //rsb x6,x6,x5,lsl #2 @4*dst_strd - wd
+
+ cmp x8,#0 //check ht == 0
+ beq end_loops //if equal, then end the function
+
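+// Block-size dispatch: since wd and ht are small powers of two here, wd | ht
+// compactly identifies the small interleaved-chroma shapes that get dedicated
+// loops (8|2 = 10 for an 8x2 block, 4|2 = 6 for a 4x2 block); all other sizes
+// fall through to the luma paths selected on wd alone.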
+chroma_decision:
+ orr x14,x8,x9
+ cmp x14,#10
+ beq outer_loop_chroma_8x2
+
+ cmp x14,#6
+ beq outer_loop_chroma_4x2
+
+
+luma_decision:
+ cmp x9,#24
+ beq outer_loop_8
+
+ cmp x9,#16
+ bge outer_loop_16
+
+ cmp x9,#12
+ beq outer_loop_4
+
+ cmp x9,#8
+ bge outer_loop_8
+
+
+
+
+
+
+outer_loop_4:
+ cmp x9,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+
+core_loop_4:
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
+ add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
+ ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
+ ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+ sqadd v18.4h,v6.4h,v7.4h
+ sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+ sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ mov v18.d[1],v19.d[0]
+ sqshrun v20.8b, v18.8h,#7
+ ld1 {v22.4h},[x11],x3 //load and increment the pi2_src1 iii iteration
+ ld1 {v23.4h},[x12],x4 //load and increment the pi2_src2 iii iteration
+ sqadd v30.4h,v22.4h,v23.4h
+ sqadd v30.4h,v30.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+ ld1 {v24.4h},[x11],x3 //load and increment the pi2_src1 iv iteration
+ ld1 {v25.4h},[x12],x4 //load and increment the pi2_src2 iv iteration
+ sqadd v18.4h,v24.4h,v25.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ sqadd v31.4h,v18.4h,v0.4h
+ mov v30.d[1],v31.d[0]
+ st1 {v20.s}[0],[x2],#4 //store pu1_dst i iteration
+ st1 {v20.s}[1],[x14],x5 //store pu1_dst ii iteration
+ sqshrun v30.8b, v30.8h,#7
+    st1 {v30.s}[0],[x14],x5            //store pu1_dst iii iteration
+ subs x9,x9,#4 //decrement wd by 4 and check for 0
+ st1 {v30.s}[1],[x14],x5 //store pu1_dst iv iteration
+ bgt core_loop_4 //if greater than 0 repeat the core loop again
+
+end_core_loop_4:
+
+ subs x8,x8,#4 //decrement the ht by 4
+
+ add x0,x0,x7 //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+ asr x9,x6,#1
+ add x1,x1,x10 //pi2_src2 + 4*src_strd2 - 2*wd
+ sub x20,x9,x5,lsl #2 //4*dst_strd - wd
+ neg x14, x20
+ add x2,x2,x14
+ //pu1_dst + dst_std - wd
+ bgt core_loop_4 //if ht is greater than 0 goto outer_loop
+
+ b end_loops
+
+
+// this is only for chroma module with input 2x2
+outer_loop_chroma_4x2:
+ cmp x9,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+ sub x20,x6,x3,lsl #1 //2*src_strd1 - wd
+ neg x7, x20
+ sub x20,x6,x4,lsl #1 //2*src_strd2 - wd
+ neg x10, x20
+core_loop_chroma_4x2:
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ ld1 {v6.4h},[x0],#8 //load and increment the pi2_src1
+ add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
+ ld1 {v7.4h},[x1],#8 //load and increment the pi2_src2
+ ld1 {v8.4h},[x11],x3 //load and increment the pi2_src1 ii iteration
+ sqadd v18.4h,v6.4h,v7.4h
+ sqadd v18.4h,v18.4h,v0.4h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ ld1 {v9.4h},[x12],x4 //load and increment the pi2_src2 ii iteration
+ sqadd v20.4h,v8.4h,v9.4h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ sqadd v19.4h,v20.4h,v0.4h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ mov v18.d[1],v19.d[0]
+ sqshrun v20.8b, v18.8h,#7
+ st1 {v20.s}[0],[x2],#4 //store pu1_dst i iteration
+ st1 {v20.s}[1],[x14],x5 //store pu1_dst ii iteration
+
+ subs x9,x9,#4 //decrement wd by 4 and check for 0
+
+ bgt core_loop_chroma_4x2 //if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_4x2:
+
+    subs x8,x8,#2                   //decrement the ht by 2
+
+ add x0,x0,x7 //pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+ asr x9,x6,#1
+ add x1,x1,x10 //pi2_src2 + 2*src_strd2 - 2*wd
+ sub x20,x9,x5,lsl #1 //2*dst_strd - wd
+ neg x14, x20
+ add x2,x2,x14
+ //pu1_dst + dst_std - wd
+ bgt core_loop_chroma_4x2 //if ht is greater than 0 goto outer_loop
+
+ b end_loops
+
+
+
+outer_loop_8:
+ cmp x9,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+core_loop_8:
+
+ ld1 { v24.8h},[x0],#16 //load and increment the pi2_src1
+ add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
+ ld1 { v26.8h},[x1],#16 //load and increment the pi2_src2
+ sqadd v24.8h,v24.8h,v26.8h
+ ld1 { v28.8h},[x11],x3 //load and increment the pi2_src1 ii iteration
+ sqadd v24.8h,v24.8h,v0.8h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ ld1 { v30.8h},[x12],x4 //load and increment the pi2_src2 ii iteration
+ ld1 { v16.8h},[x11],x3 //load and increment the pi2_src1 iii iteration
+ sqadd v22.8h,v28.8h,v30.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ ld1 { v18.8h},[x12],x4 //load and increment the pi2_src2 iii iteration
+ sqadd v22.8h,v22.8h,v0.8h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ sqshrun v20.8b, v24.8h,#7
+ ld1 { v12.8h},[x11],x3 //load and increment the pi2_src1 iv iteration
+ sqadd v30.8h,v16.8h,v18.8h
+ sqshrun v21.8b, v22.8h,#7
+ ld1 { v14.8h},[x12],x4 //load and increment the pi2_src2 iv iteration
+ sqadd v30.8h,v30.8h,v0.8h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
+ st1 {v20.2s},[x2],#8 //store pu1_dst i iteration
+ sqadd v8.8h,v12.8h,v14.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
+ st1 {v21.2s},[x14],x5 //store pu1_dst ii iteration
+ sqadd v8.8h,v8.8h,v0.8h
+ sqshrun v30.8b, v30.8h,#7
+ sqshrun v31.8b, v8.8h,#7
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    st1 {v30.2s},[x14],x5            //store pu1_dst iii iteration
+    subs x9,x9,#8                    //decrement wd by 8 and check for 0
+ st1 {v31.2s},[x14],x5 //store pu1_dst iv iteration
+ bgt core_loop_8 //if greater than 0 repeat the core loop again
+
+end_core_loop_8:
+
+ subs x8,x8,#4 //decrement the ht by 4
+
+ add x0,x0,x7 //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+ asr x9,x6,#1
+ add x1,x1,x10 //pi2_src2 + 4*src_strd2 - 2*wd
+ sub x20,x9,x5,lsl #2 //4*dst_strd - wd
+ neg x14, x20
+ add x2,x2,x14
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer) //pu1_dst + dst_std - wd
+
+ bgt core_loop_8
+ b end_loops
+
+
+
+// this is only for chroma module with input 4x2
+outer_loop_chroma_8x2:
+ cmp x9,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ sub x20,x6,x3,lsl #1 //2*src_strd1 - wd
+ neg x7, x20
+ sub x20,x6,x4,lsl #1 //2*src_strd2 - wd
+ neg x10, x20
+core_loop_chroma_8x2:
+
+ ld1 { v24.8h},[x0],#16 //load and increment the pi2_src1
+ add x14,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd
+ ld1 { v26.8h},[x1],#16 //load and increment the pi2_src2
+ sqadd v24.8h,v24.8h,v26.8h
+ ld1 { v28.8h},[x11],x3 //load and increment the pi2_src1 ii iteration
+ sqadd v24.8h,v24.8h,v0.8h //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
+ ld1 { v30.8h},[x12],x4 //load and increment the pi2_src2 ii iteration
+ ld1 { v16.8h},[x11],x3 //load and increment the pi2_src1 iii iteration
+ sqadd v22.8h,v28.8h,v30.8h //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
+ sqadd v22.8h,v22.8h,v0.8h //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
+ sqshrun v20.8b, v24.8h,#7
+ sqshrun v21.8b, v22.8h,#7
+ st1 {v20.2s},[x2],#8 //store pu1_dst i iteration
+ st1 {v21.2s},[x14],x5 //store pu1_dst ii iteration
+
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+    subs x9,x9,#8                    //decrement wd by 8 and check for 0
+
+ bgt core_loop_chroma_8x2 //if greater than 0 repeat the core loop again
+
+end_core_loop_chroma_8x2:
+
+    subs x8,x8,#2                //decrement the ht by 2
+
+    add x0,x0,x7                 //pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
+    asr x9,x6,#1                 //restore wd (x6 = 2*wd)
+    add x1,x1,x10                //pi2_src2 + 2*src_strd2 - 2*wd
+    sub x20,x9,x5,lsl #1         //2*dst_strd - wd
+ neg x14, x20
+ add x2,x2,x14
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer) //pu1_dst + dst_std - wd
+
+ bgt core_loop_chroma_8x2
+
+ b end_loops
+
+
+
+
+outer_loop_16:
+ cmp x9,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+ add x11,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x12,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
+ sub x20,x6,x3,lsl #1 //2*src_strd1 - wd
+ neg x7, x20
+ mov x14,#16
+ sub x10,x14,x5
+ sub x11,x3,x14
+ sub x12,x14,x3
+
+ sub x20,x9,x5,lsl #1 //2*dst_strd - wd
+ neg x14, x20
+
+
+
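+// The 16-wide path is software-pipelined: prolog_16 issues the loads and
+// first additions for the initial two rows, core_loop_16 stores the results
+// of one iteration while loading and accumulating the next, and epilog_16
+// drains the iteration still in flight when the counters reach zero.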
+prolog_16:
+
+
+ ld1 { v2.8h},[x0],#16 //load and increment the pi2_src1
+ ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
+ ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+ ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+ ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
+ subs x9,x9,#16
+ ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+ sub x20,x8,#2
+ csel x8, x20, x8,eq
+ sqadd v22.8h,v2.8h,v4.8h
+ ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+ sqadd v28.8h,v10.8h,v12.8h
+ ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
+ add x20,x0,x7
+ csel x0, x20, x0,eq
+ add x20,x1,x7
+ csel x1, x20, x1,eq
+ sqadd v24.8h,v6.8h,v8.8h
+ ld1 { v2.8h},[x0],#16
+ sqadd v26.8h,v14.8h,v16.8h
+// if the input is chroma with 8x2 block size
+ cmp x8,#0
+ beq epilog_16
+
+ ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
+ sqadd v22.8h,v22.8h,v0.8h
+ ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+ sqadd v28.8h,v28.8h,v0.8h
+ ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+ sqadd v24.8h,v24.8h,v0.8h
+ ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
+ sqadd v30.8h,v26.8h,v0.8h
+ sqshrun v20.8b, v22.8h,#7
+ ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+ sqshrun v21.8b, v28.8h,#7
+ ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+ sqshrun v26.8b, v24.8h,#7
+ ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
+ sqshrun v27.8b, v30.8h,#7
+
+
+
+core_loop_16:
+
+ cmp x9,#0
+ sqadd v22.8h,v2.8h,v4.8h
+ asr x20,x6,#1
+ csel x9,x20,x9,eq
+ //asreq x9,x6,#1
+ mov v20.d[1],v21.d[0]
+ mov v26.d[1],v27.d[0]
+ st1 { v20.4s},[x2],x5
+ sqadd v28.8h,v10.8h,v12.8h
+ st1 { v26.4s},[x2],x10
+ add x20,x2,x14
+ csel x2, x20, x2,eq
+ sqadd v24.8h,v6.8h,v8.8h
+ subs x9,x9,#16
+ add x20,x0,x7
+ csel x0, x20, x0,eq
+ sqadd v26.8h,v14.8h,v16.8h
+
+ add x20,x1,x7
+ csel x1, x20, x1,eq
+ sub x20,x8,#2
+ csel x8,x20,x8,eq
+ cmp x8,#0
+ //subeqs x8,x8,#2 //decrement the ht by 2
+ beq epilog_16
+
+
+ sqadd v22.8h,v22.8h,v0.8h
+ ld1 { v2.8h},[x0],#16 //load and increment the pi2_src1
+ sqadd v28.8h,v28.8h,v0.8h
+ ld1 { v4.8h},[x1],#16 //load and increment the pi2_src2
+ sqadd v24.8h,v24.8h,v0.8h
+ ld1 { v10.8h},[x0],x11 //load and increment the pi2_src1
+ sqadd v30.8h,v26.8h,v0.8h
+ ld1 { v12.8h},[x1],x11 //load and increment the pi2_src2
+ sqshrun v20.8b, v22.8h,#7
+ ld1 { v6.8h},[x0],#16 //load and increment the pi2_src1 ii iteration
+ sqshrun v21.8b, v28.8h,#7
+ ld1 { v8.8h},[x1],#16 //load and increment the pi2_src2 ii iteration
+ sqshrun v26.8b, v24.8h,#7
+ ld1 { v14.8h},[x0],x12 //load and increment the pi2_src1 ii iteration
+ sqshrun v27.8b, v30.8h,#7
+ ld1 { v16.8h},[x1],x12 //load and increment the pi2_src2 ii iteration
+
+
+ b core_loop_16
+
+
+epilog_16:
+
+ sqadd v22.8h,v22.8h,v0.8h
+ sqadd v28.8h,v28.8h,v0.8h
+ sqadd v24.8h,v24.8h,v0.8h
+ sqadd v30.8h,v26.8h,v0.8h
+ sqshrun v20.8b, v22.8h,#7
+ sqshrun v21.8b, v28.8h,#7
+ sqshrun v26.8b, v24.8h,#7
+ sqshrun v27.8b, v30.8h,#7
+ mov v20.d[1],v21.d[0]
+ mov v26.d[1],v27.d[0]
+ st1 { v20.4s},[x2],x5
+ st1 { v26.4s},[x2]
+
+
+
+end_core_loop_16:
+
+
+
+
+
+
+
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/arm64/ihevc_weighted_pred_uni.s b/common/arm64/ihevc_weighted_pred_uni.s
new file mode 100644
index 0000000..d805230
--- /dev/null
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@@ -0,0 +1,245 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+//*******************************************************************************
+//* @file
+//* ihevc_weighted_pred_uni.s
+//*
+//* @brief
+//* contains function definitions for weighted prediction used in inter
+//* prediction
+//*
+//* @author
+//* parthiban v
+//*
+//* @par list of functions:
+//* - ihevc_weighted_pred_uni()
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+///**
+//*******************************************************************************
+//*
+//* @brief
+//* does uni-weighted prediction on the array pointed by pi2_src and stores
+//* the result at the location pointed by pu1_dst. assumptions: the function
+//* is optimized considering the fact that width and height are multiples of 2.
+//*
+//* @par description:
+//* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+//* offset
+//*
+//* @param[in] pi2_src
+//* pointer to the source
+//*
+//* @param[out] pu1_dst
+//* pointer to the destination
+//*
+//* @param[in] src_strd
+//* source stride
+//*
+//* @param[in] dst_strd
+//* destination stride
+//*
+//* @param[in] wgt0
+//* weight to be multiplied to the source
+//*
+//* @param[in] off0
+//* offset to be added after rounding and shifting
+//*
+//* @param[in] shift
+//* (14 bit depth) + log2_weight_denominator
+//*
+//* @param[in] lvl_shift
+//* added before shift and offset
+//*
+//* @param[in] ht
+//* height of the source
+//*
+//* @param[in] wd
+//* width of the source
+//*
+//* @returns
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************
+//*/
+
+//void ihevc_weighted_pred_uni(word16 *pi2_src,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 dst_strd,
+// word32 wgt0,
+// word32 off0,
+// word32 shift,
+// word32 lvl_shift,
+// word32 ht,
+// word32 wd)
+
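+// Scalar reference (an illustrative sketch, not part of the original sources;
+// CLIP_U8 is a placeholder for saturation to [0, 255]). Folding off0 into the
+// pre-shift constant, as the code below does, is exactly equivalent to adding
+// off0 after the shift, since off0 << shift has no low-order bits:
+//
+//     for(row = 0; row < ht; row++)
+//     {
+//         for(col = 0; col < wd; col++)
+//         {
+//             WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0
+//                           + (off0 << shift) + (1 << (shift - 1));
+//             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+//         }
+//         pi2_src += src_strd;
+//         pu1_dst += dst_strd;
+//     }
+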
+//**************variables vs registers*****************************************
+// x0 => *pi2_src
+// x1 => *pu1_dst
+// x2 => src_strd
+// x3 => dst_strd
+// x4 => wgt0
+// x5 => off0
+// x6 => shift
+// x7 => lvl_shift
+// x8 => ht
+// x9 => wd
+
+.text
+.align 4
+
+.include "ihevc_neon_macros.s"
+
+.globl ihevc_weighted_pred_uni_av8
+
+.type ihevc_weighted_pred_uni_av8, %function
+
+ihevc_weighted_pred_uni_av8:
+
+ // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
+
+ ldr w8,[sp,#0]
+ ldr w9,[sp,#8]
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+
+    mov x15,x4 // wgt0
+    mov x16,x5 // off0
+    mov x17,x6 // shift
+    mov x19,x7 // lvl_shift
+    mov x20,x8 // ht
+    mov x21,x9 // wd
+
+ mov x4,x15 //load wgt0
+ mov x7,x19 //load lvl_shift
+ mov x11,#1
+ mov x5,x16 //load off0
+ mul x10, x7, x4 //lvl_shift * wgt0
+ mov x6,x17 //load shift
+ mov x8,x20 //load ht
+ lsl x22,x5,x6
+ add x10,x10,x22 //lvl_shift * wgt0 + (off0 << shift)
+    mov x9,x21 //load wd
+ sub x12,x6,#1
+ mov v0.4h[0], w4 //moved for scalar multiplication
+ lsl x2,x2,#1
+ dup v28.4s,w6 //vmovq_n_s32(tmp_shift)
+ lsl x22,x11,x12
+ add x10,x10,x22 //tmp_lvl_shift += (1 << (shift - 1))
+ dup v30.4s,w10 //vmovq_n_s32(tmp_lvl_shift)
+ neg v28.4s, v28.4s
+ lsl x4,x9,#1
+
+ cmp x8,#0 //check ht == 0
+ beq end_loops //if equal, then end the function
+
+outer_loop:
+ cmp x9,#0 //check wd == 0
+ beq end_loops //if equal, then end the function
+
+core_loop:
+ add x5,x0,x2 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
+ add x6,x1,x3 //pu1_dst_tmp = pu1_dst + dst_strd
+ ld1 {v1.4h},[x0],#8 //load and increment the pi2_src
+ ld1 {v2.4h},[x5],x2 //load and increment the pi2_src_tmp ii iteration
+ smull v4.4s, v1.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)
+
+ add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
+ ld1 {v8.4h},[x5],x2 //load and increment the pi2_src iii iteration
+
+ smull v6.4s, v2.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
+ ld1 {v9.4h},[x5],x2 //load and increment the pi2_src_tmp iv iteration
+
+ sshl v4.4s,v4.4s,v28.4s
+ //vshl.s32 q2,q2,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t)
+ add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration
+
+ smull v10.4s, v8.4h, v0.4h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
+ sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1)
+
+ add v10.4s, v10.4s , v30.4s //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
+ //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
+
+ sshl v6.4s,v6.4s,v28.4s
+ //vshl.s32 q3,q3,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration
+
+ smull v12.4s, v9.4h, v0.4h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
+ uqxtn v4.8b, v4.8h //vqmovn_u16(sto_res_tmp3)
+
+ sshl v10.4s,v10.4s,v28.4s
+ //vshl.s32 q5,q5,q14 //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
+ sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration
+
+ add v12.4s, v12.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
+ //mov v7, v6 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration
+
+ sqxtun v10.4h, v10.4s //vqmovun_s32(sto_res_tmp1) iii iteration
+
+ sshl v12.4s,v12.4s,v28.4s
+ //vshl.s32 q6,q6,q14 //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
+ st1 {v4.s}[0],[x1],#4 //store pu1_dst i iteration
+ //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
+
+ uqxtn v6.8b, v6.8h //vqmovn_u16(sto_res_tmp3) ii iteration
+ st1 {v6.s}[0],[x6],x3 //store pu1_dst ii iteration
+
+ uqxtn v10.8b, v10.8h //vqmovn_u16(sto_res_tmp3) iii iteration
+ sqxtun v12.4h, v12.4s //vqmovun_s32(sto_res_tmp1) iv iteration
+
+ //mov v13, v12 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
+    st1 {v10.s}[0],[x6],x3            //store pu1_dst iii iteration
+ uqxtn v12.8b, v12.8h //vqmovn_u16(sto_res_tmp3) iv iteration
+
+ subs x9,x9,#4 //decrement wd by 4 and check for 0
+ st1 {v12.s}[0],[x6],x3 //store pu1_dst iv iteration
+ bgt core_loop //if greater than 0 repeat the core loop again
+
+end_core_loop:
+    sub x22,x4,x2,lsl #2        //4*src_strd - wd, negated below
+    neg x11, x22
+    subs x8,x8,#4               //decrement the ht by 4
+    add x0,x0,x11               //pi2_src + 4*src_strd - 2*wd(since pi2_src is 16 bit pointer double the increment with double the wd decrement)
+    asr x9,x4,#1                //restore wd (x4 = 2*wd)
+    sub x22,x9,x3,lsl #2        //4*dst_strd - wd, negated below
+    neg x12, x22
+    add x1,x1,x12               //pu1_dst + 4*dst_strd - wd
+    bgt core_loop               //if ht is greater than 0, process the next four rows
+
+end_loops:
+ // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
diff --git a/common/ihevc_buf_mgr.c b/common/ihevc_buf_mgr.c
new file mode 100644
index 0000000..b6e4f2a
--- /dev/null
+++ b/common/ihevc_buf_mgr.c
@@ -0,0 +1,402 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_buf_mgr.c
+*
+* @brief
+* Contains function definitions for buffer management
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_buf_mgr_init()
+* - ihevc_buf_mgr_add()
+* - ihevc_buf_mgr_get_next_free()
+* - ihevc_buf_mgr_check_free()
+* - ihevc_buf_mgr_release()
+* - ihevc_buf_mgr_set_status()
+* - ihevc_buf_mgr_get_status()
+* - ihevc_buf_mgr_get_buf()
+* - ihevc_buf_mgr_get_num_active_buf()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_buf_mgr.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Buffer manager initialization function.
+*
+* @par Description:
+* Initializes the buffer manager structure
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_buf_mgr_init(
+ buf_mgr_t *ps_buf_mgr)
+{
+ WORD32 id;
+
+ ps_buf_mgr->u4_max_buf_cnt = BUF_MGR_MAX_CNT;
+ ps_buf_mgr->u4_active_buf_cnt = 0;
+
+ for(id = 0; id < BUF_MGR_MAX_CNT; id++)
+ {
+ ps_buf_mgr->au4_status[id] = 0;
+ ps_buf_mgr->apv_ptr[id] = NULL;
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Adds a buffer to the buffer manager.
+*
+* @par Description:
+*  Adds a buffer to the buffer manager, provided the given id is not
+*  already holding a different buffer
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] pv_ptr
+* Pointer to the buffer to be added
+*
+* @returns Returns 0 on success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_add(
+ buf_mgr_t *ps_buf_mgr,
+ void *pv_ptr,
+ WORD32 buf_id)
+{
+
+ /* Check if buffer ID is within allowed range */
+ if(buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt)
+ {
+ return (-1);
+ }
+
+ /* Check if the current ID is being used to hold some other buffer */
+ if((ps_buf_mgr->apv_ptr[buf_id] != NULL) &&
+ (ps_buf_mgr->apv_ptr[buf_id] != pv_ptr))
+ {
+ return (-1);
+ }
+ ps_buf_mgr->apv_ptr[buf_id] = pv_ptr;
+
+ return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the next free buffer.
+*
+* @par Description:
+* Returns the next free buffer available and sets the corresponding status
+* to DEC
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] pi4_buf_id
+* Pointer to the id of the free buffer
+*
+* @returns Pointer to the free buffer
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void* ihevc_buf_mgr_get_next_free(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 *pi4_buf_id)
+{
+ WORD32 id;
+ void *pv_ret_ptr;
+
+ pv_ret_ptr = NULL;
+ for(id = 0; id < (WORD32)ps_buf_mgr->u4_max_buf_cnt; id++)
+ {
+ /* Check if the buffer is non-null and status is zero */
+ if((ps_buf_mgr->au4_status[id] == 0) && (ps_buf_mgr->apv_ptr[id]))
+ {
+ *pi4_buf_id = id;
+ /* DEC is set to 1 */
+ ps_buf_mgr->au4_status[id] = 1;
+ pv_ret_ptr = ps_buf_mgr->apv_ptr[id];
+ break;
+ }
+ }
+
+ return pv_ret_ptr;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Checks the buffer manager for free buffers available.
+*
+* @par Description:
+* Checks if there are any free buffers available
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns Returns 0 if available, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_check_free(
+ buf_mgr_t *ps_buf_mgr)
+{
+ UWORD32 id;
+
+ for(id = 0; id < ps_buf_mgr->u4_max_buf_cnt; id++)
+ {
+ if((ps_buf_mgr->au4_status[id] == 0) &&
+ (ps_buf_mgr->apv_ptr[id]))
+ {
+ return 1;
+ }
+ }
+
+ return 0;
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the status bits.
+*
+* @par Description:
+*  Resets the status bits given by the mask for the buffer corresponding
+*  to the given id
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer status to be released
+*
+* @param[in] mask
+* Contains the bits that are to be reset
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_release(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 buf_id,
+ UWORD32 mask)
+{
+ /* If the given id is pointing to an id which is not yet added */
+ if(buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt)
+ {
+ return (-1);
+ }
+
+ ps_buf_mgr->au4_status[buf_id] &= ~mask;
+
+ /* If both the REF and DISP are zero, DEC is set to zero */
+ if(ps_buf_mgr->au4_status[buf_id] == 1)
+ {
+ ps_buf_mgr->au4_status[buf_id] = 0;
+ }
+
+ return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets the status bit.
+*
+* @par Description:
+*  Sets the status bits given by the mask for the buffer corresponding to
+*  the given id
+*
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer whose status needs to be modified
+*
+*
+* @param[in] mask
+* Contains the bits that are to be set
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_buf_mgr_set_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 buf_id,
+ UWORD32 mask)
+{
+ if(buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt)
+ {
+ return (-1);
+ }
+
+
+ if((ps_buf_mgr->au4_status[buf_id] & mask) != 0)
+ {
+ return (-1);
+ }
+
+ ps_buf_mgr->au4_status[buf_id] |= mask;
+ return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Returns the status of the buffer.
+*
+* @par Description:
+* Returns the status of the buffer corresponding to the id
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer status required
+*
+* @returns Status of the buffer corresponding to the id
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+UWORD32 ihevc_buf_mgr_get_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 buf_id)
+{
+ return ps_buf_mgr->au4_status[buf_id];
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the buffer from the buffer manager
+*
+* @par Description:
+* Returns the pointer to the buffer corresponding to the id
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] buf_id
+* ID of the buffer required
+*
+* @returns Pointer to the buffer required
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void* ihevc_buf_mgr_get_buf(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 buf_id)
+{
+ return ps_buf_mgr->apv_ptr[buf_id];
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Gets the number of active buffers
+*
+* @par Description:
+*  Returns the number of active buffers in the buffer manager
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns number of active buffers
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+UWORD32 ihevc_buf_mgr_get_num_active_buf(
+ buf_mgr_t *ps_buf_mgr)
+{
+ return ps_buf_mgr->u4_max_buf_cnt;
+}
diff --git a/common/ihevc_buf_mgr.h b/common/ihevc_buf_mgr.h
new file mode 100644
index 0000000..7801a5c
--- /dev/null
+++ b/common/ihevc_buf_mgr.h
@@ -0,0 +1,113 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_buf_mgr.h
+*
+* @brief
+* Function declarations used for buffer management
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _BUF_MGR_H_
+#define _BUF_MGR_H_
+
+#define BUF_MGR_MAX_CNT 64
+
+#define BUF_MGR_DEC 1
+#define BUF_MGR_REF (1 << 1)
+#define BUF_MGR_DISP (1 << 2)
+
+typedef struct
+{
+ /**
+ * max_buf_cnt
+ */
+ UWORD32 u4_max_buf_cnt;
+
+ /**
+ * active_buf_cnt
+ */
+ UWORD32 u4_active_buf_cnt;
+ /**
+ * au4_status[BUF_MGR_MAX_CNT]
+ */
+ UWORD32 au4_status[BUF_MGR_MAX_CNT];
+ /* The last three bit of status are: */
+ /* Bit 0 - DEC */
+ /* Bit 1 - REF */
+ /* Bit 2 - DISP */
+
+ void *apv_ptr[BUF_MGR_MAX_CNT];
+}buf_mgr_t;
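+
+/*
+ * Typical usage (an illustrative sketch, not code from the decoder;
+ * pv_pic_buf stands for any caller-owned picture buffer):
+ *
+ *     buf_mgr_t s_buf_mgr;
+ *     WORD32 buf_id;
+ *     void *pv_buf;
+ *
+ *     ihevc_buf_mgr_init(&s_buf_mgr);
+ *     ihevc_buf_mgr_add(&s_buf_mgr, pv_pic_buf, 0);
+ *
+ *     pv_buf = ihevc_buf_mgr_get_next_free(&s_buf_mgr, &buf_id); // sets DEC
+ *     ihevc_buf_mgr_set_status(&s_buf_mgr, buf_id,
+ *                              BUF_MGR_REF | BUF_MGR_DISP);
+ *
+ *     // once the picture is neither referenced nor queued for display,
+ *     // releasing both bits leaves only DEC set, which release() clears:
+ *     ihevc_buf_mgr_release(&s_buf_mgr, buf_id, BUF_MGR_REF);
+ *     ihevc_buf_mgr_release(&s_buf_mgr, buf_id, BUF_MGR_DISP);
+ */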
+
+// initializes the buffer manager structure
+void ihevc_buf_mgr_init(
+ buf_mgr_t *ps_buf_mgr);
+
+// Add a buffer to the buffer manager. 0: success, -1: fail (buf_id out of range or already holding a different buffer)
+WORD32 ihevc_buf_mgr_add(
+ buf_mgr_t *ps_buf_mgr,
+ void *pv_ptr,
+ WORD32 buf_id);
+
+// this function will set the buffer status to DEC
+void* ihevc_buf_mgr_get_next_free(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 *pi4_id);
+
+// this function will check if there are any free buffers
+WORD32 ihevc_buf_mgr_check_free(
+ buf_mgr_t *ps_buf_mgr);
+
+// mask will have who released it: DISP:REF:DEC
+WORD32 ihevc_buf_mgr_release(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id,
+ UWORD32 mask);
+
+// sets the status to one or all of DISP:REF:DEC
+WORD32 ihevc_buf_mgr_set_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id,
+ UWORD32 mask);
+
+// Gets status of the buffer
+UWORD32 ihevc_buf_mgr_get_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id);
+
+// pass the ID - buffer will be returned
+void* ihevc_buf_mgr_get_buf(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id);
+
+// will return number of active buffers
+UWORD32 ihevc_buf_mgr_get_num_active_buf(
+ buf_mgr_t *ps_buf_mgr);
+
+
+
+#endif //_BUF_MGR_H_
diff --git a/common/ihevc_cabac_tables.c b/common/ihevc_cabac_tables.c
new file mode 100644
index 0000000..fb10f3e
--- /dev/null
+++ b/common/ihevc_cabac_tables.c
@@ -0,0 +1,3523 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+
+/**
+******************************************************************************
+* @file
+* ihevc_cabac_tables.c
+*
+* @brief
+* This file contains HEVC cabac tables for init contexts, rlps and
+* cabac state transitions
+*
+* @author
+* Ittiam
+*
+* @par List of Tables
+* - gau1_ihevc_cabac_rlps[]
+* - gau1_ihevc_next_state[]
+* - gau1_ihevc_cab_ctxts[]
+*
+******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_cabac_tables.h"
+
+
+/**
+ ******************************************************************************
+ * @brief Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx
+ * input : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3]
+ * output : RLps
+ *
+ * @remarks See Table 9-40 of HEVC spec for rangeTabLPS
+ ******************************************************************************
+ */
+const UWORD8 gau1_ihevc_cabac_rlps[64][4] =
+{
+ { 128, 176, 208, 240 },
+ { 128, 167, 197, 227 },
+ { 128, 158, 187, 216 },
+ { 123, 150, 178, 205 },
+ { 116, 142, 169, 195 },
+ { 111, 135, 160, 185 },
+ { 105, 128, 152, 175 },
+ { 100, 122, 144, 166 },
+ { 95, 116, 137, 158 },
+ { 90, 110, 130, 150 },
+ { 85, 104, 123, 142 },
+ { 81, 99, 117, 135 },
+ { 77, 94, 111, 128 },
+ { 73, 89, 105, 122 },
+ { 69, 85, 100, 116 },
+ { 66, 80, 95, 110 },
+ { 62, 76, 90, 104 },
+ { 59, 72, 86, 99 },
+ { 56, 69, 81, 94 },
+ { 53, 65, 77, 89 },
+ { 51, 62, 73, 85 },
+ { 48, 59, 69, 80 },
+ { 46, 56, 66, 76 },
+ { 43, 53, 63, 72 },
+ { 41, 50, 59, 69 },
+ { 39, 48, 56, 65 },
+ { 37, 45, 54, 62 },
+ { 35, 43, 51, 59 },
+ { 33, 41, 48, 56 },
+ { 32, 39, 46, 53 },
+ { 30, 37, 43, 50 },
+ { 29, 35, 41, 48 },
+ { 27, 33, 39, 45 },
+ { 26, 31, 37, 43 },
+ { 24, 30, 35, 41 },
+ { 23, 28, 33, 39 },
+ { 22, 27, 32, 37 },
+ { 21, 26, 30, 35 },
+ { 20, 24, 29, 33 },
+ { 19, 23, 27, 31 },
+ { 18, 22, 26, 30 },
+ { 17, 21, 25, 28 },
+ { 16, 20, 23, 27 },
+ { 15, 19, 22, 25 },
+ { 14, 18, 21, 24 },
+ { 14, 17, 20, 23 },
+ { 13, 16, 19, 22 },
+ { 12, 15, 18, 21 },
+ { 12, 14, 17, 20 },
+ { 11, 14, 16, 19 },
+ { 11, 13, 15, 18 },
+ { 10, 12, 15, 17 },
+ { 10, 12, 14, 16 },
+ { 9, 11, 13, 15 },
+ { 9, 11, 12, 14 },
+ { 8, 10, 12, 14 },
+ { 8, 9, 11, 13 },
+ { 7, 9, 11, 12 },
+ { 7, 9, 10, 12 },
+ { 7, 8, 10, 11 },
+ { 6, 8, 9, 11 },
+ { 6, 7, 9, 10 },
+ { 6, 7, 8, 9 },
+ { 2, 2, 2, 2 }
+};
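+
+/*
+ * Illustrative use in a decoder (a sketch, not code from these sources):
+ * with the packed context format documented further below (MPS in the lsb,
+ * pState in bits 1-6), the LPS sub-range for the current 9-bit range is
+ *
+ *     UWORD32 u4_rlps = gau1_ihevc_cabac_rlps[u4_ctxt >> 1]
+ *                                            [(u4_range >> 6) & 0x3];
+ */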
+
+
+/**
+ ******************************************************************************
+ * @brief probability + MPS state transition tables based on current state and bin
+ * input : curpState[bits7-2] | curMPS[bit1] | decodedBin[bit0]
+ * output : nextpState[bits6-1] | nextMPS[bit0]
+ * @remarks Modified form of Table-9-41 State Transition table in HEVC spec
+ ******************************************************************************
+ */
+const UWORD8 gau1_ihevc_next_state[64 * 2 * 2] =
+{
+/*****************************************************************************/
+/* m=0,b=0 | m=0,b=1 | m=1,b=0 | m=1,b=1 */
+/*****************************************************************************/
+ 2, 1, 0, 3, /* mps reversal for m=0,b=1 / m=1,b=0 */
+ 4, 0, 1, 5,
+ 6, 2, 3, 7,
+ 8, 4, 5, 9,
+ 10, 4, 5, 11,
+ 12, 8, 9, 13,
+ 14, 8, 9, 15,
+ 16, 10, 11, 17,
+ 18, 12, 13, 19,
+ 20, 14, 15, 21,
+ 22, 16, 17, 23,
+ 24, 18, 19, 25,
+ 26, 18, 19, 27,
+ 28, 22, 23, 29,
+ 30, 22, 23, 31,
+ 32, 24, 25, 33,
+ 34, 26, 27, 35,
+ 36, 26, 27, 37,
+ 38, 30, 31, 39,
+ 40, 30, 31, 41,
+ 42, 32, 33, 43,
+ 44, 32, 33, 45,
+ 46, 36, 37, 47,
+ 48, 36, 37, 49,
+ 50, 38, 39, 51,
+ 52, 38, 39, 53,
+ 54, 42, 43, 55,
+ 56, 42, 43, 57,
+ 58, 44, 45, 59,
+ 60, 44, 45, 61,
+ 62, 46, 47, 63,
+ 64, 48, 49, 65,
+ 66, 48, 49, 67,
+ 68, 50, 51, 69,
+ 70, 52, 53, 71,
+ 72, 52, 53, 73,
+ 74, 54, 55, 75,
+ 76, 54, 55, 77,
+ 78, 56, 57, 79,
+ 80, 58, 59, 81,
+ 82, 58, 59, 83,
+ 84, 60, 61, 85,
+ 86, 60, 61, 87,
+ 88, 60, 61, 89,
+ 90, 62, 63, 91,
+ 92, 64, 65, 93,
+ 94, 64, 65, 95,
+ 96, 66, 67, 97,
+ 98, 66, 67, 99,
+ 100, 66, 67, 101,
+ 102, 68, 69, 103,
+ 104, 68, 69, 105,
+ 106, 70, 71, 107,
+ 108, 70, 71, 109,
+ 110, 70, 71, 111,
+ 112, 72, 73, 113,
+ 114, 72, 73, 115,
+ 116, 72, 73, 117,
+ 118, 74, 75, 119,
+ 120, 74, 75, 121,
+ 122, 74, 75, 123,
+ 124, 76, 77, 125,
+ 124, 76, 77, 125,
+ 126, 126, 127, 127
+};
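+
+/*
+ * Illustrative use (a sketch, not code from these sources): because the
+ * table input is the packed context state shifted up by the decoded bin,
+ * one lookup updates the probability state and the MPS together:
+ *
+ *     u4_ctxt = gau1_ihevc_next_state[(u4_ctxt << 1) | u4_decoded_bin];
+ */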
+
+
+/*
+******************************************************************************
+* As per HEVC standard the Cabac values are generated using following logic
+* (ref: section 9.3.1.1 of JCTVC-J1003_d7_DIS)
+* From the 8 bit table entry initValue, the two 4 bit variables slopeIdx and
+* intersecIdx are derived according to the following pseudo-code process
+* slopeIdx = initValue >> 4
+* intersecIdx = initValue & 15
+*
+* Slope m and Intersec n are derived from the indices as follows:
+* m = slopeIdx*5 - 45
+* n = ( intersecIdx << 3 ) - 16
+*
+* The two values assigned to pStateIdx and valMPS for the initialization
+* are derived from SliceQPY, which is derived in Equation 7-35.
+*
+* Given the variable m and n, the initialization is specified by the following
+* pseudo-code process
+*
+* preCtxState = Clip3( 1, 126, ( ( m * Clip3( 0, 51, SliceQPY ) ) >> 4 ) + n )
+* valMPS = ( preCtxState <= 63) ? 0 : 1
+* pStateIdx = valMPS ? (preCtxState - 64) : (63 - preCtxState)
+******************************************************************************
+*/
+
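+/*
+ * The same derivation in C (an illustrative sketch; the function name and
+ * the CLIP3(min, max, x) clamping macro are placeholders, and the tables
+ * below are the precomputed result of this derivation for every
+ * (init_idc, qp) pair):
+ *
+ *     UWORD8 cab_init_ctxt(UWORD8 u1_init_value, WORD32 i4_slice_qp_y)
+ *     {
+ *         WORD32 m = (u1_init_value >> 4) * 5 - 45;
+ *         WORD32 n = ((u1_init_value & 15) << 3) - 16;
+ *         WORD32 pre_ctxt_state =
+ *             CLIP3(1, 126, ((m * CLIP3(0, 51, i4_slice_qp_y)) >> 4) + n);
+ *         WORD32 val_mps = (pre_ctxt_state <= 63) ? 0 : 1;
+ *         WORD32 p_state = val_mps ? (pre_ctxt_state - 64)
+ *                                  : (63 - pre_ctxt_state);
+ *         return (UWORD8)((p_state << 1) | val_mps); // pState[1-6] | MPS[0]
+ *     }
+ */
+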
+/**
+ ******************************************************************************
+ * @brief Init context tables for all combinations of qp and cabac_init_idc
+ * @remarks Packing format MPS in lsb and pState in bits[1-6]
+ ******************************************************************************
+ */
+
+const UWORD8 gau1_ihevc_cab_ctxts[IHEVC_NUM_CAB_IDC][IHEVC_MAX_QP][IHEVC_CAB_CTXT_END] =
+{
+ {
+ {
+ /* Context Tables for init_idc = 0, qp = 0 */
+
+ 14, 30, 17, 49, 49, 1, 81, 81, 81, 1,
+ 1, 81, 30, 81, 81, 81, 30, 81, 81, 81,
+ 81, 81, 81, 81, 81, 81, 81, 81, 81, 81,
+ 81, 14, 1, 1, 81, 49, 65, 1, 62, 1,
+ 17, 17, 65, 65, 33, 49, 33, 14, 49, 81,
+ 33, 49, 81, 81, 81, 81, 81, 33, 17, 81,
+ 65, 65, 33, 49, 33, 14, 49, 81, 33, 49,
+ 81, 81, 81, 81, 81, 33, 17, 81, 17, 17,
+ 62, 49, 81, 81, 49, 65, 65, 65, 33, 33,
+ 33, 17, 49, 49, 110, 14, 49, 17, 49, 49,
+ 110, 14, 49, 17, 49, 49, 110, 14, 49, 33,
+ 17, 62, 62, 30, 30, 30, 30, 14, 30, 17,
+ 81, 30, 17, 81, 33, 33, 14, 1, 33, 30,
+ 1, 17, 14, 1, 78, 33, 17, 17, 1, 30,
+ 33, 110, 62, 62, 33, 110, 1, 78, 1, 14,
+ 30, 46, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 1 */
+
+ 14, 30, 15, 47, 49, 1, 83, 83, 83, 1,
+ 1, 83, 30, 83, 83, 83, 30, 77, 83, 83,
+ 83, 83, 83, 83, 83, 83, 83, 83, 83, 83,
+ 83, 14, 0, 0, 79, 47, 61, 0, 62, 1,
+ 15, 15, 63, 63, 31, 47, 31, 14, 47, 79,
+ 31, 47, 79, 79, 79, 79, 77, 31, 15, 77,
+ 63, 63, 31, 47, 31, 14, 47, 79, 31, 47,
+ 79, 79, 79, 79, 77, 31, 15, 77, 13, 17,
+ 64, 47, 79, 79, 47, 63, 63, 61, 31, 31,
+ 31, 15, 47, 47, 110, 14, 47, 15, 47, 47,
+ 110, 14, 47, 15, 47, 47, 110, 14, 47, 31,
+ 15, 62, 62, 30, 32, 30, 32, 14, 32, 15,
+ 79, 32, 15, 79, 31, 29, 16, 0, 31, 30,
+ 0, 15, 14, 2, 78, 29, 15, 15, 0, 30,
+ 31, 110, 62, 62, 31, 108, 0, 78, 0, 14,
+ 32, 46, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 2 */
+
+ 14, 28, 15, 47, 49, 1, 87, 87, 87, 1,
+ 1, 87, 28, 87, 87, 87, 28, 73, 87, 87,
+ 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,
+ 87, 14, 0, 0, 77, 47, 59, 0, 60, 1,
+ 15, 15, 61, 61, 29, 45, 31, 14, 45, 77,
+ 31, 45, 77, 79, 77, 77, 73, 29, 13, 73,
+ 61, 61, 29, 45, 31, 14, 45, 77, 31, 45,
+ 77, 79, 77, 77, 73, 29, 13, 73, 11, 17,
+ 64, 47, 77, 77, 45, 61, 61, 59, 29, 29,
+ 29, 13, 45, 47, 108, 14, 45, 13, 45, 47,
+ 108, 14, 45, 13, 45, 47, 108, 14, 45, 31,
+ 15, 60, 60, 30, 32, 30, 32, 14, 32, 15,
+ 77, 32, 15, 77, 31, 27, 16, 0, 31, 30,
+ 0, 15, 14, 6, 78, 27, 15, 13, 2, 30,
+ 31, 108, 62, 60, 31, 104, 2, 76, 0, 14,
+ 32, 46, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 3 */
+
+ 14, 26, 15, 47, 49, 1, 91, 91, 91, 1,
+ 1, 91, 28, 91, 91, 91, 28, 69, 91, 91,
+ 91, 91, 91, 91, 91, 91, 91, 91, 91, 91,
+ 91, 14, 0, 0, 75, 47, 57, 0, 60, 1,
+ 15, 15, 59, 59, 29, 45, 31, 14, 45, 77,
+ 31, 43, 75, 79, 77, 75, 71, 27, 13, 69,
+ 59, 59, 29, 45, 31, 14, 45, 77, 31, 43,
+ 75, 79, 77, 75, 71, 27, 13, 69, 9, 17,
+ 64, 47, 75, 75, 45, 59, 59, 57, 29, 27,
+ 29, 11, 45, 47, 108, 14, 45, 11, 45, 47,
+ 108, 14, 45, 11, 45, 47, 108, 14, 45, 31,
+ 15, 60, 60, 30, 32, 30, 32, 14, 32, 15,
+ 75, 32, 15, 75, 31, 25, 16, 0, 31, 30,
+ 0, 15, 14, 8, 78, 25, 15, 11, 2, 30,
+ 31, 108, 62, 60, 31, 102, 2, 74, 0, 14,
+ 32, 46, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 4 */
+
+ 14, 24, 13, 45, 49, 1, 95, 95, 95, 1,
+ 1, 95, 26, 95, 95, 95, 26, 65, 95, 95,
+ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
+ 95, 14, 2, 2, 73, 45, 55, 2, 58, 1,
+ 13, 13, 57, 57, 27, 43, 29, 14, 43, 75,
+ 29, 41, 73, 77, 75, 73, 67, 25, 11, 65,
+ 57, 57, 27, 43, 29, 14, 43, 75, 29, 41,
+ 73, 77, 75, 73, 67, 25, 11, 65, 7, 19,
+ 66, 45, 73, 73, 43, 57, 57, 55, 27, 25,
+ 27, 9, 43, 45, 106, 14, 43, 9, 43, 45,
+ 106, 14, 43, 9, 43, 45, 106, 14, 43, 29,
+ 13, 58, 58, 30, 34, 30, 34, 14, 34, 13,
+ 73, 34, 13, 73, 29, 23, 18, 2, 29, 30,
+ 2, 13, 14, 12, 78, 23, 13, 9, 4, 30,
+ 29, 106, 60, 58, 29, 98, 4, 72, 2, 14,
+ 34, 44, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 5 */
+
+ 14, 22, 13, 45, 49, 1, 99, 99, 99, 1,
+ 1, 99, 24, 99, 99, 99, 24, 61, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 14, 2, 2, 71, 45, 51, 2, 56, 1,
+ 13, 13, 55, 55, 25, 41, 29, 14, 41, 73,
+ 29, 39, 71, 77, 73, 71, 65, 23, 9, 61,
+ 55, 55, 25, 41, 29, 14, 41, 73, 29, 39,
+ 71, 77, 73, 71, 65, 23, 9, 61, 3, 19,
+ 66, 45, 71, 71, 41, 55, 55, 51, 25, 23,
+ 25, 7, 41, 45, 104, 14, 41, 7, 41, 45,
+ 104, 14, 41, 7, 41, 45, 104, 14, 41, 29,
+ 13, 56, 56, 30, 34, 30, 34, 14, 34, 13,
+ 71, 34, 13, 71, 29, 19, 18, 2, 29, 30,
+ 2, 13, 14, 14, 78, 19, 13, 7, 6, 30,
+ 29, 104, 60, 56, 29, 96, 6, 70, 2, 14,
+ 34, 44, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 6 */
+
+ 14, 20, 13, 45, 49, 1, 103, 103, 103, 1,
+ 1, 103, 24, 103, 103, 103, 24, 57, 103, 103,
+ 103, 103, 103, 103, 103, 103, 103, 103, 103, 103,
+ 103, 14, 2, 2, 69, 45, 49, 2, 56, 1,
+ 13, 13, 53, 53, 25, 41, 29, 14, 41, 73,
+ 29, 37, 69, 77, 73, 69, 61, 21, 9, 57,
+ 53, 53, 25, 41, 29, 14, 41, 73, 29, 37,
+ 69, 77, 73, 69, 61, 21, 9, 57, 1, 19,
+ 66, 45, 69, 69, 41, 53, 53, 49, 25, 21,
+ 25, 5, 41, 45, 104, 14, 41, 5, 41, 45,
+ 104, 14, 41, 5, 41, 45, 104, 14, 41, 29,
+ 13, 56, 56, 30, 34, 30, 34, 14, 34, 13,
+ 69, 34, 13, 69, 29, 17, 18, 2, 29, 30,
+ 2, 13, 14, 18, 78, 17, 13, 5, 6, 30,
+ 29, 104, 60, 56, 29, 92, 6, 68, 2, 14,
+ 34, 44, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 7 */
+
+ 14, 18, 11, 43, 49, 1, 107, 107, 107, 1,
+ 1, 107, 22, 107, 107, 107, 22, 53, 107, 107,
+ 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
+ 107, 14, 4, 4, 67, 43, 47, 4, 54, 1,
+ 11, 11, 51, 51, 23, 39, 27, 14, 39, 71,
+ 27, 35, 67, 75, 71, 67, 59, 19, 7, 53,
+ 51, 51, 23, 39, 27, 14, 39, 71, 27, 35,
+ 67, 75, 71, 67, 59, 19, 7, 53, 0, 21,
+ 68, 43, 67, 67, 39, 51, 51, 47, 23, 19,
+ 23, 3, 39, 43, 102, 14, 39, 3, 39, 43,
+ 102, 14, 39, 3, 39, 43, 102, 14, 39, 27,
+ 11, 54, 54, 30, 36, 30, 36, 14, 36, 11,
+ 67, 36, 11, 67, 27, 15, 20, 4, 27, 30,
+ 4, 11, 14, 20, 78, 15, 11, 3, 8, 30,
+ 27, 102, 58, 54, 27, 90, 8, 66, 4, 14,
+ 36, 42, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 8 */
+
+ 14, 16, 11, 43, 49, 1, 111, 111, 111, 1,
+ 1, 111, 20, 111, 111, 111, 20, 51, 111, 111,
+ 111, 111, 111, 111, 111, 111, 111, 111, 111, 111,
+ 111, 14, 4, 4, 65, 43, 45, 4, 52, 1,
+ 11, 11, 49, 49, 23, 39, 27, 14, 39, 71,
+ 27, 33, 65, 75, 71, 65, 55, 17, 7, 51,
+ 49, 49, 23, 39, 27, 14, 39, 71, 27, 33,
+ 65, 75, 71, 65, 55, 17, 7, 51, 2, 21,
+ 68, 43, 65, 65, 39, 49, 49, 45, 23, 17,
+ 23, 1, 39, 43, 100, 14, 39, 1, 39, 43,
+ 100, 14, 39, 1, 39, 43, 100, 14, 39, 27,
+ 11, 52, 52, 30, 36, 30, 36, 14, 36, 11,
+ 65, 36, 11, 65, 27, 13, 20, 4, 27, 30,
+ 4, 11, 14, 24, 78, 13, 11, 1, 8, 30,
+ 27, 100, 58, 52, 27, 86, 8, 64, 4, 14,
+ 36, 42, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 9 */
+
+ 14, 14, 11, 43, 49, 1, 113, 113, 113, 1,
+ 1, 113, 20, 113, 113, 113, 20, 47, 113, 113,
+ 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+ 113, 14, 4, 4, 63, 43, 41, 4, 52, 1,
+ 11, 11, 47, 47, 21, 37, 27, 14, 37, 69,
+ 27, 31, 63, 75, 69, 63, 51, 15, 5, 47,
+ 47, 47, 21, 37, 27, 14, 37, 69, 27, 31,
+ 63, 75, 69, 63, 51, 15, 5, 47, 6, 21,
+ 68, 43, 63, 63, 37, 47, 47, 41, 21, 15,
+ 21, 0, 37, 43, 100, 14, 37, 0, 37, 43,
+ 100, 14, 37, 0, 37, 43, 100, 14, 37, 27,
+ 11, 52, 52, 30, 36, 30, 36, 14, 36, 11,
+ 63, 36, 11, 63, 27, 9, 20, 4, 27, 30,
+ 4, 11, 14, 28, 78, 9, 11, 0, 10, 30,
+ 27, 100, 58, 52, 27, 82, 10, 62, 4, 14,
+ 36, 42, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 10 */
+
+ 14, 12, 9, 41, 49, 1, 117, 117, 117, 1,
+ 1, 117, 18, 117, 117, 117, 18, 43, 117, 117,
+ 117, 117, 117, 117, 117, 117, 117, 117, 117, 117,
+ 117, 14, 6, 6, 61, 41, 39, 6, 50, 1,
+ 9, 9, 45, 45, 19, 35, 25, 14, 35, 67,
+ 25, 29, 61, 73, 67, 61, 49, 13, 3, 43,
+ 45, 45, 19, 35, 25, 14, 35, 67, 25, 29,
+ 61, 73, 67, 61, 49, 13, 3, 43, 8, 23,
+ 70, 41, 61, 61, 35, 45, 45, 39, 19, 13,
+ 19, 2, 35, 41, 98, 14, 35, 2, 35, 41,
+ 98, 14, 35, 2, 35, 41, 98, 14, 35, 25,
+ 9, 50, 50, 30, 38, 30, 38, 14, 38, 9,
+ 61, 38, 9, 61, 25, 7, 22, 6, 25, 30,
+ 6, 9, 14, 30, 78, 7, 9, 2, 12, 30,
+ 25, 98, 56, 50, 25, 80, 12, 60, 6, 14,
+ 38, 40, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 11 */
+
+ 14, 10, 9, 41, 49, 1, 121, 121, 121, 1,
+ 1, 121, 18, 121, 121, 121, 18, 39, 121, 121,
+ 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+ 121, 14, 6, 6, 59, 41, 37, 6, 50, 1,
+ 9, 9, 43, 43, 19, 35, 25, 14, 35, 67,
+ 25, 27, 59, 73, 67, 59, 45, 11, 3, 39,
+ 43, 43, 19, 35, 25, 14, 35, 67, 25, 27,
+ 59, 73, 67, 59, 45, 11, 3, 39, 10, 23,
+ 70, 41, 59, 59, 35, 43, 43, 37, 19, 11,
+ 19, 4, 35, 41, 98, 14, 35, 4, 35, 41,
+ 98, 14, 35, 4, 35, 41, 98, 14, 35, 25,
+ 9, 50, 50, 30, 38, 30, 38, 14, 38, 9,
+ 59, 38, 9, 59, 25, 5, 22, 6, 25, 30,
+ 6, 9, 14, 34, 78, 5, 9, 4, 12, 30,
+ 25, 98, 56, 50, 25, 76, 12, 58, 6, 14,
+ 38, 40, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 12 */
+
+ 14, 8, 9, 41, 49, 1, 125, 125, 125, 1,
+ 1, 125, 16, 125, 125, 125, 16, 35, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 6, 6, 57, 41, 35, 6, 48, 1,
+ 9, 9, 41, 41, 17, 33, 25, 14, 33, 65,
+ 25, 25, 57, 73, 65, 57, 43, 9, 1, 35,
+ 41, 41, 17, 33, 25, 14, 33, 65, 25, 25,
+ 57, 73, 65, 57, 43, 9, 1, 35, 12, 23,
+ 70, 41, 57, 57, 33, 41, 41, 35, 17, 9,
+ 17, 6, 33, 41, 96, 14, 33, 6, 33, 41,
+ 96, 14, 33, 6, 33, 41, 96, 14, 33, 25,
+ 9, 48, 48, 30, 38, 30, 38, 14, 38, 9,
+ 57, 38, 9, 57, 25, 3, 22, 6, 25, 30,
+ 6, 9, 14, 36, 78, 3, 9, 6, 14, 30,
+ 25, 96, 56, 48, 25, 74, 14, 56, 6, 14,
+ 38, 40, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 13 */
+
+ 14, 6, 7, 39, 49, 1, 125, 125, 125, 1,
+ 1, 125, 14, 125, 125, 125, 14, 31, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 8, 8, 55, 39, 31, 8, 46, 1,
+ 7, 7, 39, 39, 15, 31, 23, 14, 31, 63,
+ 23, 23, 55, 71, 63, 55, 39, 7, 0, 31,
+ 39, 39, 15, 31, 23, 14, 31, 63, 23, 23,
+ 55, 71, 63, 55, 39, 7, 0, 31, 16, 25,
+ 72, 39, 55, 55, 31, 39, 39, 31, 15, 7,
+ 15, 8, 31, 39, 94, 14, 31, 8, 31, 39,
+ 94, 14, 31, 8, 31, 39, 94, 14, 31, 23,
+ 7, 46, 46, 30, 40, 30, 40, 14, 40, 7,
+ 55, 40, 7, 55, 23, 0, 24, 8, 23, 30,
+ 8, 7, 14, 40, 78, 0, 7, 8, 16, 30,
+ 23, 94, 54, 46, 23, 70, 16, 54, 8, 14,
+ 40, 38, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 14 */
+
+ 14, 4, 7, 39, 49, 1, 125, 125, 125, 1,
+ 1, 125, 14, 125, 125, 125, 14, 27, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 8, 8, 53, 39, 29, 8, 46, 1,
+ 7, 7, 37, 37, 15, 31, 23, 14, 31, 63,
+ 23, 21, 53, 71, 63, 53, 37, 5, 0, 27,
+ 37, 37, 15, 31, 23, 14, 31, 63, 23, 21,
+ 53, 71, 63, 53, 37, 5, 0, 27, 18, 25,
+ 72, 39, 53, 53, 31, 37, 37, 29, 15, 5,
+ 15, 10, 31, 39, 94, 14, 31, 10, 31, 39,
+ 94, 14, 31, 10, 31, 39, 94, 14, 31, 23,
+ 7, 46, 46, 30, 40, 30, 40, 14, 40, 7,
+ 53, 40, 7, 53, 23, 2, 24, 8, 23, 30,
+ 8, 7, 14, 42, 78, 2, 7, 10, 16, 30,
+ 23, 94, 54, 46, 23, 68, 16, 52, 8, 14,
+ 40, 38, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 15 */
+
+ 14, 2, 7, 39, 49, 1, 125, 125, 125, 1,
+ 1, 125, 12, 125, 125, 125, 12, 23, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 8, 8, 51, 39, 27, 8, 44, 1,
+ 7, 7, 35, 35, 13, 29, 23, 14, 29, 61,
+ 23, 19, 51, 71, 61, 51, 33, 3, 2, 23,
+ 35, 35, 13, 29, 23, 14, 29, 61, 23, 19,
+ 51, 71, 61, 51, 33, 3, 2, 23, 20, 25,
+ 72, 39, 51, 51, 29, 35, 35, 27, 13, 3,
+ 13, 12, 29, 39, 92, 14, 29, 12, 29, 39,
+ 92, 14, 29, 12, 29, 39, 92, 14, 29, 23,
+ 7, 44, 44, 30, 40, 30, 40, 14, 40, 7,
+ 51, 40, 7, 51, 23, 4, 24, 8, 23, 30,
+ 8, 7, 14, 46, 78, 4, 7, 12, 18, 30,
+ 23, 92, 54, 44, 23, 64, 18, 50, 8, 14,
+ 40, 38, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 16 */
+
+ 14, 0, 7, 39, 49, 1, 125, 125, 125, 1,
+ 1, 125, 10, 125, 125, 125, 10, 21, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 8, 8, 51, 39, 25, 8, 42, 1,
+ 7, 7, 35, 35, 13, 29, 23, 14, 29, 61,
+ 23, 19, 51, 71, 61, 51, 31, 3, 2, 21,
+ 35, 35, 13, 29, 23, 14, 29, 61, 23, 19,
+ 51, 71, 61, 51, 31, 3, 2, 21, 22, 27,
+ 72, 39, 51, 51, 29, 35, 35, 25, 13, 3,
+ 13, 12, 29, 39, 90, 14, 29, 12, 29, 39,
+ 90, 14, 29, 12, 29, 39, 90, 14, 29, 23,
+ 7, 42, 42, 30, 40, 30, 40, 14, 40, 7,
+ 51, 40, 7, 51, 23, 6, 24, 8, 23, 30,
+ 8, 7, 14, 48, 78, 6, 7, 12, 18, 30,
+ 23, 90, 52, 42, 23, 60, 18, 48, 8, 14,
+ 40, 36, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 17 */
+
+ 14, 0, 5, 37, 49, 1, 125, 125, 125, 1,
+ 1, 125, 10, 125, 125, 125, 10, 17, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 10, 10, 49, 37, 21, 10, 42, 1,
+ 5, 5, 33, 33, 11, 27, 21, 14, 27, 59,
+ 21, 17, 49, 69, 59, 49, 27, 1, 4, 17,
+ 33, 33, 11, 27, 21, 14, 27, 59, 21, 17,
+ 49, 69, 59, 49, 27, 1, 4, 17, 26, 27,
+ 74, 37, 49, 49, 27, 33, 33, 21, 11, 1,
+ 11, 14, 27, 37, 90, 14, 27, 14, 27, 37,
+ 90, 14, 27, 14, 27, 37, 90, 14, 27, 21,
+ 5, 42, 42, 30, 42, 30, 42, 14, 42, 5,
+ 49, 42, 5, 49, 21, 10, 26, 10, 21, 30,
+ 10, 5, 14, 52, 78, 10, 5, 14, 20, 30,
+ 21, 90, 52, 42, 21, 58, 20, 48, 10, 14,
+ 42, 36, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 18 */
+
+ 14, 1, 5, 37, 49, 1, 125, 125, 125, 1,
+ 1, 125, 8, 125, 125, 125, 8, 13, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 10, 10, 47, 37, 19, 10, 40, 1,
+ 5, 5, 31, 31, 9, 25, 21, 14, 25, 57,
+ 21, 15, 47, 69, 57, 47, 23, 0, 6, 13,
+ 31, 31, 9, 25, 21, 14, 25, 57, 21, 15,
+ 47, 69, 57, 47, 23, 0, 6, 13, 28, 27,
+ 74, 37, 47, 47, 25, 31, 31, 19, 9, 0,
+ 9, 16, 25, 37, 88, 14, 25, 16, 25, 37,
+ 88, 14, 25, 16, 25, 37, 88, 14, 25, 21,
+ 5, 40, 40, 30, 42, 30, 42, 14, 42, 5,
+ 47, 42, 5, 47, 21, 12, 26, 10, 21, 30,
+ 10, 5, 14, 56, 78, 12, 5, 16, 22, 30,
+ 21, 88, 52, 40, 21, 54, 22, 46, 10, 14,
+ 42, 36, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 19 */
+
+ 14, 3, 5, 37, 49, 1, 125, 125, 125, 1,
+ 1, 125, 8, 125, 125, 125, 8, 9, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 10, 10, 45, 37, 17, 10, 40, 1,
+ 5, 5, 29, 29, 9, 25, 21, 14, 25, 57,
+ 21, 13, 45, 69, 57, 45, 21, 2, 6, 9,
+ 29, 29, 9, 25, 21, 14, 25, 57, 21, 13,
+ 45, 69, 57, 45, 21, 2, 6, 9, 30, 27,
+ 74, 37, 45, 45, 25, 29, 29, 17, 9, 2,
+ 9, 18, 25, 37, 88, 14, 25, 18, 25, 37,
+ 88, 14, 25, 18, 25, 37, 88, 14, 25, 21,
+ 5, 40, 40, 30, 42, 30, 42, 14, 42, 5,
+ 45, 42, 5, 45, 21, 14, 26, 10, 21, 30,
+ 10, 5, 14, 58, 78, 14, 5, 18, 22, 30,
+ 21, 88, 52, 40, 21, 52, 22, 44, 10, 14,
+ 42, 36, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 20 */
+
+ 14, 5, 3, 35, 49, 1, 125, 125, 125, 1,
+ 1, 125, 6, 125, 125, 125, 6, 5, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 12, 12, 43, 35, 15, 12, 38, 1,
+ 3, 3, 27, 27, 7, 23, 19, 14, 23, 55,
+ 19, 11, 43, 67, 55, 43, 17, 4, 8, 5,
+ 27, 27, 7, 23, 19, 14, 23, 55, 19, 11,
+ 43, 67, 55, 43, 17, 4, 8, 5, 32, 29,
+ 76, 35, 43, 43, 23, 27, 27, 15, 7, 4,
+ 7, 20, 23, 35, 86, 14, 23, 20, 23, 35,
+ 86, 14, 23, 20, 23, 35, 86, 14, 23, 19,
+ 3, 38, 38, 30, 44, 30, 44, 14, 44, 3,
+ 43, 44, 3, 43, 19, 16, 28, 12, 19, 30,
+ 12, 3, 14, 62, 78, 16, 3, 20, 24, 30,
+ 19, 86, 50, 38, 19, 48, 24, 42, 12, 14,
+ 44, 34, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 21 */
+
+ 14, 7, 3, 35, 49, 1, 125, 125, 125, 1,
+ 1, 125, 4, 125, 125, 125, 4, 1, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 12, 12, 41, 35, 11, 12, 36, 1,
+ 3, 3, 25, 25, 5, 21, 19, 14, 21, 53,
+ 19, 9, 41, 67, 53, 41, 15, 6, 10, 1,
+ 25, 25, 5, 21, 19, 14, 21, 53, 19, 9,
+ 41, 67, 53, 41, 15, 6, 10, 1, 36, 29,
+ 76, 35, 41, 41, 21, 25, 25, 11, 5, 6,
+ 5, 22, 21, 35, 84, 14, 21, 22, 21, 35,
+ 84, 14, 21, 22, 21, 35, 84, 14, 21, 19,
+ 3, 36, 36, 30, 44, 30, 44, 14, 44, 3,
+ 41, 44, 3, 41, 19, 20, 28, 12, 19, 30,
+ 12, 3, 14, 64, 78, 20, 3, 22, 26, 30,
+ 19, 84, 50, 36, 19, 46, 26, 40, 12, 14,
+ 44, 34, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 22 */
+
+ 14, 9, 3, 35, 49, 1, 125, 125, 125, 1,
+ 1, 125, 4, 125, 125, 125, 4, 2, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 12, 12, 39, 35, 9, 12, 36, 1,
+ 3, 3, 23, 23, 5, 21, 19, 14, 21, 53,
+ 19, 7, 39, 67, 53, 39, 11, 8, 10, 2,
+ 23, 23, 5, 21, 19, 14, 21, 53, 19, 7,
+ 39, 67, 53, 39, 11, 8, 10, 2, 38, 29,
+ 76, 35, 39, 39, 21, 23, 23, 9, 5, 8,
+ 5, 24, 21, 35, 84, 14, 21, 24, 21, 35,
+ 84, 14, 21, 24, 21, 35, 84, 14, 21, 19,
+ 3, 36, 36, 30, 44, 30, 44, 14, 44, 3,
+ 39, 44, 3, 39, 19, 22, 28, 12, 19, 30,
+ 12, 3, 14, 68, 78, 22, 3, 24, 26, 30,
+ 19, 84, 50, 36, 19, 42, 26, 38, 12, 14,
+ 44, 34, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 23 */
+
+ 14, 11, 1, 33, 49, 1, 125, 125, 125, 1,
+ 1, 125, 2, 125, 125, 125, 2, 6, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 14, 14, 37, 33, 7, 14, 34, 1,
+ 1, 1, 21, 21, 3, 19, 17, 14, 19, 51,
+ 17, 5, 37, 65, 51, 37, 9, 10, 12, 6,
+ 21, 21, 3, 19, 17, 14, 19, 51, 17, 5,
+ 37, 65, 51, 37, 9, 10, 12, 6, 40, 31,
+ 78, 33, 37, 37, 19, 21, 21, 7, 3, 10,
+ 3, 26, 19, 33, 82, 14, 19, 26, 19, 33,
+ 82, 14, 19, 26, 19, 33, 82, 14, 19, 17,
+ 1, 34, 34, 30, 46, 30, 46, 14, 46, 1,
+ 37, 46, 1, 37, 17, 24, 30, 14, 17, 30,
+ 14, 1, 14, 70, 78, 24, 1, 26, 28, 30,
+ 17, 82, 48, 34, 17, 40, 28, 36, 14, 14,
+ 46, 32, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 24 */
+
+ 14, 13, 1, 33, 49, 1, 125, 125, 125, 1,
+ 1, 125, 0, 125, 125, 125, 0, 8, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 14, 14, 35, 33, 5, 14, 32, 1,
+ 1, 1, 19, 19, 3, 19, 17, 14, 19, 51,
+ 17, 3, 35, 65, 51, 35, 5, 12, 12, 8,
+ 19, 19, 3, 19, 17, 14, 19, 51, 17, 3,
+ 35, 65, 51, 35, 5, 12, 12, 8, 42, 31,
+ 78, 33, 35, 35, 19, 19, 19, 5, 3, 12,
+ 3, 28, 19, 33, 80, 14, 19, 28, 19, 33,
+ 80, 14, 19, 28, 19, 33, 80, 14, 19, 17,
+ 1, 32, 32, 30, 46, 30, 46, 14, 46, 1,
+ 35, 46, 1, 35, 17, 26, 30, 14, 17, 30,
+ 14, 1, 14, 74, 78, 26, 1, 28, 28, 30,
+ 17, 80, 48, 32, 17, 36, 28, 34, 14, 14,
+ 46, 32, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 25 */
+
+ 14, 15, 1, 33, 49, 1, 125, 125, 125, 1,
+ 1, 125, 0, 125, 125, 125, 0, 12, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 14, 14, 33, 33, 1, 14, 32, 1,
+ 1, 1, 17, 17, 1, 17, 17, 14, 17, 49,
+ 17, 1, 33, 65, 49, 33, 1, 14, 14, 12,
+ 17, 17, 1, 17, 17, 14, 17, 49, 17, 1,
+ 33, 65, 49, 33, 1, 14, 14, 12, 46, 31,
+ 78, 33, 33, 33, 17, 17, 17, 1, 1, 14,
+ 1, 30, 17, 33, 80, 14, 17, 30, 17, 33,
+ 80, 14, 17, 30, 17, 33, 80, 14, 17, 17,
+ 1, 32, 32, 30, 46, 30, 46, 14, 46, 1,
+ 33, 46, 1, 33, 17, 30, 30, 14, 17, 30,
+ 14, 1, 14, 78, 78, 30, 1, 30, 30, 30,
+ 17, 80, 48, 32, 17, 32, 30, 32, 14, 14,
+ 46, 32, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 26 */
+
+ 14, 17, 0, 31, 49, 1, 125, 125, 125, 1,
+ 1, 125, 1, 125, 125, 125, 1, 16, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 16, 16, 31, 31, 0, 16, 30, 1,
+ 0, 0, 15, 15, 0, 15, 15, 14, 15, 47,
+ 15, 0, 31, 63, 47, 31, 0, 16, 16, 16,
+ 15, 15, 0, 15, 15, 14, 15, 47, 15, 0,
+ 31, 63, 47, 31, 0, 16, 16, 16, 48, 33,
+ 80, 31, 31, 31, 15, 15, 15, 0, 0, 16,
+ 0, 32, 15, 31, 78, 14, 15, 32, 15, 31,
+ 78, 14, 15, 32, 15, 31, 78, 14, 15, 15,
+ 0, 30, 30, 30, 48, 30, 48, 14, 48, 0,
+ 31, 48, 0, 31, 15, 32, 32, 16, 15, 30,
+ 16, 0, 14, 80, 78, 32, 0, 32, 32, 30,
+ 15, 78, 46, 30, 15, 30, 32, 30, 16, 14,
+ 48, 30, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 27 */
+
+ 14, 19, 0, 31, 49, 1, 125, 125, 125, 1,
+ 1, 125, 1, 125, 125, 125, 1, 20, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 16, 16, 29, 31, 2, 16, 30, 1,
+ 0, 0, 13, 13, 0, 15, 15, 14, 15, 47,
+ 15, 2, 29, 63, 47, 29, 4, 18, 16, 20,
+ 13, 13, 0, 15, 15, 14, 15, 47, 15, 2,
+ 29, 63, 47, 29, 4, 18, 16, 20, 50, 33,
+ 80, 31, 29, 29, 15, 13, 13, 2, 0, 18,
+ 0, 34, 15, 31, 78, 14, 15, 34, 15, 31,
+ 78, 14, 15, 34, 15, 31, 78, 14, 15, 15,
+ 0, 30, 30, 30, 48, 30, 48, 14, 48, 0,
+ 29, 48, 0, 29, 15, 34, 32, 16, 15, 30,
+ 16, 0, 14, 84, 78, 34, 0, 34, 32, 30,
+ 15, 78, 46, 30, 15, 26, 32, 28, 16, 14,
+ 48, 30, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 28 */
+
+ 14, 21, 0, 31, 49, 1, 125, 125, 125, 1,
+ 1, 125, 3, 125, 125, 125, 3, 24, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 16, 16, 27, 31, 4, 16, 28, 1,
+ 0, 0, 11, 11, 2, 13, 15, 14, 13, 45,
+ 15, 4, 27, 63, 45, 27, 6, 20, 18, 24,
+ 11, 11, 2, 13, 15, 14, 13, 45, 15, 4,
+ 27, 63, 45, 27, 6, 20, 18, 24, 52, 33,
+ 80, 31, 27, 27, 13, 11, 11, 4, 2, 20,
+ 2, 36, 13, 31, 76, 14, 13, 36, 13, 31,
+ 76, 14, 13, 36, 13, 31, 76, 14, 13, 15,
+ 0, 28, 28, 30, 48, 30, 48, 14, 48, 0,
+ 27, 48, 0, 27, 15, 36, 32, 16, 15, 30,
+ 16, 0, 14, 86, 78, 36, 0, 36, 34, 30,
+ 15, 76, 46, 28, 15, 24, 34, 26, 16, 14,
+ 48, 30, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 29 */
+
+ 14, 23, 2, 29, 49, 1, 125, 125, 125, 1,
+ 1, 125, 5, 125, 125, 125, 5, 28, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 18, 18, 25, 29, 8, 18, 26, 1,
+ 2, 2, 9, 9, 4, 11, 13, 14, 11, 43,
+ 13, 6, 25, 61, 43, 25, 10, 22, 20, 28,
+ 9, 9, 4, 11, 13, 14, 11, 43, 13, 6,
+ 25, 61, 43, 25, 10, 22, 20, 28, 56, 35,
+ 82, 29, 25, 25, 11, 9, 9, 8, 4, 22,
+ 4, 38, 11, 29, 74, 14, 11, 38, 11, 29,
+ 74, 14, 11, 38, 11, 29, 74, 14, 11, 13,
+ 2, 26, 26, 30, 50, 30, 50, 14, 50, 2,
+ 25, 50, 2, 25, 13, 40, 34, 18, 13, 30,
+ 18, 2, 14, 90, 78, 40, 2, 38, 36, 30,
+ 13, 74, 44, 26, 13, 20, 36, 24, 18, 14,
+ 50, 28, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 30 */
+
+ 14, 25, 2, 29, 49, 1, 125, 125, 125, 1,
+ 1, 125, 5, 125, 125, 125, 5, 32, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 18, 18, 23, 29, 10, 18, 26, 1,
+ 2, 2, 7, 7, 4, 11, 13, 14, 11, 43,
+ 13, 8, 23, 61, 43, 23, 12, 24, 20, 32,
+ 7, 7, 4, 11, 13, 14, 11, 43, 13, 8,
+ 23, 61, 43, 23, 12, 24, 20, 32, 58, 35,
+ 82, 29, 23, 23, 11, 7, 7, 10, 4, 24,
+ 4, 40, 11, 29, 74, 14, 11, 40, 11, 29,
+ 74, 14, 11, 40, 11, 29, 74, 14, 11, 13,
+ 2, 26, 26, 30, 50, 30, 50, 14, 50, 2,
+ 23, 50, 2, 23, 13, 42, 34, 18, 13, 30,
+ 18, 2, 14, 92, 78, 42, 2, 40, 36, 30,
+ 13, 74, 44, 26, 13, 18, 36, 22, 18, 14,
+ 50, 28, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 31 */
+
+ 14, 27, 2, 29, 49, 1, 125, 125, 125, 1,
+ 1, 125, 7, 125, 125, 125, 7, 36, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 18, 18, 21, 29, 12, 18, 24, 1,
+ 2, 2, 5, 5, 6, 9, 13, 14, 9, 41,
+ 13, 10, 21, 61, 41, 21, 16, 26, 22, 36,
+ 5, 5, 6, 9, 13, 14, 9, 41, 13, 10,
+ 21, 61, 41, 21, 16, 26, 22, 36, 60, 35,
+ 82, 29, 21, 21, 9, 5, 5, 12, 6, 26,
+ 6, 42, 9, 29, 72, 14, 9, 42, 9, 29,
+ 72, 14, 9, 42, 9, 29, 72, 14, 9, 13,
+ 2, 24, 24, 30, 50, 30, 50, 14, 50, 2,
+ 21, 50, 2, 21, 13, 44, 34, 18, 13, 30,
+ 18, 2, 14, 96, 78, 44, 2, 42, 38, 30,
+ 13, 72, 44, 24, 13, 14, 38, 20, 18, 14,
+ 50, 28, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 32 */
+
+ 14, 29, 2, 29, 49, 1, 125, 125, 125, 1,
+ 1, 125, 9, 125, 125, 125, 9, 38, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 18, 18, 21, 29, 14, 18, 22, 1,
+ 2, 2, 5, 5, 6, 9, 13, 14, 9, 41,
+ 13, 10, 21, 61, 41, 21, 18, 26, 22, 38,
+ 5, 5, 6, 9, 13, 14, 9, 41, 13, 10,
+ 21, 61, 41, 21, 18, 26, 22, 38, 62, 37,
+ 82, 29, 21, 21, 9, 5, 5, 14, 6, 26,
+ 6, 42, 9, 29, 70, 14, 9, 42, 9, 29,
+ 70, 14, 9, 42, 9, 29, 70, 14, 9, 13,
+ 2, 22, 22, 30, 50, 30, 50, 14, 50, 2,
+ 21, 50, 2, 21, 13, 46, 34, 18, 13, 30,
+ 18, 2, 14, 98, 78, 46, 2, 42, 38, 30,
+ 13, 70, 42, 22, 13, 10, 38, 18, 18, 14,
+ 50, 26, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 33 */
+
+ 14, 29, 4, 27, 49, 1, 125, 125, 125, 1,
+ 1, 125, 9, 125, 125, 125, 9, 42, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 20, 20, 19, 27, 18, 20, 22, 1,
+ 4, 4, 3, 3, 8, 7, 11, 14, 7, 39,
+ 11, 12, 19, 59, 39, 19, 22, 28, 24, 42,
+ 3, 3, 8, 7, 11, 14, 7, 39, 11, 12,
+ 19, 59, 39, 19, 22, 28, 24, 42, 66, 37,
+ 84, 27, 19, 19, 7, 3, 3, 18, 8, 28,
+ 8, 44, 7, 27, 70, 14, 7, 44, 7, 27,
+ 70, 14, 7, 44, 7, 27, 70, 14, 7, 11,
+ 4, 22, 22, 30, 52, 30, 52, 14, 52, 4,
+ 19, 52, 4, 19, 11, 50, 36, 20, 11, 30,
+ 20, 4, 14, 102, 78, 50, 4, 44, 40, 30,
+ 11, 70, 42, 22, 11, 8, 40, 18, 20, 14,
+ 52, 26, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 34 */
+
+ 14, 31, 4, 27, 49, 1, 125, 125, 125, 1,
+ 1, 125, 11, 125, 125, 125, 11, 46, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 20, 20, 17, 27, 20, 20, 20, 1,
+ 4, 4, 1, 1, 10, 5, 11, 14, 5, 37,
+ 11, 14, 17, 59, 37, 17, 26, 30, 26, 46,
+ 1, 1, 10, 5, 11, 14, 5, 37, 11, 14,
+ 17, 59, 37, 17, 26, 30, 26, 46, 68, 37,
+ 84, 27, 17, 17, 5, 1, 1, 20, 10, 30,
+ 10, 46, 5, 27, 68, 14, 5, 46, 5, 27,
+ 68, 14, 5, 46, 5, 27, 68, 14, 5, 11,
+ 4, 20, 20, 30, 52, 30, 52, 14, 52, 4,
+ 17, 52, 4, 17, 11, 52, 36, 20, 11, 30,
+ 20, 4, 14, 106, 78, 52, 4, 46, 42, 30,
+ 11, 68, 42, 20, 11, 4, 42, 16, 20, 14,
+ 52, 26, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 35 */
+
+ 14, 33, 4, 27, 49, 1, 125, 125, 125, 1,
+ 1, 125, 11, 125, 125, 125, 11, 50, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 20, 20, 15, 27, 22, 20, 20, 1,
+ 4, 4, 0, 0, 10, 5, 11, 14, 5, 37,
+ 11, 16, 15, 59, 37, 15, 28, 32, 26, 50,
+ 0, 0, 10, 5, 11, 14, 5, 37, 11, 16,
+ 15, 59, 37, 15, 28, 32, 26, 50, 70, 37,
+ 84, 27, 15, 15, 5, 0, 0, 22, 10, 32,
+ 10, 48, 5, 27, 68, 14, 5, 48, 5, 27,
+ 68, 14, 5, 48, 5, 27, 68, 14, 5, 11,
+ 4, 20, 20, 30, 52, 30, 52, 14, 52, 4,
+ 15, 52, 4, 15, 11, 54, 36, 20, 11, 30,
+ 20, 4, 14, 108, 78, 54, 4, 48, 42, 30,
+ 11, 68, 42, 20, 11, 2, 42, 14, 20, 14,
+ 52, 26, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 36 */
+
+ 14, 35, 6, 25, 49, 1, 125, 125, 125, 1,
+ 1, 125, 13, 125, 125, 125, 13, 54, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 22, 22, 13, 25, 24, 22, 18, 1,
+ 6, 6, 2, 2, 12, 3, 9, 14, 3, 35,
+ 9, 18, 13, 57, 35, 13, 32, 34, 28, 54,
+ 2, 2, 12, 3, 9, 14, 3, 35, 9, 18,
+ 13, 57, 35, 13, 32, 34, 28, 54, 72, 39,
+ 86, 25, 13, 13, 3, 2, 2, 24, 12, 34,
+ 12, 50, 3, 25, 66, 14, 3, 50, 3, 25,
+ 66, 14, 3, 50, 3, 25, 66, 14, 3, 9,
+ 6, 18, 18, 30, 54, 30, 54, 14, 54, 6,
+ 13, 54, 6, 13, 9, 56, 38, 22, 9, 30,
+ 22, 6, 14, 112, 78, 56, 6, 50, 44, 30,
+ 9, 66, 40, 18, 9, 1, 44, 12, 22, 14,
+ 54, 24, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 37 */
+
+ 14, 37, 6, 25, 49, 1, 125, 125, 125, 1,
+ 1, 125, 15, 125, 125, 125, 15, 58, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 22, 22, 11, 25, 28, 22, 16, 1,
+ 6, 6, 4, 4, 14, 1, 9, 14, 1, 33,
+ 9, 20, 11, 57, 33, 11, 34, 36, 30, 58,
+ 4, 4, 14, 1, 9, 14, 1, 33, 9, 20,
+ 11, 57, 33, 11, 34, 36, 30, 58, 76, 39,
+ 86, 25, 11, 11, 1, 4, 4, 28, 14, 36,
+ 14, 52, 1, 25, 64, 14, 1, 52, 1, 25,
+ 64, 14, 1, 52, 1, 25, 64, 14, 1, 9,
+ 6, 16, 16, 30, 54, 30, 54, 14, 54, 6,
+ 11, 54, 6, 11, 9, 60, 38, 22, 9, 30,
+ 22, 6, 14, 114, 78, 60, 6, 52, 46, 30,
+ 9, 64, 40, 16, 9, 3, 46, 10, 22, 14,
+ 54, 24, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 38 */
+
+ 14, 39, 6, 25, 49, 1, 125, 125, 125, 1,
+ 1, 125, 15, 125, 125, 125, 15, 62, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 22, 22, 9, 25, 30, 22, 16, 1,
+ 6, 6, 6, 6, 14, 1, 9, 14, 1, 33,
+ 9, 22, 9, 57, 33, 9, 38, 38, 30, 62,
+ 6, 6, 14, 1, 9, 14, 1, 33, 9, 22,
+ 9, 57, 33, 9, 38, 38, 30, 62, 78, 39,
+ 86, 25, 9, 9, 1, 6, 6, 30, 14, 38,
+ 14, 54, 1, 25, 64, 14, 1, 54, 1, 25,
+ 64, 14, 1, 54, 1, 25, 64, 14, 1, 9,
+ 6, 16, 16, 30, 54, 30, 54, 14, 54, 6,
+ 9, 54, 6, 9, 9, 62, 38, 22, 9, 30,
+ 22, 6, 14, 118, 78, 62, 6, 54, 46, 30,
+ 9, 64, 40, 16, 9, 7, 46, 8, 22, 14,
+ 54, 24, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 39 */
+
+ 14, 41, 8, 23, 49, 1, 125, 125, 125, 1,
+ 1, 125, 17, 125, 125, 125, 17, 66, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 24, 24, 7, 23, 32, 24, 14, 1,
+ 8, 8, 8, 8, 16, 0, 7, 14, 0, 31,
+ 7, 24, 7, 55, 31, 7, 40, 40, 32, 66,
+ 8, 8, 16, 0, 7, 14, 0, 31, 7, 24,
+ 7, 55, 31, 7, 40, 40, 32, 66, 80, 41,
+ 88, 23, 7, 7, 0, 8, 8, 32, 16, 40,
+ 16, 56, 0, 23, 62, 14, 0, 56, 0, 23,
+ 62, 14, 0, 56, 0, 23, 62, 14, 0, 7,
+ 8, 14, 14, 30, 56, 30, 56, 14, 56, 8,
+ 7, 56, 8, 7, 7, 64, 40, 24, 7, 30,
+ 24, 8, 14, 120, 78, 64, 8, 56, 48, 30,
+ 7, 62, 38, 14, 7, 9, 48, 6, 24, 14,
+ 56, 22, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 40 */
+
+ 14, 43, 8, 23, 49, 1, 125, 125, 125, 1,
+ 1, 125, 19, 125, 125, 125, 19, 68, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 24, 24, 5, 23, 34, 24, 12, 1,
+ 8, 8, 10, 10, 16, 0, 7, 14, 0, 31,
+ 7, 26, 5, 55, 31, 5, 44, 42, 32, 68,
+ 10, 10, 16, 0, 7, 14, 0, 31, 7, 26,
+ 5, 55, 31, 5, 44, 42, 32, 68, 82, 41,
+ 88, 23, 5, 5, 0, 10, 10, 34, 16, 42,
+ 16, 58, 0, 23, 60, 14, 0, 58, 0, 23,
+ 60, 14, 0, 58, 0, 23, 60, 14, 0, 7,
+ 8, 12, 12, 30, 56, 30, 56, 14, 56, 8,
+ 5, 56, 8, 5, 7, 66, 40, 24, 7, 30,
+ 24, 8, 14, 124, 78, 66, 8, 58, 48, 30,
+ 7, 60, 38, 12, 7, 13, 48, 4, 24, 14,
+ 56, 22, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 41 */
+
+ 14, 45, 8, 23, 49, 1, 125, 125, 125, 1,
+ 1, 125, 19, 125, 125, 125, 19, 72, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 24, 24, 3, 23, 38, 24, 12, 1,
+ 8, 8, 12, 12, 18, 2, 7, 14, 2, 29,
+ 7, 28, 3, 55, 29, 3, 48, 44, 34, 72,
+ 12, 12, 18, 2, 7, 14, 2, 29, 7, 28,
+ 3, 55, 29, 3, 48, 44, 34, 72, 86, 41,
+ 88, 23, 3, 3, 2, 12, 12, 38, 18, 44,
+ 18, 60, 2, 23, 60, 14, 2, 60, 2, 23,
+ 60, 14, 2, 60, 2, 23, 60, 14, 2, 7,
+ 8, 12, 12, 30, 56, 30, 56, 14, 56, 8,
+ 3, 56, 8, 3, 7, 70, 40, 24, 7, 30,
+ 24, 8, 14, 124, 78, 70, 8, 60, 50, 30,
+ 7, 60, 38, 12, 7, 17, 50, 2, 24, 14,
+ 56, 22, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 42 */
+
+ 14, 47, 10, 21, 49, 1, 125, 125, 125, 1,
+ 1, 125, 21, 125, 125, 125, 21, 76, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 26, 26, 1, 21, 40, 26, 10, 1,
+ 10, 10, 14, 14, 20, 4, 5, 14, 4, 27,
+ 5, 30, 1, 53, 27, 1, 50, 46, 36, 76,
+ 14, 14, 20, 4, 5, 14, 4, 27, 5, 30,
+ 1, 53, 27, 1, 50, 46, 36, 76, 88, 43,
+ 90, 21, 1, 1, 4, 14, 14, 40, 20, 46,
+ 20, 62, 4, 21, 58, 14, 4, 62, 4, 21,
+ 58, 14, 4, 62, 4, 21, 58, 14, 4, 5,
+ 10, 10, 10, 30, 58, 30, 58, 14, 58, 10,
+ 1, 58, 10, 1, 5, 72, 42, 26, 5, 30,
+ 26, 10, 14, 124, 78, 72, 10, 62, 52, 30,
+ 5, 58, 36, 10, 5, 19, 52, 0, 26, 14,
+ 58, 20, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 43 */
+
+ 14, 49, 10, 21, 49, 1, 125, 125, 125, 1,
+ 1, 125, 21, 125, 125, 125, 21, 80, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 26, 26, 0, 21, 42, 26, 10, 1,
+ 10, 10, 16, 16, 20, 4, 5, 14, 4, 27,
+ 5, 32, 0, 53, 27, 0, 54, 48, 36, 80,
+ 16, 16, 20, 4, 5, 14, 4, 27, 5, 32,
+ 0, 53, 27, 0, 54, 48, 36, 80, 90, 43,
+ 90, 21, 0, 0, 4, 16, 16, 42, 20, 48,
+ 20, 64, 4, 21, 58, 14, 4, 64, 4, 21,
+ 58, 14, 4, 64, 4, 21, 58, 14, 4, 5,
+ 10, 10, 10, 30, 58, 30, 58, 14, 58, 10,
+ 0, 58, 10, 0, 5, 74, 42, 26, 5, 30,
+ 26, 10, 14, 124, 78, 74, 10, 64, 52, 30,
+ 5, 58, 36, 10, 5, 23, 52, 1, 26, 14,
+ 58, 20, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 44 */
+
+ 14, 51, 10, 21, 49, 1, 125, 125, 125, 1,
+ 1, 125, 23, 125, 125, 125, 23, 84, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 26, 26, 2, 21, 44, 26, 8, 1,
+ 10, 10, 18, 18, 22, 6, 5, 14, 6, 25,
+ 5, 34, 2, 53, 25, 2, 56, 50, 38, 84,
+ 18, 18, 22, 6, 5, 14, 6, 25, 5, 34,
+ 2, 53, 25, 2, 56, 50, 38, 84, 92, 43,
+ 90, 21, 2, 2, 6, 18, 18, 44, 22, 50,
+ 22, 66, 6, 21, 56, 14, 6, 66, 6, 21,
+ 56, 14, 6, 66, 6, 21, 56, 14, 6, 5,
+ 10, 8, 8, 30, 58, 30, 58, 14, 58, 10,
+ 2, 58, 10, 2, 5, 76, 42, 26, 5, 30,
+ 26, 10, 14, 124, 78, 76, 10, 66, 54, 30,
+ 5, 56, 36, 8, 5, 25, 54, 3, 26, 14,
+ 58, 20, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 45 */
+
+ 14, 53, 12, 19, 49, 1, 125, 125, 125, 1,
+ 1, 125, 25, 125, 125, 125, 25, 88, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 28, 28, 4, 19, 48, 28, 6, 1,
+ 12, 12, 20, 20, 24, 8, 3, 14, 8, 23,
+ 3, 36, 4, 51, 23, 4, 60, 52, 40, 88,
+ 20, 20, 24, 8, 3, 14, 8, 23, 3, 36,
+ 4, 51, 23, 4, 60, 52, 40, 88, 96, 45,
+ 92, 19, 4, 4, 8, 20, 20, 48, 24, 52,
+ 24, 68, 8, 19, 54, 14, 8, 68, 8, 19,
+ 54, 14, 8, 68, 8, 19, 54, 14, 8, 3,
+ 12, 6, 6, 30, 60, 30, 60, 14, 60, 12,
+ 4, 60, 12, 4, 3, 80, 44, 28, 3, 30,
+ 28, 12, 14, 124, 78, 80, 12, 68, 56, 30,
+ 3, 54, 34, 6, 3, 29, 56, 5, 28, 14,
+ 60, 18, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 46 */
+
+ 14, 55, 12, 19, 49, 1, 125, 125, 125, 1,
+ 1, 125, 25, 125, 125, 125, 25, 92, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 28, 28, 6, 19, 50, 28, 6, 1,
+ 12, 12, 22, 22, 24, 8, 3, 14, 8, 23,
+ 3, 38, 6, 51, 23, 6, 62, 54, 40, 92,
+ 22, 22, 24, 8, 3, 14, 8, 23, 3, 38,
+ 6, 51, 23, 6, 62, 54, 40, 92, 98, 45,
+ 92, 19, 6, 6, 8, 22, 22, 50, 24, 54,
+ 24, 70, 8, 19, 54, 14, 8, 70, 8, 19,
+ 54, 14, 8, 70, 8, 19, 54, 14, 8, 3,
+ 12, 6, 6, 30, 60, 30, 60, 14, 60, 12,
+ 6, 60, 12, 6, 3, 82, 44, 28, 3, 30,
+ 28, 12, 14, 124, 78, 82, 12, 70, 56, 30,
+ 3, 54, 34, 6, 3, 31, 56, 7, 28, 14,
+ 60, 18, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 47 */
+
+ 14, 57, 12, 19, 49, 1, 125, 125, 125, 1,
+ 1, 125, 27, 125, 125, 125, 27, 96, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 28, 28, 8, 19, 52, 28, 4, 1,
+ 12, 12, 24, 24, 26, 10, 3, 14, 10, 21,
+ 3, 40, 8, 51, 21, 8, 66, 56, 42, 96,
+ 24, 24, 26, 10, 3, 14, 10, 21, 3, 40,
+ 8, 51, 21, 8, 66, 56, 42, 96, 100, 45,
+ 92, 19, 8, 8, 10, 24, 24, 52, 26, 56,
+ 26, 72, 10, 19, 52, 14, 10, 72, 10, 19,
+ 52, 14, 10, 72, 10, 19, 52, 14, 10, 3,
+ 12, 4, 4, 30, 60, 30, 60, 14, 60, 12,
+ 8, 60, 12, 8, 3, 84, 44, 28, 3, 30,
+ 28, 12, 14, 124, 78, 84, 12, 72, 58, 30,
+ 3, 52, 34, 4, 3, 35, 58, 9, 28, 14,
+ 60, 18, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 48 */
+
+ 14, 59, 12, 19, 49, 1, 125, 125, 125, 1,
+ 1, 125, 29, 125, 125, 125, 29, 98, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 28, 28, 8, 19, 54, 28, 2, 1,
+ 12, 12, 24, 24, 26, 10, 3, 14, 10, 21,
+ 3, 40, 8, 51, 21, 8, 68, 56, 42, 98,
+ 24, 24, 26, 10, 3, 14, 10, 21, 3, 40,
+ 8, 51, 21, 8, 68, 56, 42, 98, 102, 47,
+ 92, 19, 8, 8, 10, 24, 24, 54, 26, 56,
+ 26, 72, 10, 19, 50, 14, 10, 72, 10, 19,
+ 50, 14, 10, 72, 10, 19, 50, 14, 10, 3,
+ 12, 2, 2, 30, 60, 30, 60, 14, 60, 12,
+ 8, 60, 12, 8, 3, 86, 44, 28, 3, 30,
+ 28, 12, 14, 124, 78, 86, 12, 72, 58, 30,
+ 3, 50, 32, 2, 3, 39, 58, 11, 28, 14,
+ 60, 16, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 49 */
+
+ 14, 59, 14, 17, 49, 1, 125, 125, 125, 1,
+ 1, 125, 29, 125, 125, 125, 29, 102, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 30, 30, 10, 17, 58, 30, 2, 1,
+ 14, 14, 26, 26, 28, 12, 1, 14, 12, 19,
+ 1, 42, 10, 49, 19, 10, 72, 58, 44, 102,
+ 26, 26, 28, 12, 1, 14, 12, 19, 1, 42,
+ 10, 49, 19, 10, 72, 58, 44, 102, 106, 47,
+ 94, 17, 10, 10, 12, 26, 26, 58, 28, 58,
+ 28, 74, 12, 17, 50, 14, 12, 74, 12, 17,
+ 50, 14, 12, 74, 12, 17, 50, 14, 12, 1,
+ 14, 2, 2, 30, 62, 30, 62, 14, 62, 14,
+ 10, 62, 14, 10, 1, 90, 46, 30, 1, 30,
+ 30, 14, 14, 124, 78, 90, 14, 74, 60, 30,
+ 1, 50, 32, 2, 1, 41, 60, 11, 30, 14,
+ 62, 16, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 50 */
+
+ 14, 61, 14, 17, 49, 1, 125, 125, 125, 1,
+ 1, 125, 31, 125, 125, 125, 31, 106, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 30, 30, 12, 17, 60, 30, 0, 1,
+ 14, 14, 28, 28, 30, 14, 1, 14, 14, 17,
+ 1, 44, 12, 49, 17, 12, 76, 60, 46, 106,
+ 28, 28, 30, 14, 1, 14, 14, 17, 1, 44,
+ 12, 49, 17, 12, 76, 60, 46, 106, 108, 47,
+ 94, 17, 12, 12, 14, 28, 28, 60, 30, 60,
+ 30, 76, 14, 17, 48, 14, 14, 76, 14, 17,
+ 48, 14, 14, 76, 14, 17, 48, 14, 14, 1,
+ 14, 0, 0, 30, 62, 30, 62, 14, 62, 14,
+ 12, 62, 14, 12, 1, 92, 46, 30, 1, 30,
+ 30, 14, 14, 124, 78, 92, 14, 76, 62, 30,
+ 1, 48, 32, 0, 1, 45, 62, 13, 30, 14,
+ 62, 16, 30, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 0, qp = 51 */
+
+ 14, 63, 14, 17, 49, 1, 125, 125, 125, 1,
+ 1, 125, 31, 125, 125, 125, 31, 110, 125, 125,
+ 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+ 125, 14, 30, 30, 14, 17, 62, 30, 0, 1,
+ 14, 14, 30, 30, 30, 14, 1, 14, 14, 17,
+ 1, 46, 14, 49, 17, 14, 78, 62, 46, 110,
+ 30, 30, 30, 14, 1, 14, 14, 17, 1, 46,
+ 14, 49, 17, 14, 78, 62, 46, 110, 110, 47,
+ 94, 17, 14, 14, 14, 30, 30, 62, 30, 62,
+ 30, 78, 14, 17, 48, 14, 14, 78, 14, 17,
+ 48, 14, 14, 78, 14, 17, 48, 14, 14, 1,
+ 14, 0, 0, 30, 62, 30, 62, 14, 62, 14,
+ 14, 62, 14, 14, 1, 94, 46, 30, 1, 30,
+ 30, 14, 14, 124, 78, 94, 14, 78, 62, 30,
+ 1, 48, 32, 0, 1, 47, 62, 15, 30, 14,
+ 62, 16, 30, 30,
+ },
+
+ },
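+
+    /*
+     * The 52 tables above (one per qp) each hold 154 precomputed CABAC
+     * context states for init_idc = 0; the group(s) that follow repeat the
+     * same layout for the remaining init_idc values. The entries appear to
+     * pack a context as (pStateIdx << 1) | valMPS, which matches the
+     * observed 0..125 value range. Assuming the tables were generated
+     * offline with the usual H.264/HEVC-style initialisation rule (an
+     * assumption, since the generator is not part of this change), each
+     * entry would come from per-context (m, n) constants as:
+     *
+     *     preCtxState = Clip3(1, 126, ((m * Clip3(0, 51, qp)) >> 4) + n)
+     *     valMPS      = (preCtxState <= 63) ? 0 : 1
+     *     pStateIdx   = valMPS ? (preCtxState - 64) : (63 - preCtxState)
+     *
+     * Precomputing this per (init_idc, qp) pair lets slice startup
+     * initialise every context with a single table lookup instead of
+     * evaluating the arithmetic above for each context.
+     */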
+
+ {
+ {
+ /* Context Tables for init_idc = 1, qp = 0 */
+
+ 14, 14, 17, 17, 65, 1, 78, 14, 14, 1,
+ 1, 78, 1, 17, 1, 1, 1, 30, 65, 1,
+ 81, 81, 81, 81, 81, 14, 14, 33, 62, 30,
+ 81, 33, 1, 65, 14, 81, 78, 17, 46, 1,
+ 17, 17, 49, 65, 65, 65, 81, 81, 49, 81,
+ 65, 65, 65, 81, 81, 81, 65, 33, 17, 33,
+ 49, 65, 65, 65, 81, 81, 49, 81, 65, 65,
+ 65, 81, 81, 81, 65, 33, 17, 33, 14, 33,
+ 49, 1, 17, 1, 17, 14, 17, 17, 17, 81,
+ 14, 62, 46, 33, 30, 14, 1, 62, 46, 33,
+ 30, 14, 1, 62, 46, 33, 30, 14, 1, 1,
+ 14, 17, 17, 17, 14, 17, 14, 46, 46, 46,
+ 33, 46, 46, 33, 1, 94, 94, 46, 1, 30,
+ 46, 62, 62, 62, 78, 30, 14, 14, 30, 14,
+ 14, 124, 62, 46, 1, 46, 14, 62, 17, 46,
+ 17, 1, 17, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 1 */
+
+ 14, 14, 15, 15, 63, 1, 78, 14, 14, 1,
+ 1, 78, 1, 15, 1, 1, 1, 30, 63, 0,
+ 77, 77, 77, 75, 75, 14, 14, 31, 62, 30,
+ 77, 31, 0, 61, 14, 79, 78, 15, 46, 1,
+ 15, 15, 47, 63, 61, 63, 77, 77, 47, 79,
+ 63, 61, 63, 79, 79, 77, 61, 31, 15, 31,
+ 47, 63, 61, 63, 77, 77, 47, 79, 63, 61,
+ 63, 79, 79, 77, 61, 31, 15, 31, 16, 31,
+ 45, 1, 17, 1, 15, 14, 15, 15, 15, 77,
+ 14, 62, 46, 31, 32, 14, 1, 62, 46, 31,
+ 32, 14, 1, 62, 46, 31, 32, 14, 1, 1,
+ 14, 15, 15, 15, 16, 15, 16, 46, 46, 46,
+ 31, 46, 46, 31, 1, 94, 94, 46, 1, 30,
+ 46, 62, 62, 64, 78, 32, 14, 16, 32, 16,
+ 14, 124, 62, 46, 1, 46, 16, 62, 15, 46,
+ 13, 0, 15, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 2 */
+
+ 14, 12, 13, 15, 61, 1, 76, 12, 12, 1,
+ 1, 78, 1, 15, 1, 1, 1, 30, 61, 2,
+ 75, 73, 73, 71, 71, 14, 14, 31, 60, 30,
+ 73, 29, 0, 59, 14, 77, 78, 13, 46, 1,
+ 15, 15, 45, 61, 59, 61, 75, 73, 45, 77,
+ 61, 57, 61, 77, 77, 75, 59, 29, 13, 29,
+ 45, 61, 59, 61, 75, 73, 45, 77, 61, 57,
+ 61, 77, 77, 75, 59, 29, 13, 29, 18, 31,
+ 41, 1, 17, 1, 15, 14, 15, 13, 13, 73,
+ 14, 62, 44, 31, 32, 14, 1, 62, 44, 31,
+ 32, 14, 1, 62, 44, 31, 32, 14, 1, 1,
+ 14, 13, 13, 13, 18, 13, 18, 46, 46, 44,
+ 31, 46, 44, 31, 1, 92, 92, 46, 1, 30,
+ 46, 60, 60, 64, 78, 32, 14, 18, 32, 16,
+ 14, 124, 62, 46, 1, 46, 16, 60, 13, 46,
+ 11, 2, 13, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 3 */
+
+ 14, 12, 11, 15, 61, 1, 74, 12, 10, 1,
+ 1, 78, 1, 15, 1, 1, 1, 30, 59, 2,
+ 73, 71, 69, 65, 65, 14, 14, 31, 58, 30,
+ 71, 29, 0, 57, 14, 75, 78, 11, 46, 1,
+ 15, 15, 45, 59, 57, 59, 73, 71, 45, 75,
+ 59, 55, 59, 75, 75, 73, 57, 27, 13, 27,
+ 45, 59, 57, 59, 73, 71, 45, 75, 59, 55,
+ 59, 75, 75, 73, 57, 27, 13, 27, 18, 31,
+ 37, 1, 17, 1, 15, 14, 15, 13, 13, 69,
+ 14, 62, 44, 31, 32, 14, 1, 62, 44, 31,
+ 32, 14, 1, 62, 44, 31, 32, 14, 1, 1,
+ 14, 13, 13, 11, 18, 11, 18, 46, 46, 44,
+ 31, 46, 44, 31, 1, 90, 90, 46, 1, 30,
+ 46, 60, 60, 64, 78, 32, 14, 18, 32, 16,
+ 14, 122, 62, 46, 1, 46, 16, 60, 11, 46,
+ 9, 2, 11, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 4 */
+
+ 14, 10, 9, 13, 59, 1, 72, 10, 8, 1,
+ 1, 78, 1, 13, 1, 1, 1, 30, 57, 4,
+ 71, 67, 65, 61, 61, 14, 14, 29, 56, 28,
+ 67, 27, 2, 55, 14, 73, 78, 9, 44, 1,
+ 13, 13, 43, 57, 55, 57, 71, 67, 43, 73,
+ 57, 51, 57, 73, 73, 71, 55, 25, 11, 25,
+ 43, 57, 55, 57, 71, 67, 43, 73, 57, 51,
+ 57, 73, 73, 71, 55, 25, 11, 25, 20, 29,
+ 33, 1, 17, 1, 13, 14, 13, 11, 11, 65,
+ 14, 60, 42, 29, 34, 14, 1, 60, 42, 29,
+ 34, 14, 1, 60, 42, 29, 34, 14, 1, 3,
+ 14, 11, 11, 9, 20, 9, 20, 44, 46, 42,
+ 29, 46, 42, 29, 1, 88, 88, 44, 1, 30,
+ 44, 58, 58, 66, 78, 34, 14, 20, 34, 18,
+ 12, 120, 60, 44, 1, 44, 18, 58, 9, 44,
+ 7, 4, 9, 44,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 5 */
+
+ 14, 8, 7, 13, 57, 1, 70, 8, 6, 1,
+ 1, 78, 1, 13, 1, 1, 1, 30, 55, 6,
+ 67, 65, 61, 55, 55, 14, 14, 29, 54, 28,
+ 65, 25, 2, 51, 14, 71, 78, 7, 44, 1,
+ 13, 13, 41, 55, 51, 55, 67, 65, 41, 71,
+ 55, 49, 55, 71, 71, 67, 51, 23, 9, 23,
+ 41, 55, 51, 55, 67, 65, 41, 71, 55, 49,
+ 55, 71, 71, 67, 51, 23, 9, 23, 22, 29,
+ 29, 1, 17, 1, 13, 14, 13, 9, 9, 61,
+ 14, 60, 40, 29, 34, 14, 1, 60, 40, 29,
+ 34, 14, 1, 60, 40, 29, 34, 14, 1, 3,
+ 14, 9, 9, 7, 22, 7, 22, 44, 46, 40,
+ 29, 46, 40, 29, 1, 86, 86, 44, 1, 30,
+ 44, 56, 56, 66, 78, 34, 14, 22, 34, 18,
+ 12, 118, 60, 44, 1, 44, 18, 56, 7, 44,
+ 3, 6, 7, 44,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 6 */
+
+ 14, 8, 5, 13, 57, 1, 68, 8, 4, 1,
+ 1, 78, 1, 13, 1, 1, 1, 30, 53, 6,
+ 65, 61, 57, 51, 51, 14, 14, 29, 52, 28,
+ 61, 25, 2, 49, 14, 69, 78, 5, 44, 1,
+ 13, 13, 41, 53, 49, 53, 65, 61, 41, 69,
+ 53, 45, 53, 69, 69, 65, 49, 21, 9, 21,
+ 41, 53, 49, 53, 65, 61, 41, 69, 53, 45,
+ 53, 69, 69, 65, 49, 21, 9, 21, 22, 29,
+ 25, 1, 17, 1, 13, 14, 13, 9, 9, 57,
+ 14, 60, 40, 29, 34, 14, 1, 60, 40, 29,
+ 34, 14, 1, 60, 40, 29, 34, 14, 1, 3,
+ 14, 9, 9, 5, 22, 5, 22, 44, 46, 40,
+ 29, 46, 40, 29, 1, 84, 84, 44, 1, 30,
+ 44, 56, 56, 66, 78, 34, 14, 22, 34, 18,
+ 12, 116, 60, 44, 1, 44, 18, 56, 5, 44,
+ 1, 6, 5, 44,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 7 */
+
+ 14, 6, 3, 11, 55, 1, 66, 6, 2, 1,
+ 1, 78, 1, 11, 1, 1, 1, 30, 51, 8,
+ 63, 59, 53, 45, 45, 14, 14, 27, 50, 26,
+ 59, 23, 4, 47, 14, 67, 78, 3, 42, 1,
+ 11, 11, 39, 51, 47, 51, 63, 59, 39, 67,
+ 51, 43, 51, 67, 67, 63, 47, 19, 7, 19,
+ 39, 51, 47, 51, 63, 59, 39, 67, 51, 43,
+ 51, 67, 67, 63, 47, 19, 7, 19, 24, 27,
+ 21, 1, 17, 1, 11, 14, 11, 7, 7, 53,
+ 14, 58, 38, 27, 36, 14, 1, 58, 38, 27,
+ 36, 14, 1, 58, 38, 27, 36, 14, 1, 5,
+ 14, 7, 7, 3, 24, 3, 24, 42, 46, 38,
+ 27, 46, 38, 27, 1, 82, 82, 42, 1, 30,
+ 42, 54, 54, 68, 78, 36, 14, 24, 36, 20,
+ 10, 114, 58, 42, 1, 42, 20, 54, 3, 42,
+ 0, 8, 3, 42,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 8 */
+
+ 14, 4, 1, 11, 55, 1, 64, 4, 0, 1,
+ 1, 78, 1, 11, 1, 1, 1, 30, 49, 8,
+ 61, 55, 51, 41, 41, 14, 14, 27, 48, 26,
+ 55, 23, 4, 45, 14, 65, 78, 1, 42, 1,
+ 11, 11, 39, 49, 45, 49, 61, 55, 39, 65,
+ 49, 39, 49, 65, 65, 61, 45, 17, 7, 17,
+ 39, 49, 45, 49, 61, 55, 39, 65, 49, 39,
+ 49, 65, 65, 61, 45, 17, 7, 17, 24, 27,
+ 19, 1, 17, 1, 11, 14, 11, 7, 7, 51,
+ 14, 58, 36, 27, 36, 14, 1, 58, 36, 27,
+ 36, 14, 1, 58, 36, 27, 36, 14, 1, 5,
+ 14, 7, 7, 1, 24, 1, 24, 42, 46, 36,
+ 27, 46, 36, 27, 1, 80, 80, 42, 1, 30,
+ 42, 52, 52, 68, 78, 36, 14, 24, 36, 20,
+ 10, 112, 58, 42, 1, 42, 20, 52, 1, 42,
+ 2, 8, 1, 42,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 9 */
+
+ 14, 4, 0, 11, 53, 1, 62, 4, 1, 1,
+ 1, 78, 1, 11, 1, 1, 1, 30, 47, 10,
+ 57, 51, 47, 35, 35, 14, 14, 27, 46, 26,
+ 51, 21, 4, 41, 14, 63, 78, 0, 42, 1,
+ 11, 11, 37, 47, 41, 47, 57, 51, 37, 63,
+ 47, 35, 47, 63, 63, 57, 41, 15, 5, 15,
+ 37, 47, 41, 47, 57, 51, 37, 63, 47, 35,
+ 47, 63, 63, 57, 41, 15, 5, 15, 26, 27,
+ 15, 1, 17, 1, 11, 14, 11, 5, 5, 47,
+ 14, 58, 36, 27, 36, 14, 1, 58, 36, 27,
+ 36, 14, 1, 58, 36, 27, 36, 14, 1, 5,
+ 14, 5, 5, 0, 26, 0, 26, 42, 46, 36,
+ 27, 46, 36, 27, 1, 78, 78, 42, 1, 30,
+ 42, 52, 52, 68, 78, 36, 14, 26, 36, 20,
+ 10, 110, 58, 42, 1, 42, 20, 52, 0, 42,
+ 6, 10, 0, 42,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 10 */
+
+ 14, 2, 2, 9, 51, 1, 60, 2, 3, 1,
+ 1, 78, 1, 9, 1, 1, 1, 30, 45, 12,
+ 55, 49, 43, 31, 31, 14, 14, 25, 44, 24,
+ 49, 19, 6, 39, 14, 61, 78, 2, 40, 1,
+ 9, 9, 35, 45, 39, 45, 55, 49, 35, 61,
+ 45, 33, 45, 61, 61, 55, 39, 13, 3, 13,
+ 35, 45, 39, 45, 55, 49, 35, 61, 45, 33,
+ 45, 61, 61, 55, 39, 13, 3, 13, 28, 25,
+ 11, 1, 17, 1, 9, 14, 9, 3, 3, 43,
+ 14, 56, 34, 25, 38, 14, 1, 56, 34, 25,
+ 38, 14, 1, 56, 34, 25, 38, 14, 1, 7,
+ 14, 3, 3, 2, 28, 2, 28, 40, 46, 34,
+ 25, 46, 34, 25, 1, 76, 76, 40, 1, 30,
+ 40, 50, 50, 70, 78, 38, 14, 28, 38, 22,
+ 8, 108, 56, 40, 1, 40, 22, 50, 2, 40,
+ 8, 12, 2, 40,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 11 */
+
+ 14, 2, 4, 9, 51, 1, 58, 2, 5, 1,
+ 1, 78, 1, 9, 1, 1, 1, 30, 43, 12,
+ 53, 45, 39, 25, 25, 14, 14, 25, 42, 24,
+ 45, 19, 6, 37, 14, 59, 78, 4, 40, 1,
+ 9, 9, 35, 43, 37, 43, 53, 45, 35, 59,
+ 43, 29, 43, 59, 59, 53, 37, 11, 3, 11,
+ 35, 43, 37, 43, 53, 45, 35, 59, 43, 29,
+ 43, 59, 59, 53, 37, 11, 3, 11, 28, 25,
+ 7, 1, 17, 1, 9, 14, 9, 3, 3, 39,
+ 14, 56, 34, 25, 38, 14, 1, 56, 34, 25,
+ 38, 14, 1, 56, 34, 25, 38, 14, 1, 7,
+ 14, 3, 3, 4, 28, 4, 28, 40, 46, 34,
+ 25, 46, 34, 25, 1, 74, 74, 40, 1, 30,
+ 40, 50, 50, 70, 78, 38, 14, 28, 38, 22,
+ 8, 106, 56, 40, 1, 40, 22, 50, 4, 40,
+ 10, 12, 4, 40,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 12 */
+
+ 14, 0, 6, 9, 49, 1, 56, 0, 7, 1,
+ 1, 78, 1, 9, 1, 1, 1, 30, 41, 14,
+ 51, 43, 35, 21, 21, 14, 14, 25, 40, 24,
+ 43, 17, 6, 35, 14, 57, 78, 6, 40, 1,
+ 9, 9, 33, 41, 35, 41, 51, 43, 33, 57,
+ 41, 27, 41, 57, 57, 51, 35, 9, 1, 9,
+ 33, 41, 35, 41, 51, 43, 33, 57, 41, 27,
+ 41, 57, 57, 51, 35, 9, 1, 9, 30, 25,
+ 3, 1, 17, 1, 9, 14, 9, 1, 1, 35,
+ 14, 56, 32, 25, 38, 14, 1, 56, 32, 25,
+ 38, 14, 1, 56, 32, 25, 38, 14, 1, 7,
+ 14, 1, 1, 6, 30, 6, 30, 40, 46, 32,
+ 25, 46, 32, 25, 1, 72, 72, 40, 1, 30,
+ 40, 48, 48, 70, 78, 38, 14, 30, 38, 22,
+ 8, 104, 56, 40, 1, 40, 22, 48, 6, 40,
+ 12, 14, 6, 40,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 13 */
+
+ 14, 1, 8, 7, 47, 1, 54, 1, 9, 1,
+ 1, 78, 1, 7, 1, 1, 1, 30, 39, 16,
+ 47, 39, 31, 15, 15, 14, 14, 23, 38, 22,
+ 39, 15, 8, 31, 14, 55, 78, 8, 38, 1,
+ 7, 7, 31, 39, 31, 39, 47, 39, 31, 55,
+ 39, 23, 39, 55, 55, 47, 31, 7, 0, 7,
+ 31, 39, 31, 39, 47, 39, 31, 55, 39, 23,
+ 39, 55, 55, 47, 31, 7, 0, 7, 32, 23,
+ 0, 1, 17, 1, 7, 14, 7, 0, 0, 31,
+ 14, 54, 30, 23, 40, 14, 1, 54, 30, 23,
+ 40, 14, 1, 54, 30, 23, 40, 14, 1, 9,
+ 14, 0, 0, 8, 32, 8, 32, 38, 46, 30,
+ 23, 46, 30, 23, 1, 70, 70, 38, 1, 30,
+ 38, 46, 46, 72, 78, 40, 14, 32, 40, 24,
+ 6, 102, 54, 38, 1, 38, 24, 46, 8, 38,
+ 16, 16, 8, 38,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 14 */
+
+ 14, 1, 10, 7, 47, 1, 52, 1, 11, 1,
+ 1, 78, 1, 7, 1, 1, 1, 30, 37, 16,
+ 45, 37, 27, 11, 11, 14, 14, 23, 36, 22,
+ 37, 15, 8, 29, 14, 53, 78, 10, 38, 1,
+ 7, 7, 31, 37, 29, 37, 45, 37, 31, 53,
+ 37, 21, 37, 53, 53, 45, 29, 5, 0, 5,
+ 31, 37, 29, 37, 45, 37, 31, 53, 37, 21,
+ 37, 53, 53, 45, 29, 5, 0, 5, 32, 23,
+ 4, 1, 17, 1, 7, 14, 7, 0, 0, 27,
+ 14, 54, 30, 23, 40, 14, 1, 54, 30, 23,
+ 40, 14, 1, 54, 30, 23, 40, 14, 1, 9,
+ 14, 0, 0, 10, 32, 10, 32, 38, 46, 30,
+ 23, 46, 30, 23, 1, 68, 68, 38, 1, 30,
+ 38, 46, 46, 72, 78, 40, 14, 32, 40, 24,
+ 6, 100, 54, 38, 1, 38, 24, 46, 10, 38,
+ 18, 16, 10, 38,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 15 */
+
+ 14, 3, 12, 7, 45, 1, 50, 3, 13, 1,
+ 1, 78, 1, 7, 1, 1, 1, 30, 35, 18,
+ 43, 33, 23, 5, 5, 14, 14, 23, 34, 22,
+ 33, 13, 8, 27, 14, 51, 78, 12, 38, 1,
+ 7, 7, 29, 35, 27, 35, 43, 33, 29, 51,
+ 35, 17, 35, 51, 51, 43, 27, 3, 2, 3,
+ 29, 35, 27, 35, 43, 33, 29, 51, 35, 17,
+ 35, 51, 51, 43, 27, 3, 2, 3, 34, 23,
+ 8, 1, 17, 1, 7, 14, 7, 2, 2, 23,
+ 14, 54, 28, 23, 40, 14, 1, 54, 28, 23,
+ 40, 14, 1, 54, 28, 23, 40, 14, 1, 9,
+ 14, 2, 2, 12, 34, 12, 34, 38, 46, 28,
+ 23, 46, 28, 23, 1, 66, 66, 38, 1, 30,
+ 38, 44, 44, 72, 78, 40, 14, 34, 40, 24,
+ 6, 98, 54, 38, 1, 38, 24, 44, 12, 38,
+ 20, 18, 12, 38,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 16 */
+
+ 14, 5, 12, 7, 45, 1, 48, 5, 15, 1,
+ 1, 78, 1, 7, 1, 1, 1, 30, 35, 18,
+ 41, 31, 21, 1, 1, 14, 14, 23, 32, 20,
+ 31, 13, 8, 25, 14, 51, 78, 12, 36, 1,
+ 7, 7, 29, 35, 25, 35, 41, 31, 29, 51,
+ 35, 15, 35, 51, 51, 41, 25, 3, 2, 3,
+ 29, 35, 25, 35, 41, 31, 29, 51, 35, 15,
+ 35, 51, 51, 41, 25, 3, 2, 3, 34, 23,
+ 10, 1, 17, 1, 7, 14, 7, 2, 2, 21,
+ 14, 52, 26, 23, 40, 14, 1, 52, 26, 23,
+ 40, 14, 1, 52, 26, 23, 40, 14, 1, 11,
+ 14, 2, 2, 12, 34, 12, 34, 36, 46, 26,
+ 23, 46, 26, 23, 1, 64, 64, 36, 1, 30,
+ 36, 42, 42, 72, 78, 40, 14, 34, 40, 24,
+ 4, 96, 52, 36, 1, 36, 24, 42, 12, 36,
+ 22, 18, 12, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 17 */
+
+ 14, 5, 14, 5, 43, 1, 48, 5, 15, 1,
+ 1, 78, 1, 5, 1, 1, 1, 30, 33, 20,
+ 37, 27, 17, 4, 4, 14, 14, 21, 32, 20,
+ 27, 11, 10, 21, 14, 49, 78, 14, 36, 1,
+ 5, 5, 27, 33, 21, 33, 37, 27, 27, 49,
+ 33, 11, 33, 49, 49, 37, 21, 1, 4, 1,
+ 27, 33, 21, 33, 37, 27, 27, 49, 33, 11,
+ 33, 49, 49, 37, 21, 1, 4, 1, 36, 21,
+ 14, 1, 17, 1, 5, 14, 5, 4, 4, 17,
+ 14, 52, 26, 21, 42, 14, 1, 52, 26, 21,
+ 42, 14, 1, 52, 26, 21, 42, 14, 1, 11,
+ 14, 4, 4, 14, 36, 14, 36, 36, 46, 26,
+ 21, 46, 26, 21, 1, 64, 64, 36, 1, 30,
+ 36, 42, 42, 74, 78, 42, 14, 36, 42, 26,
+ 4, 96, 52, 36, 1, 36, 26, 42, 14, 36,
+ 26, 20, 14, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 18 */
+
+ 14, 7, 16, 5, 41, 1, 46, 7, 17, 1,
+ 1, 78, 1, 5, 1, 1, 1, 30, 31, 22,
+ 35, 23, 13, 8, 8, 14, 14, 21, 30, 20,
+ 23, 9, 10, 19, 14, 47, 78, 16, 36, 1,
+ 5, 5, 25, 31, 19, 31, 35, 23, 25, 47,
+ 31, 7, 31, 47, 47, 35, 19, 0, 6, 0,
+ 25, 31, 19, 31, 35, 23, 25, 47, 31, 7,
+ 31, 47, 47, 35, 19, 0, 6, 0, 38, 21,
+ 18, 1, 17, 1, 5, 14, 5, 6, 6, 13,
+ 14, 52, 24, 21, 42, 14, 1, 52, 24, 21,
+ 42, 14, 1, 52, 24, 21, 42, 14, 1, 11,
+ 14, 6, 6, 16, 38, 16, 38, 36, 46, 24,
+ 21, 46, 24, 21, 1, 62, 62, 36, 1, 30,
+ 36, 40, 40, 74, 78, 42, 14, 38, 42, 26,
+ 4, 94, 52, 36, 1, 36, 26, 40, 16, 36,
+ 28, 22, 16, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 19 */
+
+ 14, 7, 18, 5, 41, 1, 44, 7, 19, 1,
+ 1, 78, 1, 5, 1, 1, 1, 30, 29, 22,
+ 33, 21, 9, 14, 14, 14, 14, 21, 28, 20,
+ 21, 9, 10, 17, 14, 45, 78, 18, 36, 1,
+ 5, 5, 25, 29, 17, 29, 33, 21, 25, 45,
+ 29, 5, 29, 45, 45, 33, 17, 2, 6, 2,
+ 25, 29, 17, 29, 33, 21, 25, 45, 29, 5,
+ 29, 45, 45, 33, 17, 2, 6, 2, 38, 21,
+ 22, 1, 17, 1, 5, 14, 5, 6, 6, 9,
+ 14, 52, 24, 21, 42, 14, 1, 52, 24, 21,
+ 42, 14, 1, 52, 24, 21, 42, 14, 1, 11,
+ 14, 6, 6, 18, 38, 18, 38, 36, 46, 24,
+ 21, 46, 24, 21, 1, 60, 60, 36, 1, 30,
+ 36, 40, 40, 74, 78, 42, 14, 38, 42, 26,
+ 4, 92, 52, 36, 1, 36, 26, 40, 18, 36,
+ 30, 22, 18, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 20 */
+
+ 14, 9, 20, 3, 39, 1, 42, 9, 21, 1,
+ 1, 78, 1, 3, 1, 1, 1, 30, 27, 24,
+ 31, 17, 5, 18, 18, 14, 14, 19, 26, 18,
+ 17, 7, 12, 15, 14, 43, 78, 20, 34, 1,
+ 3, 3, 23, 27, 15, 27, 31, 17, 23, 43,
+ 27, 1, 27, 43, 43, 31, 15, 4, 8, 4,
+ 23, 27, 15, 27, 31, 17, 23, 43, 27, 1,
+ 27, 43, 43, 31, 15, 4, 8, 4, 40, 19,
+ 26, 1, 17, 1, 3, 14, 3, 8, 8, 5,
+ 14, 50, 22, 19, 44, 14, 1, 50, 22, 19,
+ 44, 14, 1, 50, 22, 19, 44, 14, 1, 13,
+ 14, 8, 8, 20, 40, 20, 40, 34, 46, 22,
+ 19, 46, 22, 19, 1, 58, 58, 34, 1, 30,
+ 34, 38, 38, 76, 78, 44, 14, 40, 44, 28,
+ 2, 90, 50, 34, 1, 34, 28, 38, 20, 34,
+ 32, 24, 20, 34,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 21 */
+
+ 14, 11, 22, 3, 37, 1, 40, 11, 23, 1,
+ 1, 78, 1, 3, 1, 1, 1, 30, 25, 26,
+ 27, 15, 1, 24, 24, 14, 14, 19, 24, 18,
+ 15, 5, 12, 11, 14, 41, 78, 22, 34, 1,
+ 3, 3, 21, 25, 11, 25, 27, 15, 21, 41,
+ 25, 0, 25, 41, 41, 27, 11, 6, 10, 6,
+ 21, 25, 11, 25, 27, 15, 21, 41, 25, 0,
+ 25, 41, 41, 27, 11, 6, 10, 6, 42, 19,
+ 30, 1, 17, 1, 3, 14, 3, 10, 10, 1,
+ 14, 50, 20, 19, 44, 14, 1, 50, 20, 19,
+ 44, 14, 1, 50, 20, 19, 44, 14, 1, 13,
+ 14, 10, 10, 22, 42, 22, 42, 34, 46, 20,
+ 19, 46, 20, 19, 1, 56, 56, 34, 1, 30,
+ 34, 36, 36, 76, 78, 44, 14, 42, 44, 28,
+ 2, 88, 50, 34, 1, 34, 28, 36, 22, 34,
+ 36, 26, 22, 34,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 22 */
+
+ 14, 11, 24, 3, 37, 1, 38, 11, 25, 1,
+ 1, 78, 1, 3, 1, 1, 1, 30, 23, 26,
+ 25, 11, 2, 28, 28, 14, 14, 19, 22, 18,
+ 11, 5, 12, 9, 14, 39, 78, 24, 34, 1,
+ 3, 3, 21, 23, 9, 23, 25, 11, 21, 39,
+ 23, 4, 23, 39, 39, 25, 9, 8, 10, 8,
+ 21, 23, 9, 23, 25, 11, 21, 39, 23, 4,
+ 23, 39, 39, 25, 9, 8, 10, 8, 42, 19,
+ 34, 1, 17, 1, 3, 14, 3, 10, 10, 2,
+ 14, 50, 20, 19, 44, 14, 1, 50, 20, 19,
+ 44, 14, 1, 50, 20, 19, 44, 14, 1, 13,
+ 14, 10, 10, 24, 42, 24, 42, 34, 46, 20,
+ 19, 46, 20, 19, 1, 54, 54, 34, 1, 30,
+ 34, 36, 36, 76, 78, 44, 14, 42, 44, 28,
+ 2, 86, 50, 34, 1, 34, 28, 36, 24, 34,
+ 38, 26, 24, 34,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 23 */
+
+ 14, 13, 26, 1, 35, 1, 36, 13, 27, 1,
+ 1, 78, 1, 1, 1, 1, 1, 30, 21, 28,
+ 23, 9, 6, 34, 34, 14, 14, 17, 20, 16,
+ 9, 3, 14, 7, 14, 37, 78, 26, 32, 1,
+ 1, 1, 19, 21, 7, 21, 23, 9, 19, 37,
+ 21, 6, 21, 37, 37, 23, 7, 10, 12, 10,
+ 19, 21, 7, 21, 23, 9, 19, 37, 21, 6,
+ 21, 37, 37, 23, 7, 10, 12, 10, 44, 17,
+ 38, 1, 17, 1, 1, 14, 1, 12, 12, 6,
+ 14, 48, 18, 17, 46, 14, 1, 48, 18, 17,
+ 46, 14, 1, 48, 18, 17, 46, 14, 1, 15,
+ 14, 12, 12, 26, 44, 26, 44, 32, 46, 18,
+ 17, 46, 18, 17, 1, 52, 52, 32, 1, 30,
+ 32, 34, 34, 78, 78, 46, 14, 44, 46, 30,
+ 0, 84, 48, 32, 1, 32, 30, 34, 26, 32,
+ 40, 28, 26, 32,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 24 */
+
+ 14, 15, 28, 1, 35, 1, 34, 15, 29, 1,
+ 1, 78, 1, 1, 1, 1, 1, 30, 19, 28,
+ 21, 5, 8, 38, 38, 14, 14, 17, 18, 16,
+ 5, 3, 14, 5, 14, 35, 78, 28, 32, 1,
+ 1, 1, 19, 19, 5, 19, 21, 5, 19, 35,
+ 19, 10, 19, 35, 35, 21, 5, 12, 12, 12,
+ 19, 19, 5, 19, 21, 5, 19, 35, 19, 10,
+ 19, 35, 35, 21, 5, 12, 12, 12, 44, 17,
+ 40, 1, 17, 1, 1, 14, 1, 12, 12, 8,
+ 14, 48, 16, 17, 46, 14, 1, 48, 16, 17,
+ 46, 14, 1, 48, 16, 17, 46, 14, 1, 15,
+ 14, 12, 12, 28, 44, 28, 44, 32, 46, 16,
+ 17, 46, 16, 17, 1, 50, 50, 32, 1, 30,
+ 32, 32, 32, 78, 78, 46, 14, 44, 46, 30,
+ 0, 82, 48, 32, 1, 32, 30, 32, 28, 32,
+ 42, 28, 28, 32,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 25 */
+
+ 14, 15, 30, 1, 33, 1, 32, 15, 31, 1,
+ 1, 78, 1, 1, 1, 1, 1, 30, 17, 30,
+ 17, 1, 12, 44, 44, 14, 14, 17, 16, 16,
+ 1, 1, 14, 1, 14, 33, 78, 30, 32, 1,
+ 1, 1, 17, 17, 1, 17, 17, 1, 17, 33,
+ 17, 14, 17, 33, 33, 17, 1, 14, 14, 14,
+ 17, 17, 1, 17, 17, 1, 17, 33, 17, 14,
+ 17, 33, 33, 17, 1, 14, 14, 14, 46, 17,
+ 44, 1, 17, 1, 1, 14, 1, 14, 14, 12,
+ 14, 48, 16, 17, 46, 14, 1, 48, 16, 17,
+ 46, 14, 1, 48, 16, 17, 46, 14, 1, 15,
+ 14, 14, 14, 30, 46, 30, 46, 32, 46, 16,
+ 17, 46, 16, 17, 1, 48, 48, 32, 1, 30,
+ 32, 32, 32, 78, 78, 46, 14, 46, 46, 30,
+ 0, 80, 48, 32, 1, 32, 30, 32, 30, 32,
+ 46, 30, 30, 32,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 26 */
+
+ 14, 17, 32, 0, 31, 1, 30, 17, 33, 1,
+ 1, 78, 1, 0, 1, 1, 1, 30, 15, 32,
+ 15, 0, 16, 48, 48, 14, 14, 15, 14, 14,
+ 0, 0, 16, 0, 14, 31, 78, 32, 30, 1,
+ 0, 0, 15, 15, 0, 15, 15, 0, 15, 31,
+ 15, 16, 15, 31, 31, 15, 0, 16, 16, 16,
+ 15, 15, 0, 15, 15, 0, 15, 31, 15, 16,
+ 15, 31, 31, 15, 0, 16, 16, 16, 48, 15,
+ 48, 1, 17, 1, 0, 14, 0, 16, 16, 16,
+ 14, 46, 14, 15, 48, 14, 1, 46, 14, 15,
+ 48, 14, 1, 46, 14, 15, 48, 14, 1, 17,
+ 14, 16, 16, 32, 48, 32, 48, 30, 46, 14,
+ 15, 46, 14, 15, 1, 46, 46, 30, 1, 30,
+ 30, 30, 30, 80, 78, 48, 14, 48, 48, 32,
+ 1, 78, 46, 30, 1, 30, 32, 30, 32, 30,
+ 48, 32, 32, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 27 */
+
+ 14, 17, 34, 0, 31, 1, 28, 17, 35, 1,
+ 1, 78, 1, 0, 1, 1, 1, 30, 13, 32,
+ 13, 4, 20, 54, 54, 14, 14, 15, 12, 14,
+ 4, 0, 16, 2, 14, 29, 78, 34, 30, 1,
+ 0, 0, 15, 13, 2, 13, 13, 4, 15, 29,
+ 13, 20, 13, 29, 29, 13, 2, 18, 16, 18,
+ 15, 13, 2, 13, 13, 4, 15, 29, 13, 20,
+ 13, 29, 29, 13, 2, 18, 16, 18, 48, 15,
+ 52, 1, 17, 1, 0, 14, 0, 16, 16, 20,
+ 14, 46, 14, 15, 48, 14, 1, 46, 14, 15,
+ 48, 14, 1, 46, 14, 15, 48, 14, 1, 17,
+ 14, 16, 16, 34, 48, 34, 48, 30, 46, 14,
+ 15, 46, 14, 15, 1, 44, 44, 30, 1, 30,
+ 30, 30, 30, 80, 78, 48, 14, 48, 48, 32,
+ 1, 76, 46, 30, 1, 30, 32, 30, 34, 30,
+ 50, 32, 34, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 28 */
+
+ 14, 19, 36, 0, 29, 1, 26, 19, 37, 1,
+ 1, 78, 1, 0, 1, 1, 1, 30, 11, 34,
+ 11, 6, 24, 58, 58, 14, 14, 15, 10, 14,
+ 6, 2, 16, 4, 14, 27, 78, 36, 30, 1,
+ 0, 0, 13, 11, 4, 11, 11, 6, 13, 27,
+ 11, 22, 11, 27, 27, 11, 4, 20, 18, 20,
+ 13, 11, 4, 11, 11, 6, 13, 27, 11, 22,
+ 11, 27, 27, 11, 4, 20, 18, 20, 50, 15,
+ 56, 1, 17, 1, 0, 14, 0, 18, 18, 24,
+ 14, 46, 12, 15, 48, 14, 1, 46, 12, 15,
+ 48, 14, 1, 46, 12, 15, 48, 14, 1, 17,
+ 14, 18, 18, 36, 50, 36, 50, 30, 46, 12,
+ 15, 46, 12, 15, 1, 42, 42, 30, 1, 30,
+ 30, 28, 28, 80, 78, 48, 14, 50, 48, 32,
+ 1, 74, 46, 30, 1, 30, 32, 28, 36, 30,
+ 52, 34, 36, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 29 */
+
+ 14, 21, 38, 2, 27, 1, 24, 21, 39, 1,
+ 1, 78, 1, 2, 1, 1, 1, 30, 9, 36,
+ 7, 10, 28, 64, 64, 14, 14, 13, 8, 12,
+ 10, 4, 18, 8, 14, 25, 78, 38, 28, 1,
+ 2, 2, 11, 9, 8, 9, 7, 10, 11, 25,
+ 9, 26, 9, 25, 25, 7, 8, 22, 20, 22,
+ 11, 9, 8, 9, 7, 10, 11, 25, 9, 26,
+ 9, 25, 25, 7, 8, 22, 20, 22, 52, 13,
+ 60, 1, 17, 1, 2, 14, 2, 20, 20, 28,
+ 14, 44, 10, 13, 50, 14, 1, 44, 10, 13,
+ 50, 14, 1, 44, 10, 13, 50, 14, 1, 19,
+ 14, 20, 20, 38, 52, 38, 52, 28, 46, 10,
+ 13, 46, 10, 13, 1, 40, 40, 28, 1, 30,
+ 28, 26, 26, 82, 78, 50, 14, 52, 50, 34,
+ 3, 72, 44, 28, 1, 28, 34, 26, 38, 28,
+ 56, 36, 38, 28,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 30 */
+
+ 14, 21, 40, 2, 27, 1, 22, 21, 41, 1,
+ 1, 78, 1, 2, 1, 1, 1, 30, 7, 36,
+ 5, 12, 32, 68, 68, 14, 14, 13, 6, 12,
+ 12, 4, 18, 10, 14, 23, 78, 40, 28, 1,
+ 2, 2, 11, 7, 10, 7, 5, 12, 11, 23,
+ 7, 28, 7, 23, 23, 5, 10, 24, 20, 24,
+ 11, 7, 10, 7, 5, 12, 11, 23, 7, 28,
+ 7, 23, 23, 5, 10, 24, 20, 24, 52, 13,
+ 64, 1, 17, 1, 2, 14, 2, 20, 20, 32,
+ 14, 44, 10, 13, 50, 14, 1, 44, 10, 13,
+ 50, 14, 1, 44, 10, 13, 50, 14, 1, 19,
+ 14, 20, 20, 40, 52, 40, 52, 28, 46, 10,
+ 13, 46, 10, 13, 1, 38, 38, 28, 1, 30,
+ 28, 26, 26, 82, 78, 50, 14, 52, 50, 34,
+ 3, 70, 44, 28, 1, 28, 34, 26, 40, 28,
+ 58, 36, 40, 28,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 31 */
+
+ 14, 23, 42, 2, 25, 1, 20, 23, 43, 1,
+ 1, 78, 1, 2, 1, 1, 1, 30, 5, 38,
+ 3, 16, 36, 74, 74, 14, 14, 13, 4, 12,
+ 16, 6, 18, 12, 14, 21, 78, 42, 28, 1,
+ 2, 2, 9, 5, 12, 5, 3, 16, 9, 21,
+ 5, 32, 5, 21, 21, 3, 12, 26, 22, 26,
+ 9, 5, 12, 5, 3, 16, 9, 21, 5, 32,
+ 5, 21, 21, 3, 12, 26, 22, 26, 54, 13,
+ 68, 1, 17, 1, 2, 14, 2, 22, 22, 36,
+ 14, 44, 8, 13, 50, 14, 1, 44, 8, 13,
+ 50, 14, 1, 44, 8, 13, 50, 14, 1, 19,
+ 14, 22, 22, 42, 54, 42, 54, 28, 46, 8,
+ 13, 46, 8, 13, 1, 36, 36, 28, 1, 30,
+ 28, 24, 24, 82, 78, 50, 14, 54, 50, 34,
+ 3, 68, 44, 28, 1, 28, 34, 24, 42, 28,
+ 60, 38, 42, 28,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 32 */
+
+ 14, 25, 42, 2, 25, 1, 18, 25, 45, 1,
+ 1, 78, 1, 2, 1, 1, 1, 30, 5, 38,
+ 1, 18, 38, 78, 78, 14, 14, 13, 2, 10,
+ 18, 6, 18, 14, 14, 21, 78, 42, 26, 1,
+ 2, 2, 9, 5, 14, 5, 1, 18, 9, 21,
+ 5, 34, 5, 21, 21, 1, 14, 26, 22, 26,
+ 9, 5, 14, 5, 1, 18, 9, 21, 5, 34,
+ 5, 21, 21, 1, 14, 26, 22, 26, 54, 13,
+ 70, 1, 17, 1, 2, 14, 2, 22, 22, 38,
+ 14, 42, 6, 13, 50, 14, 1, 42, 6, 13,
+ 50, 14, 1, 42, 6, 13, 50, 14, 1, 21,
+ 14, 22, 22, 42, 54, 42, 54, 26, 46, 6,
+ 13, 46, 6, 13, 1, 34, 34, 26, 1, 30,
+ 26, 22, 22, 82, 78, 50, 14, 54, 50, 34,
+ 5, 66, 42, 26, 1, 26, 34, 22, 42, 26,
+ 62, 38, 42, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 33 */
+
+ 14, 25, 44, 4, 23, 1, 18, 25, 45, 1,
+ 1, 78, 1, 4, 1, 1, 1, 30, 3, 40,
+ 2, 22, 42, 84, 84, 14, 14, 11, 2, 10,
+ 22, 8, 20, 18, 14, 19, 78, 44, 26, 1,
+ 4, 4, 7, 3, 18, 3, 2, 22, 7, 19,
+ 3, 38, 3, 19, 19, 2, 18, 28, 24, 28,
+ 7, 3, 18, 3, 2, 22, 7, 19, 3, 38,
+ 3, 19, 19, 2, 18, 28, 24, 28, 56, 11,
+ 74, 1, 17, 1, 4, 14, 4, 24, 24, 42,
+ 14, 42, 6, 11, 52, 14, 1, 42, 6, 11,
+ 52, 14, 1, 42, 6, 11, 52, 14, 1, 21,
+ 14, 24, 24, 44, 56, 44, 56, 26, 46, 6,
+ 11, 46, 6, 11, 1, 34, 34, 26, 1, 30,
+ 26, 22, 22, 84, 78, 52, 14, 56, 52, 36,
+ 5, 66, 42, 26, 1, 26, 36, 22, 44, 26,
+ 66, 40, 44, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 34 */
+
+ 14, 27, 46, 4, 21, 1, 16, 27, 47, 1,
+ 1, 78, 1, 4, 1, 1, 1, 30, 1, 42,
+ 4, 26, 46, 88, 88, 14, 14, 11, 0, 10,
+ 26, 10, 20, 20, 14, 17, 78, 46, 26, 1,
+ 4, 4, 5, 1, 20, 1, 4, 26, 5, 17,
+ 1, 42, 1, 17, 17, 4, 20, 30, 26, 30,
+ 5, 1, 20, 1, 4, 26, 5, 17, 1, 42,
+ 1, 17, 17, 4, 20, 30, 26, 30, 58, 11,
+ 78, 1, 17, 1, 4, 14, 4, 26, 26, 46,
+ 14, 42, 4, 11, 52, 14, 1, 42, 4, 11,
+ 52, 14, 1, 42, 4, 11, 52, 14, 1, 21,
+ 14, 26, 26, 46, 58, 46, 58, 26, 46, 4,
+ 11, 46, 4, 11, 1, 32, 32, 26, 1, 30,
+ 26, 20, 20, 84, 78, 52, 14, 58, 52, 36,
+ 5, 64, 42, 26, 1, 26, 36, 20, 46, 26,
+ 68, 42, 46, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 35 */
+
+ 14, 27, 48, 4, 21, 1, 14, 27, 49, 1,
+ 1, 78, 1, 4, 1, 1, 1, 30, 0, 42,
+ 6, 28, 50, 94, 94, 14, 14, 11, 1, 10,
+ 28, 10, 20, 22, 14, 15, 78, 48, 26, 1,
+ 4, 4, 5, 0, 22, 0, 6, 28, 5, 15,
+ 0, 44, 0, 15, 15, 6, 22, 32, 26, 32,
+ 5, 0, 22, 0, 6, 28, 5, 15, 0, 44,
+ 0, 15, 15, 6, 22, 32, 26, 32, 58, 11,
+ 82, 1, 17, 1, 4, 14, 4, 26, 26, 50,
+ 14, 42, 4, 11, 52, 14, 1, 42, 4, 11,
+ 52, 14, 1, 42, 4, 11, 52, 14, 1, 21,
+ 14, 26, 26, 48, 58, 48, 58, 26, 46, 4,
+ 11, 46, 4, 11, 1, 30, 30, 26, 1, 30,
+ 26, 20, 20, 84, 78, 52, 14, 58, 52, 36,
+ 5, 62, 42, 26, 1, 26, 36, 20, 48, 26,
+ 70, 42, 48, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 36 */
+
+ 14, 29, 50, 6, 19, 1, 12, 29, 51, 1,
+ 1, 78, 1, 6, 1, 1, 1, 30, 2, 44,
+ 8, 32, 54, 98, 98, 14, 14, 9, 3, 8,
+ 32, 12, 22, 24, 14, 13, 78, 50, 24, 1,
+ 6, 6, 3, 2, 24, 2, 8, 32, 3, 13,
+ 2, 48, 2, 13, 13, 8, 24, 34, 28, 34,
+ 3, 2, 24, 2, 8, 32, 3, 13, 2, 48,
+ 2, 13, 13, 8, 24, 34, 28, 34, 60, 9,
+ 86, 1, 17, 1, 6, 14, 6, 28, 28, 54,
+ 14, 40, 2, 9, 54, 14, 1, 40, 2, 9,
+ 54, 14, 1, 40, 2, 9, 54, 14, 1, 23,
+ 14, 28, 28, 50, 60, 50, 60, 24, 46, 2,
+ 9, 46, 2, 9, 1, 28, 28, 24, 1, 30,
+ 24, 18, 18, 86, 78, 54, 14, 60, 54, 38,
+ 7, 60, 40, 24, 1, 24, 38, 18, 50, 24,
+ 72, 44, 50, 24,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 37 */
+
+ 14, 31, 52, 6, 17, 1, 10, 31, 53, 1,
+ 1, 78, 1, 6, 1, 1, 1, 30, 4, 46,
+ 12, 34, 58, 104, 104, 14, 14, 9, 5, 8,
+ 34, 14, 22, 28, 14, 11, 78, 52, 24, 1,
+ 6, 6, 1, 4, 28, 4, 12, 34, 1, 11,
+ 4, 50, 4, 11, 11, 12, 28, 36, 30, 36,
+ 1, 4, 28, 4, 12, 34, 1, 11, 4, 50,
+ 4, 11, 11, 12, 28, 36, 30, 36, 62, 9,
+ 90, 1, 17, 1, 6, 14, 6, 30, 30, 58,
+ 14, 40, 0, 9, 54, 14, 1, 40, 0, 9,
+ 54, 14, 1, 40, 0, 9, 54, 14, 1, 23,
+ 14, 30, 30, 52, 62, 52, 62, 24, 46, 0,
+ 9, 46, 0, 9, 1, 26, 26, 24, 1, 30,
+ 24, 16, 16, 86, 78, 54, 14, 62, 54, 38,
+ 7, 58, 40, 24, 1, 24, 38, 16, 52, 24,
+ 76, 46, 52, 24,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 38 */
+
+ 14, 31, 54, 6, 17, 1, 8, 31, 55, 1,
+ 1, 78, 1, 6, 1, 1, 1, 30, 6, 46,
+ 14, 38, 62, 108, 108, 14, 14, 9, 7, 8,
+ 38, 14, 22, 30, 14, 9, 78, 54, 24, 1,
+ 6, 6, 1, 6, 30, 6, 14, 38, 1, 9,
+ 6, 54, 6, 9, 9, 14, 30, 38, 30, 38,
+ 1, 6, 30, 6, 14, 38, 1, 9, 6, 54,
+ 6, 9, 9, 14, 30, 38, 30, 38, 62, 9,
+ 94, 1, 17, 1, 6, 14, 6, 30, 30, 62,
+ 14, 40, 0, 9, 54, 14, 1, 40, 0, 9,
+ 54, 14, 1, 40, 0, 9, 54, 14, 1, 23,
+ 14, 30, 30, 54, 62, 54, 62, 24, 46, 0,
+ 9, 46, 0, 9, 1, 24, 24, 24, 1, 30,
+ 24, 16, 16, 86, 78, 54, 14, 62, 54, 38,
+ 7, 56, 40, 24, 1, 24, 38, 16, 54, 24,
+ 78, 46, 54, 24,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 39 */
+
+ 14, 33, 56, 8, 15, 1, 6, 33, 57, 1,
+ 1, 78, 1, 8, 1, 1, 1, 30, 8, 48,
+ 16, 40, 66, 114, 114, 14, 14, 7, 9, 6,
+ 40, 16, 24, 32, 14, 7, 78, 56, 22, 1,
+ 8, 8, 0, 8, 32, 8, 16, 40, 0, 7,
+ 8, 56, 8, 7, 7, 16, 32, 40, 32, 40,
+ 0, 8, 32, 8, 16, 40, 0, 7, 8, 56,
+ 8, 7, 7, 16, 32, 40, 32, 40, 64, 7,
+ 98, 1, 17, 1, 8, 14, 8, 32, 32, 66,
+ 14, 38, 1, 7, 56, 14, 1, 38, 1, 7,
+ 56, 14, 1, 38, 1, 7, 56, 14, 1, 25,
+ 14, 32, 32, 56, 64, 56, 64, 22, 46, 1,
+ 7, 46, 1, 7, 1, 22, 22, 22, 1, 30,
+ 22, 14, 14, 88, 78, 56, 14, 64, 56, 40,
+ 9, 54, 38, 22, 1, 22, 40, 14, 56, 22,
+ 80, 48, 56, 22,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 40 */
+
+ 14, 35, 58, 8, 15, 1, 4, 35, 59, 1,
+ 1, 78, 1, 8, 1, 1, 1, 30, 10, 48,
+ 18, 44, 68, 118, 118, 14, 14, 7, 11, 6,
+ 44, 16, 24, 34, 14, 5, 78, 58, 22, 1,
+ 8, 8, 0, 10, 34, 10, 18, 44, 0, 5,
+ 10, 60, 10, 5, 5, 18, 34, 42, 32, 42,
+ 0, 10, 34, 10, 18, 44, 0, 5, 10, 60,
+ 10, 5, 5, 18, 34, 42, 32, 42, 64, 7,
+ 100, 1, 17, 1, 8, 14, 8, 32, 32, 68,
+ 14, 38, 3, 7, 56, 14, 1, 38, 3, 7,
+ 56, 14, 1, 38, 3, 7, 56, 14, 1, 25,
+ 14, 32, 32, 58, 64, 58, 64, 22, 46, 3,
+ 7, 46, 3, 7, 1, 20, 20, 22, 1, 30,
+ 22, 12, 12, 88, 78, 56, 14, 64, 56, 40,
+ 9, 52, 38, 22, 1, 22, 40, 12, 58, 22,
+ 82, 48, 58, 22,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 41 */
+
+ 14, 35, 60, 8, 13, 1, 2, 35, 61, 1,
+ 1, 78, 1, 8, 1, 1, 1, 30, 12, 50,
+ 22, 48, 72, 124, 124, 14, 14, 7, 13, 6,
+ 48, 18, 24, 38, 14, 3, 78, 60, 22, 1,
+ 8, 8, 2, 12, 38, 12, 22, 48, 2, 3,
+ 12, 64, 12, 3, 3, 22, 38, 44, 34, 44,
+ 2, 12, 38, 12, 22, 48, 2, 3, 12, 64,
+ 12, 3, 3, 22, 38, 44, 34, 44, 66, 7,
+ 104, 1, 17, 1, 8, 14, 8, 34, 34, 72,
+ 14, 38, 3, 7, 56, 14, 1, 38, 3, 7,
+ 56, 14, 1, 38, 3, 7, 56, 14, 1, 25,
+ 14, 34, 34, 60, 66, 60, 66, 22, 46, 3,
+ 7, 46, 3, 7, 1, 18, 18, 22, 1, 30,
+ 22, 12, 12, 88, 78, 56, 14, 66, 56, 40,
+ 9, 50, 38, 22, 1, 22, 40, 12, 60, 22,
+ 86, 50, 60, 22,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 42 */
+
+ 14, 37, 62, 10, 11, 1, 0, 37, 63, 1,
+ 1, 78, 1, 10, 1, 1, 1, 30, 14, 52,
+ 24, 50, 76, 124, 124, 14, 14, 5, 15, 4,
+ 50, 20, 26, 40, 14, 1, 78, 62, 20, 1,
+ 10, 10, 4, 14, 40, 14, 24, 50, 4, 1,
+ 14, 66, 14, 1, 1, 24, 40, 46, 36, 46,
+ 4, 14, 40, 14, 24, 50, 4, 1, 14, 66,
+ 14, 1, 1, 24, 40, 46, 36, 46, 68, 5,
+ 108, 1, 17, 1, 10, 14, 10, 36, 36, 76,
+ 14, 36, 5, 5, 58, 14, 1, 36, 5, 5,
+ 58, 14, 1, 36, 5, 5, 58, 14, 1, 27,
+ 14, 36, 36, 62, 68, 62, 68, 20, 46, 5,
+ 5, 46, 5, 5, 1, 16, 16, 20, 1, 30,
+ 20, 10, 10, 90, 78, 58, 14, 68, 58, 42,
+ 11, 48, 36, 20, 1, 20, 42, 10, 62, 20,
+ 88, 52, 62, 20,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 43 */
+
+ 14, 37, 64, 10, 11, 1, 1, 37, 65, 1,
+ 1, 78, 1, 10, 1, 1, 1, 30, 16, 52,
+ 26, 54, 80, 124, 124, 14, 14, 5, 17, 4,
+ 54, 20, 26, 42, 14, 0, 78, 64, 20, 1,
+ 10, 10, 4, 16, 42, 16, 26, 54, 4, 0,
+ 16, 70, 16, 0, 0, 26, 42, 48, 36, 48,
+ 4, 16, 42, 16, 26, 54, 4, 0, 16, 70,
+ 16, 0, 0, 26, 42, 48, 36, 48, 68, 5,
+ 112, 1, 17, 1, 10, 14, 10, 36, 36, 80,
+ 14, 36, 5, 5, 58, 14, 1, 36, 5, 5,
+ 58, 14, 1, 36, 5, 5, 58, 14, 1, 27,
+ 14, 36, 36, 64, 68, 64, 68, 20, 46, 5,
+ 5, 46, 5, 5, 1, 14, 14, 20, 1, 30,
+ 20, 10, 10, 90, 78, 58, 14, 68, 58, 42,
+ 11, 46, 36, 20, 1, 20, 42, 10, 64, 20,
+ 90, 52, 64, 20,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 44 */
+
+ 14, 39, 66, 10, 9, 1, 3, 39, 67, 1,
+ 1, 78, 1, 10, 1, 1, 1, 30, 18, 54,
+ 28, 56, 84, 124, 124, 14, 14, 5, 19, 4,
+ 56, 22, 26, 44, 14, 2, 78, 66, 20, 1,
+ 10, 10, 6, 18, 44, 18, 28, 56, 6, 2,
+ 18, 72, 18, 2, 2, 28, 44, 50, 38, 50,
+ 6, 18, 44, 18, 28, 56, 6, 2, 18, 72,
+ 18, 2, 2, 28, 44, 50, 38, 50, 70, 5,
+ 116, 1, 17, 1, 10, 14, 10, 38, 38, 84,
+ 14, 36, 7, 5, 58, 14, 1, 36, 7, 5,
+ 58, 14, 1, 36, 7, 5, 58, 14, 1, 27,
+ 14, 38, 38, 66, 70, 66, 70, 20, 46, 7,
+ 5, 46, 7, 5, 1, 12, 12, 20, 1, 30,
+ 20, 8, 8, 90, 78, 58, 14, 70, 58, 42,
+ 11, 44, 36, 20, 1, 20, 42, 8, 66, 20,
+ 92, 54, 66, 20,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 45 */
+
+ 14, 41, 68, 12, 7, 1, 5, 41, 69, 1,
+ 1, 78, 1, 12, 1, 1, 1, 30, 20, 56,
+ 32, 60, 88, 124, 124, 14, 14, 3, 21, 2,
+ 60, 24, 28, 48, 14, 4, 78, 68, 18, 1,
+ 12, 12, 8, 20, 48, 20, 32, 60, 8, 4,
+ 20, 76, 20, 4, 4, 32, 48, 52, 40, 52,
+ 8, 20, 48, 20, 32, 60, 8, 4, 20, 76,
+ 20, 4, 4, 32, 48, 52, 40, 52, 72, 3,
+ 120, 1, 17, 1, 12, 14, 12, 40, 40, 88,
+ 14, 34, 9, 3, 60, 14, 1, 34, 9, 3,
+ 60, 14, 1, 34, 9, 3, 60, 14, 1, 29,
+ 14, 40, 40, 68, 72, 68, 72, 18, 46, 9,
+ 3, 46, 9, 3, 1, 10, 10, 18, 1, 30,
+ 18, 6, 6, 92, 78, 60, 14, 72, 60, 44,
+ 13, 42, 34, 18, 1, 18, 44, 6, 68, 18,
+ 96, 56, 68, 18,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 46 */
+
+ 14, 41, 70, 12, 7, 1, 7, 41, 71, 1,
+ 1, 78, 1, 12, 1, 1, 1, 30, 22, 56,
+ 34, 62, 92, 124, 124, 14, 14, 3, 23, 2,
+ 62, 24, 28, 50, 14, 6, 78, 70, 18, 1,
+ 12, 12, 8, 22, 50, 22, 34, 62, 8, 6,
+ 22, 78, 22, 6, 6, 34, 50, 54, 40, 54,
+ 8, 22, 50, 22, 34, 62, 8, 6, 22, 78,
+ 22, 6, 6, 34, 50, 54, 40, 54, 72, 3,
+ 124, 1, 17, 1, 12, 14, 12, 40, 40, 92,
+ 14, 34, 9, 3, 60, 14, 1, 34, 9, 3,
+ 60, 14, 1, 34, 9, 3, 60, 14, 1, 29,
+ 14, 40, 40, 70, 72, 70, 72, 18, 46, 9,
+ 3, 46, 9, 3, 1, 8, 8, 18, 1, 30,
+ 18, 6, 6, 92, 78, 60, 14, 72, 60, 44,
+ 13, 40, 34, 18, 1, 18, 44, 6, 70, 18,
+ 98, 56, 70, 18,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 47 */
+
+ 14, 43, 72, 12, 5, 1, 9, 43, 73, 1,
+ 1, 78, 1, 12, 1, 1, 1, 30, 24, 58,
+ 36, 66, 96, 124, 124, 14, 14, 3, 25, 2,
+ 66, 26, 28, 52, 14, 8, 78, 72, 18, 1,
+ 12, 12, 10, 24, 52, 24, 36, 66, 10, 8,
+ 24, 82, 24, 8, 8, 36, 52, 56, 42, 56,
+ 10, 24, 52, 24, 36, 66, 10, 8, 24, 82,
+ 24, 8, 8, 36, 52, 56, 42, 56, 74, 3,
+ 124, 1, 17, 1, 12, 14, 12, 42, 42, 96,
+ 14, 34, 11, 3, 60, 14, 1, 34, 11, 3,
+ 60, 14, 1, 34, 11, 3, 60, 14, 1, 29,
+ 14, 42, 42, 72, 74, 72, 74, 18, 46, 11,
+ 3, 46, 11, 3, 1, 6, 6, 18, 1, 30,
+ 18, 4, 4, 92, 78, 60, 14, 74, 60, 44,
+ 13, 38, 34, 18, 1, 18, 44, 4, 72, 18,
+ 100, 58, 72, 18,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 48 */
+
+ 14, 45, 72, 12, 5, 1, 11, 45, 75, 1,
+ 1, 78, 1, 12, 1, 1, 1, 30, 24, 58,
+ 38, 68, 98, 124, 124, 14, 14, 3, 27, 0,
+ 68, 26, 28, 54, 14, 8, 78, 72, 16, 1,
+ 12, 12, 10, 24, 54, 24, 38, 68, 10, 8,
+ 24, 84, 24, 8, 8, 38, 54, 56, 42, 56,
+ 10, 24, 54, 24, 38, 68, 10, 8, 24, 84,
+ 24, 8, 8, 38, 54, 56, 42, 56, 74, 3,
+ 124, 1, 17, 1, 12, 14, 12, 42, 42, 98,
+ 14, 32, 13, 3, 60, 14, 1, 32, 13, 3,
+ 60, 14, 1, 32, 13, 3, 60, 14, 1, 31,
+ 14, 42, 42, 72, 74, 72, 74, 16, 46, 13,
+ 3, 46, 13, 3, 1, 4, 4, 16, 1, 30,
+ 16, 2, 2, 92, 78, 60, 14, 74, 60, 44,
+ 15, 36, 32, 16, 1, 16, 44, 2, 72, 16,
+ 102, 58, 72, 16,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 49 */
+
+ 14, 45, 74, 14, 3, 1, 11, 45, 75, 1,
+ 1, 78, 1, 14, 1, 1, 1, 30, 26, 60,
+ 42, 72, 102, 124, 124, 14, 14, 1, 27, 0,
+ 72, 28, 30, 58, 14, 10, 78, 74, 16, 1,
+ 14, 14, 12, 26, 58, 26, 42, 72, 12, 10,
+ 26, 88, 26, 10, 10, 42, 58, 58, 44, 58,
+ 12, 26, 58, 26, 42, 72, 12, 10, 26, 88,
+ 26, 10, 10, 42, 58, 58, 44, 58, 76, 1,
+ 124, 1, 17, 1, 14, 14, 14, 44, 44, 102,
+ 14, 32, 13, 1, 62, 14, 1, 32, 13, 1,
+ 62, 14, 1, 32, 13, 1, 62, 14, 1, 31,
+ 14, 44, 44, 74, 76, 74, 76, 16, 46, 13,
+ 1, 46, 13, 1, 1, 4, 4, 16, 1, 30,
+ 16, 2, 2, 94, 78, 62, 14, 76, 62, 46,
+ 15, 36, 32, 16, 1, 16, 46, 2, 74, 16,
+ 106, 60, 74, 16,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 50 */
+
+ 14, 47, 76, 14, 1, 1, 13, 47, 77, 1,
+ 1, 78, 1, 14, 1, 1, 1, 30, 28, 62,
+ 44, 76, 106, 124, 124, 14, 14, 1, 29, 0,
+ 76, 30, 30, 60, 14, 12, 78, 76, 16, 1,
+ 14, 14, 14, 28, 60, 28, 44, 76, 14, 12,
+ 28, 92, 28, 12, 12, 44, 60, 60, 46, 60,
+ 14, 28, 60, 28, 44, 76, 14, 12, 28, 92,
+ 28, 12, 12, 44, 60, 60, 46, 60, 78, 1,
+ 124, 1, 17, 1, 14, 14, 14, 46, 46, 106,
+ 14, 32, 15, 1, 62, 14, 1, 32, 15, 1,
+ 62, 14, 1, 32, 15, 1, 62, 14, 1, 31,
+ 14, 46, 46, 76, 78, 76, 78, 16, 46, 15,
+ 1, 46, 15, 1, 1, 2, 2, 16, 1, 30,
+ 16, 0, 0, 94, 78, 62, 14, 78, 62, 46,
+ 15, 34, 32, 16, 1, 16, 46, 0, 76, 16,
+ 108, 62, 76, 16,
+ },
+
+ {
+ /* Context Tables for init_idc = 1, qp = 51 */
+
+ 14, 47, 78, 14, 1, 1, 15, 47, 79, 1,
+ 1, 78, 1, 14, 1, 1, 1, 30, 30, 62,
+ 46, 78, 110, 124, 124, 14, 14, 1, 31, 0,
+ 78, 30, 30, 62, 14, 14, 78, 78, 16, 1,
+ 14, 14, 14, 30, 62, 30, 46, 78, 14, 14,
+ 30, 94, 30, 14, 14, 46, 62, 62, 46, 62,
+ 14, 30, 62, 30, 46, 78, 14, 14, 30, 94,
+ 30, 14, 14, 46, 62, 62, 46, 62, 78, 1,
+ 124, 1, 17, 1, 14, 14, 14, 46, 46, 110,
+ 14, 32, 15, 1, 62, 14, 1, 32, 15, 1,
+ 62, 14, 1, 32, 15, 1, 62, 14, 1, 31,
+ 14, 46, 46, 78, 78, 78, 78, 16, 46, 15,
+ 1, 46, 15, 1, 1, 0, 0, 16, 1, 30,
+ 16, 0, 0, 94, 78, 62, 14, 78, 62, 46,
+ 15, 32, 32, 16, 1, 16, 46, 0, 78, 16,
+ 110, 62, 78, 16,
+ },
+
+ },
+
+ {
+ {
+ /* Context Tables for init_idc = 2, qp = 0 */
+
+ 14, 124, 17, 17, 65, 1, 78, 14, 14, 1,
+ 1, 62, 1, 17, 1, 1, 46, 30, 1, 14,
+ 81, 81, 81, 81, 81, 14, 14, 14, 62, 30,
+ 81, 124, 46, 1, 14, 81, 78, 33, 46, 1,
+ 17, 17, 49, 65, 33, 65, 81, 65, 49, 81,
+ 81, 81, 49, 65, 81, 81, 81, 33, 17, 49,
+ 49, 65, 33, 65, 81, 65, 49, 81, 81, 81,
+ 49, 65, 81, 81, 81, 33, 17, 49, 14, 33,
+ 49, 1, 1, 1, 17, 14, 17, 17, 17, 81,
+ 33, 62, 46, 33, 30, 14, 1, 62, 46, 33,
+ 30, 14, 1, 62, 46, 33, 30, 14, 1, 1,
+ 14, 1, 1, 1, 14, 1, 14, 46, 46, 46,
+ 33, 46, 46, 33, 1, 94, 46, 46, 1, 30,
+ 46, 62, 62, 62, 78, 30, 14, 14, 30, 1,
+ 14, 124, 62, 46, 1, 30, 46, 62, 17, 46,
+ 17, 17, 17, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 1 */
+
+ 14, 124, 15, 15, 63, 1, 78, 14, 14, 1,
+ 1, 64, 1, 15, 1, 1, 46, 30, 1, 16,
+ 77, 77, 77, 75, 75, 14, 14, 14, 62, 30,
+ 77, 124, 46, 0, 14, 79, 78, 29, 46, 1,
+ 15, 15, 47, 63, 31, 63, 77, 61, 47, 79,
+ 79, 77, 47, 63, 79, 79, 77, 31, 15, 45,
+ 47, 63, 31, 63, 77, 61, 47, 79, 79, 77,
+ 47, 63, 79, 79, 77, 31, 15, 45, 16, 31,
+ 45, 1, 1, 1, 15, 14, 15, 15, 15, 77,
+ 31, 62, 46, 31, 32, 14, 1, 62, 46, 31,
+ 32, 14, 1, 62, 46, 31, 32, 14, 1, 1,
+ 14, 0, 0, 0, 16, 0, 16, 46, 46, 46,
+ 31, 46, 46, 31, 1, 94, 46, 46, 1, 30,
+ 46, 62, 62, 64, 78, 32, 14, 16, 32, 0,
+ 14, 124, 62, 46, 1, 30, 46, 62, 15, 46,
+ 13, 15, 15, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 2 */
+
+ 14, 124, 13, 15, 61, 1, 76, 12, 12, 1,
+ 1, 64, 1, 15, 1, 1, 44, 30, 1, 16,
+ 75, 73, 73, 71, 71, 14, 14, 14, 60, 30,
+ 73, 124, 46, 2, 14, 77, 78, 27, 46, 1,
+ 15, 15, 45, 61, 29, 61, 75, 59, 45, 77,
+ 77, 73, 45, 61, 77, 77, 73, 29, 13, 43,
+ 45, 61, 29, 61, 75, 59, 45, 77, 77, 73,
+ 45, 61, 77, 77, 73, 29, 13, 43, 18, 31,
+ 41, 1, 1, 1, 15, 14, 15, 13, 13, 73,
+ 29, 62, 44, 31, 32, 14, 1, 62, 44, 31,
+ 32, 14, 1, 62, 44, 31, 32, 14, 1, 1,
+ 14, 0, 0, 2, 18, 2, 18, 46, 46, 44,
+ 31, 46, 44, 31, 1, 92, 46, 46, 1, 30,
+ 46, 60, 60, 64, 78, 32, 14, 18, 32, 2,
+ 14, 124, 62, 46, 1, 30, 46, 60, 13, 46,
+ 11, 13, 13, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 3 */
+
+ 14, 124, 11, 15, 61, 1, 74, 12, 10, 1,
+ 1, 64, 1, 15, 1, 1, 44, 30, 1, 16,
+ 73, 71, 69, 65, 65, 14, 14, 14, 58, 30,
+ 71, 124, 46, 2, 14, 75, 78, 25, 46, 1,
+ 15, 15, 45, 59, 29, 59, 73, 57, 45, 75,
+ 75, 71, 45, 61, 75, 75, 71, 27, 13, 41,
+ 45, 59, 29, 59, 73, 57, 45, 75, 75, 71,
+ 45, 61, 75, 75, 71, 27, 13, 41, 18, 31,
+ 37, 1, 1, 1, 15, 14, 15, 13, 13, 69,
+ 29, 62, 44, 31, 32, 14, 1, 62, 44, 31,
+ 32, 14, 1, 62, 44, 31, 32, 14, 1, 1,
+ 14, 0, 0, 2, 18, 2, 18, 46, 46, 44,
+ 31, 46, 44, 31, 1, 90, 46, 46, 1, 30,
+ 46, 60, 60, 64, 78, 32, 14, 18, 32, 2,
+ 14, 124, 62, 46, 1, 30, 46, 60, 11, 46,
+ 9, 11, 11, 46,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 4 */
+
+ 14, 124, 9, 13, 59, 1, 72, 10, 8, 1,
+ 1, 66, 1, 13, 1, 1, 42, 30, 1, 18,
+ 71, 67, 65, 61, 61, 14, 14, 12, 56, 28,
+ 67, 124, 44, 4, 14, 73, 78, 23, 44, 1,
+ 13, 13, 43, 57, 27, 57, 71, 55, 43, 73,
+ 73, 67, 43, 59, 73, 73, 67, 25, 11, 39,
+ 43, 57, 27, 57, 71, 55, 43, 73, 73, 67,
+ 43, 59, 73, 73, 67, 25, 11, 39, 20, 29,
+ 33, 1, 3, 1, 13, 14, 13, 11, 11, 65,
+ 27, 60, 42, 29, 34, 14, 1, 60, 42, 29,
+ 34, 14, 1, 60, 42, 29, 34, 14, 1, 3,
+ 14, 2, 2, 4, 20, 4, 20, 44, 46, 42,
+ 29, 46, 42, 29, 1, 88, 44, 44, 1, 30,
+ 44, 58, 58, 66, 78, 34, 14, 20, 34, 4,
+ 12, 124, 60, 44, 1, 30, 44, 58, 9, 44,
+ 7, 9, 9, 44,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 5 */
+
+ 14, 124, 7, 13, 57, 1, 70, 8, 6, 1,
+ 1, 66, 1, 13, 1, 1, 40, 30, 1, 18,
+ 67, 65, 61, 55, 55, 14, 14, 12, 54, 28,
+ 65, 124, 44, 6, 14, 71, 78, 19, 44, 1,
+ 13, 13, 41, 55, 25, 55, 67, 51, 41, 71,
+ 71, 65, 41, 57, 71, 71, 65, 23, 9, 35,
+ 41, 55, 25, 55, 67, 51, 41, 71, 71, 65,
+ 41, 57, 71, 71, 65, 23, 9, 35, 22, 29,
+ 29, 1, 3, 1, 13, 14, 13, 9, 9, 61,
+ 25, 60, 40, 29, 34, 14, 1, 60, 40, 29,
+ 34, 14, 1, 60, 40, 29, 34, 14, 1, 3,
+ 14, 2, 2, 6, 22, 6, 22, 44, 46, 40,
+ 29, 46, 40, 29, 1, 86, 44, 44, 1, 30,
+ 44, 56, 56, 66, 78, 34, 14, 22, 34, 6,
+ 12, 124, 60, 44, 1, 30, 44, 56, 7, 44,
+ 3, 7, 7, 44,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 6 */
+
+ 14, 124, 5, 13, 57, 1, 68, 8, 4, 1,
+ 1, 66, 1, 13, 1, 1, 40, 30, 1, 18,
+ 65, 61, 57, 51, 51, 14, 14, 12, 52, 28,
+ 61, 124, 44, 6, 14, 69, 78, 17, 44, 1,
+ 13, 13, 41, 53, 25, 53, 65, 49, 41, 69,
+ 69, 61, 41, 57, 69, 69, 61, 21, 9, 33,
+ 41, 53, 25, 53, 65, 49, 41, 69, 69, 61,
+ 41, 57, 69, 69, 61, 21, 9, 33, 22, 29,
+ 25, 1, 3, 1, 13, 14, 13, 9, 9, 57,
+ 25, 60, 40, 29, 34, 14, 1, 60, 40, 29,
+ 34, 14, 1, 60, 40, 29, 34, 14, 1, 3,
+ 14, 2, 2, 6, 22, 6, 22, 44, 46, 40,
+ 29, 46, 40, 29, 1, 84, 44, 44, 1, 30,
+ 44, 56, 56, 66, 78, 34, 14, 22, 34, 6,
+ 12, 124, 60, 44, 1, 30, 44, 56, 5, 44,
+ 1, 5, 5, 44,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 7 */
+
+ 14, 124, 3, 11, 55, 1, 66, 6, 2, 1,
+ 1, 68, 1, 11, 1, 1, 38, 30, 1, 20,
+ 63, 59, 53, 45, 45, 14, 14, 10, 50, 26,
+ 59, 124, 42, 8, 14, 67, 78, 15, 42, 1,
+ 11, 11, 39, 51, 23, 51, 63, 47, 39, 67,
+ 67, 59, 39, 55, 67, 67, 59, 19, 7, 31,
+ 39, 51, 23, 51, 63, 47, 39, 67, 67, 59,
+ 39, 55, 67, 67, 59, 19, 7, 31, 24, 27,
+ 21, 1, 5, 1, 11, 14, 11, 7, 7, 53,
+ 23, 58, 38, 27, 36, 14, 1, 58, 38, 27,
+ 36, 14, 1, 58, 38, 27, 36, 14, 1, 5,
+ 14, 4, 4, 8, 24, 8, 24, 42, 46, 38,
+ 27, 46, 38, 27, 1, 82, 42, 42, 1, 30,
+ 42, 54, 54, 68, 78, 36, 14, 24, 36, 8,
+ 10, 124, 58, 42, 1, 30, 42, 54, 3, 42,
+ 0, 3, 3, 42,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 8 */
+
+ 14, 124, 1, 11, 55, 1, 64, 4, 0, 1,
+ 1, 68, 1, 11, 1, 1, 36, 30, 1, 20,
+ 61, 55, 51, 41, 41, 14, 14, 10, 48, 26,
+ 55, 124, 42, 8, 14, 65, 78, 13, 42, 1,
+ 11, 11, 39, 49, 23, 49, 61, 45, 39, 65,
+ 65, 55, 39, 55, 65, 65, 55, 17, 7, 29,
+ 39, 49, 23, 49, 61, 45, 39, 65, 65, 55,
+ 39, 55, 65, 65, 55, 17, 7, 29, 24, 27,
+ 19, 1, 5, 1, 11, 14, 11, 7, 7, 51,
+ 23, 58, 36, 27, 36, 14, 1, 58, 36, 27,
+ 36, 14, 1, 58, 36, 27, 36, 14, 1, 5,
+ 14, 4, 4, 8, 24, 8, 24, 42, 46, 36,
+ 27, 46, 36, 27, 1, 80, 42, 42, 1, 30,
+ 42, 52, 52, 68, 78, 36, 14, 24, 36, 8,
+ 10, 124, 58, 42, 1, 30, 42, 52, 1, 42,
+ 2, 1, 1, 42,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 9 */
+
+ 14, 124, 0, 11, 53, 1, 62, 4, 1, 1,
+ 1, 68, 1, 11, 1, 1, 36, 30, 1, 20,
+ 57, 51, 47, 35, 35, 14, 14, 10, 46, 26,
+ 51, 124, 42, 10, 14, 63, 78, 9, 42, 1,
+ 11, 11, 37, 47, 21, 47, 57, 41, 37, 63,
+ 63, 51, 37, 53, 63, 63, 51, 15, 5, 25,
+ 37, 47, 21, 47, 57, 41, 37, 63, 63, 51,
+ 37, 53, 63, 63, 51, 15, 5, 25, 26, 27,
+ 15, 1, 5, 1, 11, 14, 11, 5, 5, 47,
+ 21, 58, 36, 27, 36, 14, 1, 58, 36, 27,
+ 36, 14, 1, 58, 36, 27, 36, 14, 1, 5,
+ 14, 4, 4, 10, 26, 10, 26, 42, 46, 36,
+ 27, 46, 36, 27, 1, 78, 42, 42, 1, 30,
+ 42, 52, 52, 68, 78, 36, 14, 26, 36, 10,
+ 10, 124, 58, 42, 1, 30, 42, 52, 0, 42,
+ 6, 0, 0, 42,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 10 */
+
+ 14, 124, 2, 9, 51, 1, 60, 2, 3, 1,
+ 1, 70, 1, 9, 1, 1, 34, 30, 1, 22,
+ 55, 49, 43, 31, 31, 14, 14, 8, 44, 24,
+ 49, 124, 40, 12, 14, 61, 78, 7, 40, 1,
+ 9, 9, 35, 45, 19, 45, 55, 39, 35, 61,
+ 61, 49, 35, 51, 61, 61, 49, 13, 3, 23,
+ 35, 45, 19, 45, 55, 39, 35, 61, 61, 49,
+ 35, 51, 61, 61, 49, 13, 3, 23, 28, 25,
+ 11, 1, 7, 1, 9, 14, 9, 3, 3, 43,
+ 19, 56, 34, 25, 38, 14, 1, 56, 34, 25,
+ 38, 14, 1, 56, 34, 25, 38, 14, 1, 7,
+ 14, 6, 6, 12, 28, 12, 28, 40, 46, 34,
+ 25, 46, 34, 25, 1, 76, 40, 40, 1, 30,
+ 40, 50, 50, 70, 78, 38, 14, 28, 38, 12,
+ 8, 124, 56, 40, 1, 30, 40, 50, 2, 40,
+ 8, 2, 2, 40,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 11 */
+
+ 14, 124, 4, 9, 51, 1, 58, 2, 5, 1,
+ 1, 70, 1, 9, 1, 1, 34, 30, 1, 22,
+ 53, 45, 39, 25, 25, 14, 14, 8, 42, 24,
+ 45, 124, 40, 12, 14, 59, 78, 5, 40, 1,
+ 9, 9, 35, 43, 19, 43, 53, 37, 35, 59,
+ 59, 45, 35, 51, 59, 59, 45, 11, 3, 21,
+ 35, 43, 19, 43, 53, 37, 35, 59, 59, 45,
+ 35, 51, 59, 59, 45, 11, 3, 21, 28, 25,
+ 7, 1, 7, 1, 9, 14, 9, 3, 3, 39,
+ 19, 56, 34, 25, 38, 14, 1, 56, 34, 25,
+ 38, 14, 1, 56, 34, 25, 38, 14, 1, 7,
+ 14, 6, 6, 12, 28, 12, 28, 40, 46, 34,
+ 25, 46, 34, 25, 1, 74, 40, 40, 1, 30,
+ 40, 50, 50, 70, 78, 38, 14, 28, 38, 12,
+ 8, 124, 56, 40, 1, 30, 40, 50, 4, 40,
+ 10, 4, 4, 40,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 12 */
+
+ 14, 124, 6, 9, 49, 1, 56, 0, 7, 1,
+ 1, 70, 1, 9, 1, 1, 32, 30, 1, 22,
+ 51, 43, 35, 21, 21, 14, 14, 8, 40, 24,
+ 43, 122, 40, 14, 14, 57, 78, 3, 40, 1,
+ 9, 9, 33, 41, 17, 41, 51, 35, 33, 57,
+ 57, 43, 33, 49, 57, 57, 43, 9, 1, 19,
+ 33, 41, 17, 41, 51, 35, 33, 57, 57, 43,
+ 33, 49, 57, 57, 43, 9, 1, 19, 30, 25,
+ 3, 1, 7, 1, 9, 14, 9, 1, 1, 35,
+ 17, 56, 32, 25, 38, 14, 1, 56, 32, 25,
+ 38, 14, 1, 56, 32, 25, 38, 14, 1, 7,
+ 14, 6, 6, 14, 30, 14, 30, 40, 46, 32,
+ 25, 46, 32, 25, 1, 72, 40, 40, 1, 30,
+ 40, 48, 48, 70, 78, 38, 14, 30, 38, 14,
+ 8, 124, 56, 40, 1, 30, 40, 48, 6, 40,
+ 12, 6, 6, 40,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 13 */
+
+ 14, 124, 8, 7, 47, 1, 54, 1, 9, 1,
+ 1, 72, 1, 7, 1, 1, 30, 30, 1, 24,
+ 47, 39, 31, 15, 15, 14, 14, 6, 38, 22,
+ 39, 118, 38, 16, 14, 55, 78, 0, 38, 1,
+ 7, 7, 31, 39, 15, 39, 47, 31, 31, 55,
+ 55, 39, 31, 47, 55, 55, 39, 7, 0, 15,
+ 31, 39, 15, 39, 47, 31, 31, 55, 55, 39,
+ 31, 47, 55, 55, 39, 7, 0, 15, 32, 23,
+ 0, 1, 9, 1, 7, 14, 7, 0, 0, 31,
+ 15, 54, 30, 23, 40, 14, 1, 54, 30, 23,
+ 40, 14, 1, 54, 30, 23, 40, 14, 1, 9,
+ 14, 8, 8, 16, 32, 16, 32, 38, 46, 30,
+ 23, 46, 30, 23, 1, 70, 38, 38, 1, 30,
+ 38, 46, 46, 72, 78, 40, 14, 32, 40, 16,
+ 6, 124, 54, 38, 1, 30, 38, 46, 8, 38,
+ 16, 8, 8, 38,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 14 */
+
+ 14, 124, 10, 7, 47, 1, 52, 1, 11, 1,
+ 1, 72, 1, 7, 1, 1, 30, 30, 1, 24,
+ 45, 37, 27, 11, 11, 14, 14, 6, 36, 22,
+ 37, 116, 38, 16, 14, 53, 78, 2, 38, 1,
+ 7, 7, 31, 37, 15, 37, 45, 29, 31, 53,
+ 53, 37, 31, 47, 53, 53, 37, 5, 0, 13,
+ 31, 37, 15, 37, 45, 29, 31, 53, 53, 37,
+ 31, 47, 53, 53, 37, 5, 0, 13, 32, 23,
+ 4, 1, 9, 1, 7, 14, 7, 0, 0, 27,
+ 15, 54, 30, 23, 40, 14, 1, 54, 30, 23,
+ 40, 14, 1, 54, 30, 23, 40, 14, 1, 9,
+ 14, 8, 8, 16, 32, 16, 32, 38, 46, 30,
+ 23, 46, 30, 23, 1, 68, 38, 38, 1, 30,
+ 38, 46, 46, 72, 78, 40, 14, 32, 40, 16,
+ 6, 124, 54, 38, 1, 30, 38, 46, 10, 38,
+ 18, 10, 10, 38,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 15 */
+
+ 14, 124, 12, 7, 45, 1, 50, 3, 13, 1,
+ 1, 72, 1, 7, 1, 1, 28, 30, 1, 24,
+ 43, 33, 23, 5, 5, 14, 14, 6, 34, 22,
+ 33, 112, 38, 18, 14, 51, 78, 4, 38, 1,
+ 7, 7, 29, 35, 13, 35, 43, 27, 29, 51,
+ 51, 33, 29, 45, 51, 51, 33, 3, 2, 11,
+ 29, 35, 13, 35, 43, 27, 29, 51, 51, 33,
+ 29, 45, 51, 51, 33, 3, 2, 11, 34, 23,
+ 8, 1, 9, 1, 7, 14, 7, 2, 2, 23,
+ 13, 54, 28, 23, 40, 14, 1, 54, 28, 23,
+ 40, 14, 1, 54, 28, 23, 40, 14, 1, 9,
+ 14, 8, 8, 18, 34, 18, 34, 38, 46, 28,
+ 23, 46, 28, 23, 1, 66, 38, 38, 1, 30,
+ 38, 44, 44, 72, 78, 40, 14, 34, 40, 18,
+ 6, 122, 54, 38, 1, 30, 38, 44, 12, 38,
+ 20, 12, 12, 38,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 16 */
+
+ 14, 124, 12, 7, 45, 1, 48, 5, 15, 1,
+ 1, 72, 1, 7, 1, 1, 26, 30, 1, 24,
+ 41, 31, 21, 1, 1, 14, 14, 4, 32, 20,
+ 31, 108, 36, 18, 14, 51, 78, 6, 36, 1,
+ 7, 7, 29, 35, 13, 35, 41, 25, 29, 51,
+ 51, 31, 29, 45, 51, 51, 31, 3, 2, 9,
+ 29, 35, 13, 35, 41, 25, 29, 51, 51, 31,
+ 29, 45, 51, 51, 31, 3, 2, 9, 34, 23,
+ 10, 1, 11, 1, 7, 14, 7, 2, 2, 21,
+ 13, 52, 26, 23, 40, 14, 1, 52, 26, 23,
+ 40, 14, 1, 52, 26, 23, 40, 14, 1, 11,
+ 14, 8, 8, 18, 34, 18, 34, 36, 46, 26,
+ 23, 46, 26, 23, 1, 64, 36, 36, 1, 30,
+ 36, 42, 42, 72, 78, 40, 14, 34, 40, 18,
+ 4, 118, 52, 36, 1, 30, 36, 42, 12, 36,
+ 22, 12, 12, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 17 */
+
+ 14, 124, 14, 5, 43, 1, 48, 5, 15, 1,
+ 1, 74, 1, 5, 1, 1, 26, 30, 1, 26,
+ 37, 27, 17, 4, 4, 14, 14, 4, 32, 20,
+ 27, 106, 36, 20, 14, 49, 78, 10, 36, 1,
+ 5, 5, 27, 33, 11, 33, 37, 21, 27, 49,
+ 49, 27, 27, 43, 49, 49, 27, 1, 4, 5,
+ 27, 33, 11, 33, 37, 21, 27, 49, 49, 27,
+ 27, 43, 49, 49, 27, 1, 4, 5, 36, 21,
+ 14, 1, 11, 1, 5, 14, 5, 4, 4, 17,
+ 11, 52, 26, 21, 42, 14, 1, 52, 26, 21,
+ 42, 14, 1, 52, 26, 21, 42, 14, 1, 11,
+ 14, 10, 10, 20, 36, 20, 36, 36, 46, 26,
+ 21, 46, 26, 21, 1, 64, 36, 36, 1, 30,
+ 36, 42, 42, 74, 78, 42, 14, 36, 42, 20,
+ 4, 116, 52, 36, 1, 30, 36, 42, 14, 36,
+ 26, 14, 14, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 18 */
+
+ 14, 124, 16, 5, 41, 1, 46, 7, 17, 1,
+ 1, 74, 1, 5, 1, 1, 24, 30, 1, 26,
+ 35, 23, 13, 8, 8, 14, 14, 4, 30, 20,
+ 23, 102, 36, 22, 14, 47, 78, 12, 36, 1,
+ 5, 5, 25, 31, 9, 31, 35, 19, 25, 47,
+ 47, 23, 25, 41, 47, 47, 23, 0, 6, 3,
+ 25, 31, 9, 31, 35, 19, 25, 47, 47, 23,
+ 25, 41, 47, 47, 23, 0, 6, 3, 38, 21,
+ 18, 1, 11, 1, 5, 14, 5, 6, 6, 13,
+ 9, 52, 24, 21, 42, 14, 1, 52, 24, 21,
+ 42, 14, 1, 52, 24, 21, 42, 14, 1, 11,
+ 14, 10, 10, 22, 38, 22, 38, 36, 46, 24,
+ 21, 46, 24, 21, 1, 62, 36, 36, 1, 30,
+ 36, 40, 40, 74, 78, 42, 14, 38, 42, 22,
+ 4, 114, 52, 36, 1, 30, 36, 40, 16, 36,
+ 28, 16, 16, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 19 */
+
+ 14, 124, 18, 5, 41, 1, 44, 7, 19, 1,
+ 1, 74, 1, 5, 1, 1, 24, 30, 1, 26,
+ 33, 21, 9, 14, 14, 14, 14, 4, 28, 20,
+ 21, 100, 36, 22, 14, 45, 78, 14, 36, 1,
+ 5, 5, 25, 29, 9, 29, 33, 17, 25, 45,
+ 45, 21, 25, 41, 45, 45, 21, 2, 6, 1,
+ 25, 29, 9, 29, 33, 17, 25, 45, 45, 21,
+ 25, 41, 45, 45, 21, 2, 6, 1, 38, 21,
+ 22, 1, 11, 1, 5, 14, 5, 6, 6, 9,
+ 9, 52, 24, 21, 42, 14, 1, 52, 24, 21,
+ 42, 14, 1, 52, 24, 21, 42, 14, 1, 11,
+ 14, 10, 10, 22, 38, 22, 38, 36, 46, 24,
+ 21, 46, 24, 21, 1, 60, 36, 36, 1, 30,
+ 36, 40, 40, 74, 78, 42, 14, 38, 42, 22,
+ 4, 112, 52, 36, 1, 30, 36, 40, 18, 36,
+ 30, 18, 18, 36,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 20 */
+
+ 14, 124, 20, 3, 39, 1, 42, 9, 21, 1,
+ 1, 76, 1, 3, 1, 1, 22, 30, 1, 28,
+ 31, 17, 5, 18, 18, 14, 14, 2, 26, 18,
+ 17, 96, 34, 24, 14, 43, 78, 16, 34, 1,
+ 3, 3, 23, 27, 7, 27, 31, 15, 23, 43,
+ 43, 17, 23, 39, 43, 43, 17, 4, 8, 0,
+ 23, 27, 7, 27, 31, 15, 23, 43, 43, 17,
+ 23, 39, 43, 43, 17, 4, 8, 0, 40, 19,
+ 26, 1, 13, 1, 3, 14, 3, 8, 8, 5,
+ 7, 50, 22, 19, 44, 14, 1, 50, 22, 19,
+ 44, 14, 1, 50, 22, 19, 44, 14, 1, 13,
+ 14, 12, 12, 24, 40, 24, 40, 34, 46, 22,
+ 19, 46, 22, 19, 1, 58, 34, 34, 1, 30,
+ 34, 38, 38, 76, 78, 44, 14, 40, 44, 24,
+ 2, 108, 50, 34, 1, 30, 34, 38, 20, 34,
+ 32, 20, 20, 34,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 21 */
+
+ 14, 124, 22, 3, 37, 1, 40, 11, 23, 1,
+ 1, 76, 1, 3, 1, 1, 20, 30, 1, 28,
+ 27, 15, 1, 24, 24, 14, 14, 2, 24, 18,
+ 15, 94, 34, 26, 14, 41, 78, 20, 34, 1,
+ 3, 3, 21, 25, 5, 25, 27, 11, 21, 41,
+ 41, 15, 21, 37, 41, 41, 15, 6, 10, 4,
+ 21, 25, 5, 25, 27, 11, 21, 41, 41, 15,
+ 21, 37, 41, 41, 15, 6, 10, 4, 42, 19,
+ 30, 1, 13, 1, 3, 14, 3, 10, 10, 1,
+ 5, 50, 20, 19, 44, 14, 1, 50, 20, 19,
+ 44, 14, 1, 50, 20, 19, 44, 14, 1, 13,
+ 14, 12, 12, 26, 42, 26, 42, 34, 46, 20,
+ 19, 46, 20, 19, 1, 56, 34, 34, 1, 30,
+ 34, 36, 36, 76, 78, 44, 14, 42, 44, 26,
+ 2, 106, 50, 34, 1, 30, 34, 36, 22, 34,
+ 36, 22, 22, 34,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 22 */
+
+ 14, 124, 24, 3, 37, 1, 38, 11, 25, 1,
+ 1, 76, 1, 3, 1, 1, 20, 30, 1, 28,
+ 25, 11, 2, 28, 28, 14, 14, 2, 22, 18,
+ 11, 90, 34, 26, 14, 39, 78, 22, 34, 1,
+ 3, 3, 21, 23, 5, 23, 25, 9, 21, 39,
+ 39, 11, 21, 37, 39, 39, 11, 8, 10, 6,
+ 21, 23, 5, 23, 25, 9, 21, 39, 39, 11,
+ 21, 37, 39, 39, 11, 8, 10, 6, 42, 19,
+ 34, 1, 13, 1, 3, 14, 3, 10, 10, 2,
+ 5, 50, 20, 19, 44, 14, 1, 50, 20, 19,
+ 44, 14, 1, 50, 20, 19, 44, 14, 1, 13,
+ 14, 12, 12, 26, 42, 26, 42, 34, 46, 20,
+ 19, 46, 20, 19, 1, 54, 34, 34, 1, 30,
+ 34, 36, 36, 76, 78, 44, 14, 42, 44, 26,
+ 2, 104, 50, 34, 1, 30, 34, 36, 24, 34,
+ 38, 24, 24, 34,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 23 */
+
+ 14, 124, 26, 1, 35, 1, 36, 13, 27, 1,
+ 1, 78, 1, 1, 1, 1, 18, 30, 1, 30,
+ 23, 9, 6, 34, 34, 14, 14, 0, 20, 16,
+ 9, 88, 32, 28, 14, 37, 78, 24, 32, 1,
+ 1, 1, 19, 21, 3, 21, 23, 7, 19, 37,
+ 37, 9, 19, 35, 37, 37, 9, 10, 12, 8,
+ 19, 21, 3, 21, 23, 7, 19, 37, 37, 9,
+ 19, 35, 37, 37, 9, 10, 12, 8, 44, 17,
+ 38, 1, 15, 1, 1, 14, 1, 12, 12, 6,
+ 3, 48, 18, 17, 46, 14, 1, 48, 18, 17,
+ 46, 14, 1, 48, 18, 17, 46, 14, 1, 15,
+ 14, 14, 14, 28, 44, 28, 44, 32, 46, 18,
+ 17, 46, 18, 17, 1, 52, 32, 32, 1, 30,
+ 32, 34, 34, 78, 78, 46, 14, 44, 46, 28,
+ 0, 102, 48, 32, 1, 30, 32, 34, 26, 32,
+ 40, 26, 26, 32,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 24 */
+
+ 14, 124, 28, 1, 35, 1, 34, 15, 29, 1,
+ 1, 78, 1, 1, 1, 1, 16, 30, 1, 30,
+ 21, 5, 8, 38, 38, 14, 14, 0, 18, 16,
+ 5, 84, 32, 28, 14, 35, 78, 26, 32, 1,
+ 1, 1, 19, 19, 3, 19, 21, 5, 19, 35,
+ 35, 5, 19, 35, 35, 35, 5, 12, 12, 10,
+ 19, 19, 3, 19, 21, 5, 19, 35, 35, 5,
+ 19, 35, 35, 35, 5, 12, 12, 10, 44, 17,
+ 40, 1, 15, 1, 1, 14, 1, 12, 12, 8,
+ 3, 48, 16, 17, 46, 14, 1, 48, 16, 17,
+ 46, 14, 1, 48, 16, 17, 46, 14, 1, 15,
+ 14, 14, 14, 28, 44, 28, 44, 32, 46, 16,
+ 17, 46, 16, 17, 1, 50, 32, 32, 1, 30,
+ 32, 32, 32, 78, 78, 46, 14, 44, 46, 28,
+ 0, 98, 48, 32, 1, 30, 32, 32, 28, 32,
+ 42, 28, 28, 32,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 25 */
+
+ 14, 124, 30, 1, 33, 1, 32, 15, 31, 1,
+ 1, 78, 1, 1, 1, 1, 16, 30, 1, 30,
+ 17, 1, 12, 44, 44, 14, 14, 0, 16, 16,
+ 1, 80, 32, 30, 14, 33, 78, 30, 32, 1,
+ 1, 1, 17, 17, 1, 17, 17, 1, 17, 33,
+ 33, 1, 17, 33, 33, 33, 1, 14, 14, 14,
+ 17, 17, 1, 17, 17, 1, 17, 33, 33, 1,
+ 17, 33, 33, 33, 1, 14, 14, 14, 46, 17,
+ 44, 1, 15, 1, 1, 14, 1, 14, 14, 12,
+ 1, 48, 16, 17, 46, 14, 1, 48, 16, 17,
+ 46, 14, 1, 48, 16, 17, 46, 14, 1, 15,
+ 14, 14, 14, 30, 46, 30, 46, 32, 46, 16,
+ 17, 46, 16, 17, 1, 48, 32, 32, 1, 30,
+ 32, 32, 32, 78, 78, 46, 14, 46, 46, 30,
+ 0, 96, 48, 32, 1, 30, 32, 32, 30, 32,
+ 46, 30, 30, 32,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 26 */
+
+ 14, 124, 32, 0, 31, 1, 30, 17, 33, 1,
+ 1, 80, 1, 0, 1, 1, 14, 30, 1, 32,
+ 15, 0, 16, 48, 48, 14, 14, 1, 14, 14,
+ 0, 78, 30, 32, 14, 31, 78, 32, 30, 1,
+ 0, 0, 15, 15, 0, 15, 15, 0, 15, 31,
+ 31, 0, 15, 31, 31, 31, 0, 16, 16, 16,
+ 15, 15, 0, 15, 15, 0, 15, 31, 31, 0,
+ 15, 31, 31, 31, 0, 16, 16, 16, 48, 15,
+ 48, 1, 17, 1, 0, 14, 0, 16, 16, 16,
+ 0, 46, 14, 15, 48, 14, 1, 46, 14, 15,
+ 48, 14, 1, 46, 14, 15, 48, 14, 1, 17,
+ 14, 16, 16, 32, 48, 32, 48, 30, 46, 14,
+ 15, 46, 14, 15, 1, 46, 30, 30, 1, 30,
+ 30, 30, 30, 80, 78, 48, 14, 48, 48, 32,
+ 1, 94, 46, 30, 1, 30, 30, 30, 32, 30,
+ 48, 32, 32, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 27 */
+
+ 14, 124, 34, 0, 31, 1, 28, 17, 35, 1,
+ 1, 80, 1, 0, 1, 1, 14, 30, 1, 32,
+ 13, 4, 20, 54, 54, 14, 14, 1, 12, 14,
+ 4, 74, 30, 32, 14, 29, 78, 34, 30, 1,
+ 0, 0, 15, 13, 0, 13, 13, 2, 15, 29,
+ 29, 4, 15, 31, 29, 29, 4, 18, 16, 18,
+ 15, 13, 0, 13, 13, 2, 15, 29, 29, 4,
+ 15, 31, 29, 29, 4, 18, 16, 18, 48, 15,
+ 52, 1, 17, 1, 0, 14, 0, 16, 16, 20,
+ 0, 46, 14, 15, 48, 14, 1, 46, 14, 15,
+ 48, 14, 1, 46, 14, 15, 48, 14, 1, 17,
+ 14, 16, 16, 32, 48, 32, 48, 30, 46, 14,
+ 15, 46, 14, 15, 1, 44, 30, 30, 1, 30,
+ 30, 30, 30, 80, 78, 48, 14, 48, 48, 32,
+ 1, 92, 46, 30, 1, 30, 30, 30, 34, 30,
+ 50, 34, 34, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 28 */
+
+ 14, 124, 36, 0, 29, 1, 26, 19, 37, 1,
+ 1, 80, 1, 0, 1, 1, 12, 30, 1, 32,
+ 11, 6, 24, 58, 58, 14, 14, 1, 10, 14,
+ 6, 72, 30, 34, 14, 27, 78, 36, 30, 1,
+ 0, 0, 13, 11, 2, 11, 11, 4, 13, 27,
+ 27, 6, 13, 29, 27, 27, 6, 20, 18, 20,
+ 13, 11, 2, 11, 11, 4, 13, 27, 27, 6,
+ 13, 29, 27, 27, 6, 20, 18, 20, 50, 15,
+ 56, 1, 17, 1, 0, 14, 0, 18, 18, 24,
+ 2, 46, 12, 15, 48, 14, 1, 46, 12, 15,
+ 48, 14, 1, 46, 12, 15, 48, 14, 1, 17,
+ 14, 16, 16, 34, 50, 34, 50, 30, 46, 12,
+ 15, 46, 12, 15, 1, 42, 30, 30, 1, 30,
+ 30, 28, 28, 80, 78, 48, 14, 50, 48, 34,
+ 1, 88, 46, 30, 1, 30, 30, 28, 36, 30,
+ 52, 36, 36, 30,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 29 */
+
+ 14, 124, 38, 2, 27, 1, 24, 21, 39, 1,
+ 1, 82, 1, 2, 1, 1, 10, 30, 1, 34,
+ 7, 10, 28, 64, 64, 14, 14, 3, 8, 12,
+ 10, 68, 28, 36, 14, 25, 78, 40, 28, 1,
+ 2, 2, 11, 9, 4, 9, 7, 8, 11, 25,
+ 25, 10, 11, 27, 25, 25, 10, 22, 20, 24,
+ 11, 9, 4, 9, 7, 8, 11, 25, 25, 10,
+ 11, 27, 25, 25, 10, 22, 20, 24, 52, 13,
+ 60, 1, 19, 1, 2, 14, 2, 20, 20, 28,
+ 4, 44, 10, 13, 50, 14, 1, 44, 10, 13,
+ 50, 14, 1, 44, 10, 13, 50, 14, 1, 19,
+ 14, 18, 18, 36, 52, 36, 52, 28, 46, 10,
+ 13, 46, 10, 13, 1, 40, 28, 28, 1, 30,
+ 28, 26, 26, 82, 78, 50, 14, 52, 50, 36,
+ 3, 86, 44, 28, 1, 30, 28, 26, 38, 28,
+ 56, 38, 38, 28,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 30 */
+
+ 14, 124, 40, 2, 27, 1, 22, 21, 41, 1,
+ 1, 82, 1, 2, 1, 1, 10, 30, 1, 34,
+ 5, 12, 32, 68, 68, 14, 14, 3, 6, 12,
+ 12, 66, 28, 36, 14, 23, 78, 42, 28, 1,
+ 2, 2, 11, 7, 4, 7, 5, 10, 11, 23,
+ 23, 12, 11, 27, 23, 23, 12, 24, 20, 26,
+ 11, 7, 4, 7, 5, 10, 11, 23, 23, 12,
+ 11, 27, 23, 23, 12, 24, 20, 26, 52, 13,
+ 64, 1, 19, 1, 2, 14, 2, 20, 20, 32,
+ 4, 44, 10, 13, 50, 14, 1, 44, 10, 13,
+ 50, 14, 1, 44, 10, 13, 50, 14, 1, 19,
+ 14, 18, 18, 36, 52, 36, 52, 28, 46, 10,
+ 13, 46, 10, 13, 1, 38, 28, 28, 1, 30,
+ 28, 26, 26, 82, 78, 50, 14, 52, 50, 36,
+ 3, 84, 44, 28, 1, 30, 28, 26, 40, 28,
+ 58, 40, 40, 28,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 31 */
+
+ 14, 124, 42, 2, 25, 1, 20, 23, 43, 1,
+ 1, 82, 1, 2, 1, 1, 8, 30, 1, 34,
+ 3, 16, 36, 74, 74, 14, 14, 3, 4, 12,
+ 16, 62, 28, 38, 14, 21, 78, 44, 28, 1,
+ 2, 2, 9, 5, 6, 5, 3, 12, 9, 21,
+ 21, 16, 9, 25, 21, 21, 16, 26, 22, 28,
+ 9, 5, 6, 5, 3, 12, 9, 21, 21, 16,
+ 9, 25, 21, 21, 16, 26, 22, 28, 54, 13,
+ 68, 1, 19, 1, 2, 14, 2, 22, 22, 36,
+ 6, 44, 8, 13, 50, 14, 1, 44, 8, 13,
+ 50, 14, 1, 44, 8, 13, 50, 14, 1, 19,
+ 14, 18, 18, 38, 54, 38, 54, 28, 46, 8,
+ 13, 46, 8, 13, 1, 36, 28, 28, 1, 30,
+ 28, 24, 24, 82, 78, 50, 14, 54, 50, 38,
+ 3, 82, 44, 28, 1, 30, 28, 24, 42, 28,
+ 60, 42, 42, 28,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 32 */
+
+ 14, 124, 42, 2, 25, 1, 18, 25, 45, 1,
+ 1, 82, 1, 2, 1, 1, 6, 30, 1, 34,
+ 1, 18, 38, 78, 78, 14, 14, 5, 2, 10,
+ 18, 58, 26, 38, 14, 21, 78, 46, 26, 1,
+ 2, 2, 9, 5, 6, 5, 1, 14, 9, 21,
+ 21, 18, 9, 25, 21, 21, 18, 26, 22, 30,
+ 9, 5, 6, 5, 1, 14, 9, 21, 21, 18,
+ 9, 25, 21, 21, 18, 26, 22, 30, 54, 13,
+ 70, 1, 21, 1, 2, 14, 2, 22, 22, 38,
+ 6, 42, 6, 13, 50, 14, 1, 42, 6, 13,
+ 50, 14, 1, 42, 6, 13, 50, 14, 1, 21,
+ 14, 18, 18, 38, 54, 38, 54, 26, 46, 6,
+ 13, 46, 6, 13, 1, 34, 26, 26, 1, 30,
+ 26, 22, 22, 82, 78, 50, 14, 54, 50, 38,
+ 5, 78, 42, 26, 1, 30, 26, 22, 42, 26,
+ 62, 42, 42, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 33 */
+
+ 14, 124, 44, 4, 23, 1, 18, 25, 45, 1,
+ 1, 84, 1, 4, 1, 1, 6, 30, 1, 36,
+ 2, 22, 42, 84, 84, 14, 14, 5, 2, 10,
+ 22, 56, 26, 40, 14, 19, 78, 50, 26, 1,
+ 4, 4, 7, 3, 8, 3, 2, 18, 7, 19,
+ 19, 22, 7, 23, 19, 19, 22, 28, 24, 34,
+ 7, 3, 8, 3, 2, 18, 7, 19, 19, 22,
+ 7, 23, 19, 19, 22, 28, 24, 34, 56, 11,
+ 74, 1, 21, 1, 4, 14, 4, 24, 24, 42,
+ 8, 42, 6, 11, 52, 14, 1, 42, 6, 11,
+ 52, 14, 1, 42, 6, 11, 52, 14, 1, 21,
+ 14, 20, 20, 40, 56, 40, 56, 26, 46, 6,
+ 11, 46, 6, 11, 1, 34, 26, 26, 1, 30,
+ 26, 22, 22, 84, 78, 52, 14, 56, 52, 40,
+ 5, 76, 42, 26, 1, 30, 26, 22, 44, 26,
+ 66, 44, 44, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 34 */
+
+ 14, 124, 46, 4, 21, 1, 16, 27, 47, 1,
+ 1, 84, 1, 4, 1, 1, 4, 30, 1, 36,
+ 4, 26, 46, 88, 88, 14, 14, 5, 0, 10,
+ 26, 52, 26, 42, 14, 17, 78, 52, 26, 1,
+ 4, 4, 5, 1, 10, 1, 4, 20, 5, 17,
+ 17, 26, 5, 21, 17, 17, 26, 30, 26, 36,
+ 5, 1, 10, 1, 4, 20, 5, 17, 17, 26,
+ 5, 21, 17, 17, 26, 30, 26, 36, 58, 11,
+ 78, 1, 21, 1, 4, 14, 4, 26, 26, 46,
+ 10, 42, 4, 11, 52, 14, 1, 42, 4, 11,
+ 52, 14, 1, 42, 4, 11, 52, 14, 1, 21,
+ 14, 20, 20, 42, 58, 42, 58, 26, 46, 4,
+ 11, 46, 4, 11, 1, 32, 26, 26, 1, 30,
+ 26, 20, 20, 84, 78, 52, 14, 58, 52, 42,
+ 5, 74, 42, 26, 1, 30, 26, 20, 46, 26,
+ 68, 46, 46, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 35 */
+
+ 14, 124, 48, 4, 21, 1, 14, 27, 49, 1,
+ 1, 84, 1, 4, 1, 1, 4, 30, 1, 36,
+ 6, 28, 50, 94, 94, 14, 14, 5, 1, 10,
+ 28, 50, 26, 42, 14, 15, 78, 54, 26, 1,
+ 4, 4, 5, 0, 10, 0, 6, 22, 5, 15,
+ 15, 28, 5, 21, 15, 15, 28, 32, 26, 38,
+ 5, 0, 10, 0, 6, 22, 5, 15, 15, 28,
+ 5, 21, 15, 15, 28, 32, 26, 38, 58, 11,
+ 82, 1, 21, 1, 4, 14, 4, 26, 26, 50,
+ 10, 42, 4, 11, 52, 14, 1, 42, 4, 11,
+ 52, 14, 1, 42, 4, 11, 52, 14, 1, 21,
+ 14, 20, 20, 42, 58, 42, 58, 26, 46, 4,
+ 11, 46, 4, 11, 1, 30, 26, 26, 1, 30,
+ 26, 20, 20, 84, 78, 52, 14, 58, 52, 42,
+ 5, 72, 42, 26, 1, 30, 26, 20, 48, 26,
+ 70, 48, 48, 26,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 36 */
+
+ 14, 124, 50, 6, 19, 1, 12, 29, 51, 1,
+ 1, 86, 1, 6, 1, 1, 2, 30, 1, 38,
+ 8, 32, 54, 98, 98, 14, 14, 7, 3, 8,
+ 32, 46, 24, 44, 14, 13, 78, 56, 24, 1,
+ 6, 6, 3, 2, 12, 2, 8, 24, 3, 13,
+ 13, 32, 3, 19, 13, 13, 32, 34, 28, 40,
+ 3, 2, 12, 2, 8, 24, 3, 13, 13, 32,
+ 3, 19, 13, 13, 32, 34, 28, 40, 60, 9,
+ 86, 1, 23, 1, 6, 14, 6, 28, 28, 54,
+ 12, 40, 2, 9, 54, 14, 1, 40, 2, 9,
+ 54, 14, 1, 40, 2, 9, 54, 14, 1, 23,
+ 14, 22, 22, 44, 60, 44, 60, 24, 46, 2,
+ 9, 46, 2, 9, 1, 28, 24, 24, 1, 30,
+ 24, 18, 18, 86, 78, 54, 14, 60, 54, 44,
+ 7, 68, 40, 24, 1, 30, 24, 18, 50, 24,
+ 72, 50, 50, 24,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 37 */
+
+ 14, 124, 52, 6, 17, 1, 10, 31, 53, 1,
+ 1, 86, 1, 6, 1, 1, 0, 30, 1, 38,
+ 12, 34, 58, 104, 104, 14, 14, 7, 5, 8,
+ 34, 44, 24, 46, 14, 11, 78, 60, 24, 1,
+ 6, 6, 1, 4, 14, 4, 12, 28, 1, 11,
+ 11, 34, 1, 17, 11, 11, 34, 36, 30, 44,
+ 1, 4, 14, 4, 12, 28, 1, 11, 11, 34,
+ 1, 17, 11, 11, 34, 36, 30, 44, 62, 9,
+ 90, 1, 23, 1, 6, 14, 6, 30, 30, 58,
+ 14, 40, 0, 9, 54, 14, 1, 40, 0, 9,
+ 54, 14, 1, 40, 0, 9, 54, 14, 1, 23,
+ 14, 22, 22, 46, 62, 46, 62, 24, 46, 0,
+ 9, 46, 0, 9, 1, 26, 24, 24, 1, 30,
+ 24, 16, 16, 86, 78, 54, 14, 62, 54, 46,
+ 7, 66, 40, 24, 1, 30, 24, 16, 52, 24,
+ 76, 52, 52, 24,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 38 */
+
+ 14, 124, 54, 6, 17, 1, 8, 31, 55, 1,
+ 1, 86, 1, 6, 1, 1, 0, 30, 1, 38,
+ 14, 38, 62, 108, 108, 14, 14, 7, 7, 8,
+ 38, 40, 24, 46, 14, 9, 78, 62, 24, 1,
+ 6, 6, 1, 6, 14, 6, 14, 30, 1, 9,
+ 9, 38, 1, 17, 9, 9, 38, 38, 30, 46,
+ 1, 6, 14, 6, 14, 30, 1, 9, 9, 38,
+ 1, 17, 9, 9, 38, 38, 30, 46, 62, 9,
+ 94, 1, 23, 1, 6, 14, 6, 30, 30, 62,
+ 14, 40, 0, 9, 54, 14, 1, 40, 0, 9,
+ 54, 14, 1, 40, 0, 9, 54, 14, 1, 23,
+ 14, 22, 22, 46, 62, 46, 62, 24, 46, 0,
+ 9, 46, 0, 9, 1, 24, 24, 24, 1, 30,
+ 24, 16, 16, 86, 78, 54, 14, 62, 54, 46,
+ 7, 64, 40, 24, 1, 30, 24, 16, 54, 24,
+ 78, 54, 54, 24,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 39 */
+
+ 14, 124, 56, 8, 15, 1, 6, 33, 57, 1,
+ 1, 88, 1, 8, 1, 1, 1, 30, 1, 40,
+ 16, 40, 66, 114, 114, 14, 14, 9, 9, 6,
+ 40, 38, 22, 48, 14, 7, 78, 64, 22, 1,
+ 8, 8, 0, 8, 16, 8, 16, 32, 0, 7,
+ 7, 40, 0, 15, 7, 7, 40, 40, 32, 48,
+ 0, 8, 16, 8, 16, 32, 0, 7, 7, 40,
+ 0, 15, 7, 7, 40, 40, 32, 48, 64, 7,
+ 98, 1, 25, 1, 8, 14, 8, 32, 32, 66,
+ 16, 38, 1, 7, 56, 14, 1, 38, 1, 7,
+ 56, 14, 1, 38, 1, 7, 56, 14, 1, 25,
+ 14, 24, 24, 48, 64, 48, 64, 22, 46, 1,
+ 7, 46, 1, 7, 1, 22, 22, 22, 1, 30,
+ 22, 14, 14, 88, 78, 56, 14, 64, 56, 48,
+ 9, 62, 38, 22, 1, 30, 22, 14, 56, 22,
+ 80, 56, 56, 22,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 40 */
+
+ 14, 124, 58, 8, 15, 1, 4, 35, 59, 1,
+ 1, 88, 1, 8, 1, 1, 3, 30, 1, 40,
+ 18, 44, 68, 118, 118, 14, 14, 9, 11, 6,
+ 44, 34, 22, 48, 14, 5, 78, 66, 22, 1,
+ 8, 8, 0, 10, 16, 10, 18, 34, 0, 5,
+ 5, 44, 0, 15, 5, 5, 44, 42, 32, 50,
+ 0, 10, 16, 10, 18, 34, 0, 5, 5, 44,
+ 0, 15, 5, 5, 44, 42, 32, 50, 64, 7,
+ 100, 1, 25, 1, 8, 14, 8, 32, 32, 68,
+ 16, 38, 3, 7, 56, 14, 1, 38, 3, 7,
+ 56, 14, 1, 38, 3, 7, 56, 14, 1, 25,
+ 14, 24, 24, 48, 64, 48, 64, 22, 46, 3,
+ 7, 46, 3, 7, 1, 20, 22, 22, 1, 30,
+ 22, 12, 12, 88, 78, 56, 14, 64, 56, 48,
+ 9, 58, 38, 22, 1, 30, 22, 12, 58, 22,
+ 82, 58, 58, 22,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 41 */
+
+ 14, 124, 60, 8, 13, 1, 2, 35, 61, 1,
+ 1, 88, 1, 8, 1, 1, 3, 30, 1, 40,
+ 22, 48, 72, 124, 124, 14, 14, 9, 13, 6,
+ 48, 30, 22, 50, 14, 3, 78, 70, 22, 1,
+ 8, 8, 2, 12, 18, 12, 22, 38, 2, 3,
+ 3, 48, 2, 13, 3, 3, 48, 44, 34, 54,
+ 2, 12, 18, 12, 22, 38, 2, 3, 3, 48,
+ 2, 13, 3, 3, 48, 44, 34, 54, 66, 7,
+ 104, 1, 25, 1, 8, 14, 8, 34, 34, 72,
+ 18, 38, 3, 7, 56, 14, 1, 38, 3, 7,
+ 56, 14, 1, 38, 3, 7, 56, 14, 1, 25,
+ 14, 24, 24, 50, 66, 50, 66, 22, 46, 3,
+ 7, 46, 3, 7, 1, 18, 22, 22, 1, 30,
+ 22, 12, 12, 88, 78, 56, 14, 66, 56, 50,
+ 9, 56, 38, 22, 1, 30, 22, 12, 60, 22,
+ 86, 60, 60, 22,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 42 */
+
+ 14, 124, 62, 10, 11, 1, 0, 37, 63, 1,
+ 1, 90, 1, 10, 1, 1, 5, 30, 1, 42,
+ 24, 50, 76, 124, 124, 14, 14, 11, 15, 4,
+ 50, 28, 20, 52, 14, 1, 78, 72, 20, 1,
+ 10, 10, 4, 14, 20, 14, 24, 40, 4, 1,
+ 1, 50, 4, 11, 1, 1, 50, 46, 36, 56,
+ 4, 14, 20, 14, 24, 40, 4, 1, 1, 50,
+ 4, 11, 1, 1, 50, 46, 36, 56, 68, 5,
+ 108, 1, 27, 1, 10, 14, 10, 36, 36, 76,
+ 20, 36, 5, 5, 58, 14, 1, 36, 5, 5,
+ 58, 14, 1, 36, 5, 5, 58, 14, 1, 27,
+ 14, 26, 26, 52, 68, 52, 68, 20, 46, 5,
+ 5, 46, 5, 5, 1, 16, 20, 20, 1, 30,
+ 20, 10, 10, 90, 78, 58, 14, 68, 58, 52,
+ 11, 54, 36, 20, 1, 30, 20, 10, 62, 20,
+ 88, 62, 62, 20,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 43 */
+
+ 14, 124, 64, 10, 11, 1, 1, 37, 65, 1,
+ 1, 90, 1, 10, 1, 1, 5, 30, 1, 42,
+ 26, 54, 80, 124, 124, 14, 14, 11, 17, 4,
+ 54, 24, 20, 52, 14, 0, 78, 74, 20, 1,
+ 10, 10, 4, 16, 20, 16, 26, 42, 4, 0,
+ 0, 54, 4, 11, 0, 0, 54, 48, 36, 58,
+ 4, 16, 20, 16, 26, 42, 4, 0, 0, 54,
+ 4, 11, 0, 0, 54, 48, 36, 58, 68, 5,
+ 112, 1, 27, 1, 10, 14, 10, 36, 36, 80,
+ 20, 36, 5, 5, 58, 14, 1, 36, 5, 5,
+ 58, 14, 1, 36, 5, 5, 58, 14, 1, 27,
+ 14, 26, 26, 52, 68, 52, 68, 20, 46, 5,
+ 5, 46, 5, 5, 1, 14, 20, 20, 1, 30,
+ 20, 10, 10, 90, 78, 58, 14, 68, 58, 52,
+ 11, 52, 36, 20, 1, 30, 20, 10, 64, 20,
+ 90, 64, 64, 20,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 44 */
+
+ 14, 124, 66, 10, 9, 1, 3, 39, 67, 1,
+ 1, 90, 1, 10, 1, 1, 7, 30, 1, 42,
+ 28, 56, 84, 124, 124, 14, 14, 11, 19, 4,
+ 56, 22, 20, 54, 14, 2, 78, 76, 20, 1,
+ 10, 10, 6, 18, 22, 18, 28, 44, 6, 2,
+ 2, 56, 6, 9, 2, 2, 56, 50, 38, 60,
+ 6, 18, 22, 18, 28, 44, 6, 2, 2, 56,
+ 6, 9, 2, 2, 56, 50, 38, 60, 70, 5,
+ 116, 1, 27, 1, 10, 14, 10, 38, 38, 84,
+ 22, 36, 7, 5, 58, 14, 1, 36, 7, 5,
+ 58, 14, 1, 36, 7, 5, 58, 14, 1, 27,
+ 14, 26, 26, 54, 70, 54, 70, 20, 46, 7,
+ 5, 46, 7, 5, 1, 12, 20, 20, 1, 30,
+ 20, 8, 8, 90, 78, 58, 14, 70, 58, 54,
+ 11, 48, 36, 20, 1, 30, 20, 8, 66, 20,
+ 92, 66, 66, 20,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 45 */
+
+ 14, 124, 68, 12, 7, 1, 5, 41, 69, 1,
+ 1, 92, 1, 12, 1, 1, 9, 30, 1, 44,
+ 32, 60, 88, 124, 124, 14, 14, 13, 21, 2,
+ 60, 18, 18, 56, 14, 4, 78, 80, 18, 1,
+ 12, 12, 8, 20, 24, 20, 32, 48, 8, 4,
+ 4, 60, 8, 7, 4, 4, 60, 52, 40, 64,
+ 8, 20, 24, 20, 32, 48, 8, 4, 4, 60,
+ 8, 7, 4, 4, 60, 52, 40, 64, 72, 3,
+ 120, 1, 29, 1, 12, 14, 12, 40, 40, 88,
+ 24, 34, 9, 3, 60, 14, 1, 34, 9, 3,
+ 60, 14, 1, 34, 9, 3, 60, 14, 1, 29,
+ 14, 28, 28, 56, 72, 56, 72, 18, 46, 9,
+ 3, 46, 9, 3, 1, 10, 18, 18, 1, 30,
+ 18, 6, 6, 92, 78, 60, 14, 72, 60, 56,
+ 13, 46, 34, 18, 1, 30, 18, 6, 68, 18,
+ 96, 68, 68, 18,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 46 */
+
+ 14, 124, 70, 12, 7, 1, 7, 41, 71, 1,
+ 1, 92, 1, 12, 1, 1, 9, 30, 1, 44,
+ 34, 62, 92, 124, 124, 14, 14, 13, 23, 2,
+ 62, 16, 18, 56, 14, 6, 78, 82, 18, 1,
+ 12, 12, 8, 22, 24, 22, 34, 50, 8, 6,
+ 6, 62, 8, 7, 6, 6, 62, 54, 40, 66,
+ 8, 22, 24, 22, 34, 50, 8, 6, 6, 62,
+ 8, 7, 6, 6, 62, 54, 40, 66, 72, 3,
+ 124, 1, 29, 1, 12, 14, 12, 40, 40, 92,
+ 24, 34, 9, 3, 60, 14, 1, 34, 9, 3,
+ 60, 14, 1, 34, 9, 3, 60, 14, 1, 29,
+ 14, 28, 28, 56, 72, 56, 72, 18, 46, 9,
+ 3, 46, 9, 3, 1, 8, 18, 18, 1, 30,
+ 18, 6, 6, 92, 78, 60, 14, 72, 60, 56,
+ 13, 44, 34, 18, 1, 30, 18, 6, 70, 18,
+ 98, 70, 70, 18,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 47 */
+
+ 14, 124, 72, 12, 5, 1, 9, 43, 73, 1,
+ 1, 92, 1, 12, 1, 1, 11, 30, 1, 44,
+ 36, 66, 96, 124, 124, 14, 14, 13, 25, 2,
+ 66, 12, 18, 58, 14, 8, 78, 84, 18, 1,
+ 12, 12, 10, 24, 26, 24, 36, 52, 10, 8,
+ 8, 66, 10, 5, 8, 8, 66, 56, 42, 68,
+ 10, 24, 26, 24, 36, 52, 10, 8, 8, 66,
+ 10, 5, 8, 8, 66, 56, 42, 68, 74, 3,
+ 124, 1, 29, 1, 12, 14, 12, 42, 42, 96,
+ 26, 34, 11, 3, 60, 14, 1, 34, 11, 3,
+ 60, 14, 1, 34, 11, 3, 60, 14, 1, 29,
+ 14, 28, 28, 58, 74, 58, 74, 18, 46, 11,
+ 3, 46, 11, 3, 1, 6, 18, 18, 1, 30,
+ 18, 4, 4, 92, 78, 60, 14, 74, 60, 58,
+ 13, 42, 34, 18, 1, 30, 18, 4, 72, 18,
+ 100, 72, 72, 18,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 48 */
+
+ 14, 124, 72, 12, 5, 1, 11, 45, 75, 1,
+ 1, 92, 1, 12, 1, 1, 13, 30, 1, 44,
+ 38, 68, 98, 124, 124, 14, 14, 15, 27, 0,
+ 68, 8, 16, 58, 14, 8, 78, 86, 16, 1,
+ 12, 12, 10, 24, 26, 24, 38, 54, 10, 8,
+ 8, 68, 10, 5, 8, 8, 68, 56, 42, 70,
+ 10, 24, 26, 24, 38, 54, 10, 8, 8, 68,
+ 10, 5, 8, 8, 68, 56, 42, 70, 74, 3,
+ 124, 1, 31, 1, 12, 14, 12, 42, 42, 98,
+ 26, 32, 13, 3, 60, 14, 1, 32, 13, 3,
+ 60, 14, 1, 32, 13, 3, 60, 14, 1, 31,
+ 14, 28, 28, 58, 74, 58, 74, 16, 46, 13,
+ 3, 46, 13, 3, 1, 4, 16, 16, 1, 30,
+ 16, 2, 2, 92, 78, 60, 14, 74, 60, 58,
+ 15, 38, 32, 16, 1, 30, 16, 2, 72, 16,
+ 102, 72, 72, 16,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 49 */
+
+ 14, 124, 74, 14, 3, 1, 11, 45, 75, 1,
+ 1, 94, 1, 14, 1, 1, 13, 30, 1, 46,
+ 42, 72, 102, 124, 124, 14, 14, 15, 27, 0,
+ 72, 6, 16, 60, 14, 10, 78, 90, 16, 1,
+ 14, 14, 12, 26, 28, 26, 42, 58, 12, 10,
+ 10, 72, 12, 3, 10, 10, 72, 58, 44, 74,
+ 12, 26, 28, 26, 42, 58, 12, 10, 10, 72,
+ 12, 3, 10, 10, 72, 58, 44, 74, 76, 1,
+ 124, 1, 31, 1, 14, 14, 14, 44, 44, 102,
+ 28, 32, 13, 1, 62, 14, 1, 32, 13, 1,
+ 62, 14, 1, 32, 13, 1, 62, 14, 1, 31,
+ 14, 30, 30, 60, 76, 60, 76, 16, 46, 13,
+ 1, 46, 13, 1, 1, 4, 16, 16, 1, 30,
+ 16, 2, 2, 94, 78, 62, 14, 76, 62, 60,
+ 15, 36, 32, 16, 1, 30, 16, 2, 74, 16,
+ 106, 74, 74, 16,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 50 */
+
+ 14, 124, 76, 14, 1, 1, 13, 47, 77, 1,
+ 1, 94, 1, 14, 1, 1, 15, 30, 1, 46,
+ 44, 76, 106, 124, 124, 14, 14, 15, 29, 0,
+ 76, 2, 16, 62, 14, 12, 78, 92, 16, 1,
+ 14, 14, 14, 28, 30, 28, 44, 60, 14, 12,
+ 12, 76, 14, 1, 12, 12, 76, 60, 46, 76,
+ 14, 28, 30, 28, 44, 60, 14, 12, 12, 76,
+ 14, 1, 12, 12, 76, 60, 46, 76, 78, 1,
+ 124, 1, 31, 1, 14, 14, 14, 46, 46, 106,
+ 30, 32, 15, 1, 62, 14, 1, 32, 15, 1,
+ 62, 14, 1, 32, 15, 1, 62, 14, 1, 31,
+ 14, 30, 30, 62, 78, 62, 78, 16, 46, 15,
+ 1, 46, 15, 1, 1, 2, 16, 16, 1, 30,
+ 16, 0, 0, 94, 78, 62, 14, 78, 62, 62,
+ 15, 34, 32, 16, 1, 30, 16, 0, 76, 16,
+ 108, 76, 76, 16,
+ },
+
+ {
+ /* Context Tables for init_idc = 2, qp = 51 */
+
+ 14, 124, 78, 14, 1, 1, 15, 47, 79, 1,
+ 1, 94, 1, 14, 1, 1, 15, 30, 1, 46,
+ 46, 78, 110, 124, 124, 14, 14, 15, 31, 0,
+ 78, 0, 16, 62, 14, 14, 78, 94, 16, 1,
+ 14, 14, 14, 30, 30, 30, 46, 62, 14, 14,
+ 14, 78, 14, 1, 14, 14, 78, 62, 46, 78,
+ 14, 30, 30, 30, 46, 62, 14, 14, 14, 78,
+ 14, 1, 14, 14, 78, 62, 46, 78, 78, 1,
+ 124, 1, 31, 1, 14, 14, 14, 46, 46, 110,
+ 30, 32, 15, 1, 62, 14, 1, 32, 15, 1,
+ 62, 14, 1, 32, 15, 1, 62, 14, 1, 31,
+ 14, 30, 30, 62, 78, 62, 78, 16, 46, 15,
+ 1, 46, 15, 1, 1, 0, 16, 16, 1, 30,
+ 16, 0, 0, 94, 78, 62, 14, 78, 62, 62,
+ 15, 32, 32, 16, 1, 30, 16, 0, 78, 16,
+ 110, 78, 78, 16,
+ },
+ },
+};
diff --git a/common/ihevc_cabac_tables.h b/common/ihevc_cabac_tables.h
new file mode 100644
index 0000000..9ed1a2c
--- /dev/null
+++ b/common/ihevc_cabac_tables.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+******************************************************************************
+* @file ihevc_cabac_tables.h
+*
+* @brief
+* This file contains the enumerations, macros and extern declarations for
+* the HEVC CABAC tables
+*
+* @author
+* Ittiam
+******************************************************************************
+*/
+
+#ifndef _IHEVC_CABAC_TABLES_H_
+#define _IHEVC_CABAC_TABLES_H_
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+ * @brief number of cabac_init_idc values (0-2)
+******************************************************************************
+ */
+#define IHEVC_NUM_CAB_IDC 3
+
+/**
+******************************************************************************
+ * @brief number of QP values in HEVC (0-51)
+******************************************************************************
+ */
+#define IHEVC_MAX_QP 52
+
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+
+/**
+ *****************************************************************************
+ * @brief start offsets of cabac contexts for various syntax elements
+ *
+ * @remarks Init context model generation and these offsets are tightly
+ *          coupled; see the cabac table generation utility for these offsets
+ *****************************************************************************
+ */
+typedef enum
+{
+ IHEVC_CAB_SAO_MERGE = 0,
+ IHEVC_CAB_SAO_TYPE = IHEVC_CAB_SAO_MERGE + 1,
+ IHEVC_CAB_SPLIT_CU_FLAG = IHEVC_CAB_SAO_TYPE + 1,
+ IHEVC_CAB_CU_TQ_BYPASS_FLAG = IHEVC_CAB_SPLIT_CU_FLAG + 3,
+ IHEVC_CAB_SKIP_FLAG = IHEVC_CAB_CU_TQ_BYPASS_FLAG + 1,
+ IHEVC_CAB_QP_DELTA_ABS = IHEVC_CAB_SKIP_FLAG + 3,
+ IHEVC_CAB_PRED_MODE = IHEVC_CAB_QP_DELTA_ABS + 2,
+ IHEVC_CAB_PART_MODE = IHEVC_CAB_PRED_MODE + 1,
+ IHEVC_CAB_INTRA_LUMA_PRED_FLAG = IHEVC_CAB_PART_MODE + 4,
+ IHEVC_CAB_CHROMA_PRED_MODE = IHEVC_CAB_INTRA_LUMA_PRED_FLAG + 1,
+ IHEVC_CAB_MERGE_FLAG_EXT = IHEVC_CAB_CHROMA_PRED_MODE + 1,
+ IHEVC_CAB_MERGE_IDX_EXT = IHEVC_CAB_MERGE_FLAG_EXT + 1,
+ IHEVC_CAB_INTER_PRED_IDC = IHEVC_CAB_MERGE_IDX_EXT + 1,
+ IHEVC_CAB_INTER_REF_IDX = IHEVC_CAB_INTER_PRED_IDC + 5,
+ IHEVC_CAB_MVD_GRT0 = IHEVC_CAB_INTER_REF_IDX + 2,
+ IHEVC_CAB_MVD_GRT1 = IHEVC_CAB_MVD_GRT0 + 1,
+ IHEVC_CAB_MVP_L0L1 = IHEVC_CAB_MVD_GRT1 + 1,
+ IHEVC_CAB_NORES_IDX = IHEVC_CAB_MVP_L0L1 + 1,
+ IHEVC_CAB_SPLIT_TFM = IHEVC_CAB_NORES_IDX + 1,
+ IHEVC_CAB_CBF_LUMA_IDX = IHEVC_CAB_SPLIT_TFM + 3,
+ IHEVC_CAB_CBCR_IDX = IHEVC_CAB_CBF_LUMA_IDX + 2,
+ IHEVC_CAB_TFM_SKIP0 = IHEVC_CAB_CBCR_IDX + 4,
+ IHEVC_CAB_TFM_SKIP12 = IHEVC_CAB_TFM_SKIP0 + 1,
+ IHEVC_CAB_COEFFX_PREFIX = IHEVC_CAB_TFM_SKIP12 + 1,
+ IHEVC_CAB_COEFFY_PREFIX = IHEVC_CAB_COEFFX_PREFIX + 18,
+ IHEVC_CAB_CODED_SUBLK_IDX = IHEVC_CAB_COEFFY_PREFIX + 18,
+ IHEVC_CAB_COEFF_FLAG = IHEVC_CAB_CODED_SUBLK_IDX + 4,
+ IHEVC_CAB_COEFABS_GRTR1_FLAG = IHEVC_CAB_COEFF_FLAG + 42,
+ IHEVC_CAB_COEFABS_GRTR2_FLAG = IHEVC_CAB_COEFABS_GRTR1_FLAG + 24,
+ IHEVC_CAB_CTXT_END = IHEVC_CAB_COEFABS_GRTR2_FLAG + 6
+}IHEVC_CABAC_CTXT_OFFSETS;
+
+
+/*****************************************************************************/
+/* Extern global declarations */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ * @brief Table for rangeTabLPS depending on pStateIdx and qCodIRangeIdx
+ * input : pStateIdx(0-63) and qCodIRangeIdx(0-3) [(Range >> 6) & 0x3]
+ * output : RLps
+ *
+ * @remarks See Table 9-40 of HEVC spec for rangeTabLPS
+ *******************************************************************************
+ */
+extern const UWORD8 gau1_ihevc_cabac_rlps[64][4];
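+
+/**
+ ******************************************************************************
+ * @brief Illustrative sketch of a regular-bin decode step consuming the
+ *        rangeTabLPS lookup above; the names ctxt, u4_range and u4_offset
+ *        are assumptions for illustration, not part of this interface
+ *
+ *   state = ctxt >> 1;  mps = ctxt & 1;   // unpack pState and MPS
+ *   rlps = gau1_ihevc_cabac_rlps[state][(u4_range >> 6) & 0x3];
+ *   u4_range -= rlps;                     // tentative MPS sub-interval
+ *   if(u4_offset >= u4_range)             // LPS decoded
+ *   {  bin = 1 - mps;  u4_offset -= u4_range;  u4_range = rlps;  }
+ *   else                                  // MPS decoded
+ *      bin = mps;
+ *   // state transition and renormalization follow
+ ******************************************************************************
+ */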
+
+
+/**
+ ******************************************************************************
+ * @brief probability + MPS state transition table based on current state and bin
+ * input : curpState[bits7-2] | curMPS[bit1] | decodedBin[bit0]
+ * output : nextpState[bits6-1] | nextMPS[bit0]
+ * @remarks Modified form of Table-9-41 State Transition table in HEVC spec
+ ******************************************************************************
+ */
+extern const UWORD8 gau1_ihevc_next_state[128*2];
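+
+/**
+ ******************************************************************************
+ * @brief Illustrative sketch (assumed names, not part of this interface):
+ *        since the table output uses the same packing as the context bytes
+ *        (pState in bits[6-1], MPS in lsb), the per-bin context update is a
+ *        single lookup
+ *
+ *   ctxt = gau1_ihevc_next_state[(ctxt << 1) | bin];
+ ******************************************************************************
+ */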
+
+/**
+ ******************************************************************************
+ * @brief Init context tables for all combinations of qp and cabac_init_idc
+ * @remarks Packing format MPS in lsb and pState in bits[1-6]
+ ******************************************************************************
+ */
+extern const UWORD8 gau1_ihevc_cab_ctxts[IHEVC_NUM_CAB_IDC][IHEVC_MAX_QP][IHEVC_CAB_CTXT_END];
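+
+/**
+ ******************************************************************************
+ * @brief Illustrative sketch (assumed names, not part of this interface) of
+ *        initializing the contexts at the start of a slice and of addressing
+ *        one context through the offsets in IHEVC_CABAC_CTXT_OFFSETS
+ *
+ *   UWORD8 au1_ctxts[IHEVC_CAB_CTXT_END];
+ *   memcpy(au1_ctxts, gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp],
+ *          IHEVC_CAB_CTXT_END);
+ *   ctxt = au1_ctxts[IHEVC_CAB_SKIP_FLAG + ctxt_inc];
+ ******************************************************************************
+ */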
+
+
+
+#endif /* _IHEVC_CABAC_TABLES_H_ */
diff --git a/common/ihevc_chroma_intra_pred.h b/common/ihevc_chroma_intra_pred.h
new file mode 100644
index 0000000..c4ca13b
--- /dev/null
+++ b/common/ihevc_chroma_intra_pred.h
@@ -0,0 +1,358 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_intra_pred.h
+*
+* @brief
+* Declarations for the functions defined in ihevc_intra_pred_filters
+*
+* @author
+* Mamatha
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_CHROMA_INTRA_PRED_H_
+#define IHEVC_CHROMA_INTRA_PRED_H_
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+typedef void ihevc_intra_pred_chroma_planar_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_dc_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_horz_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_ver_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode2_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_18_34_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_3_to_9_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_11_to_17_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_19_to_25_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_mode_27_to_33_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_chroma_ref_substitution_ft(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd);
+
+typedef void ihevc_hbd_intra_pred_chroma_planar_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_dc_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_horz_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_hbd_intra_pred_chroma_ver_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode2_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_18_34_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_3_to_9_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_11_to_17_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_19_to_25_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_mode_27_to_33_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_hbd_intra_pred_chroma_ref_substitution_ft(UWORD16 *pu2_top_left,
+ UWORD16 *pu2_top,
+ UWORD16 *pu2_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ UWORD8 bit_depth);
+
+/* C function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution;
+
+ihevc_hbd_intra_pred_chroma_planar_ft ihevc_hbd_intra_pred_chroma_planar;
+ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc;
+ihevc_hbd_intra_pred_chroma_horz_ft ihevc_hbd_intra_pred_chroma_horz;
+ihevc_hbd_intra_pred_chroma_ver_ft ihevc_hbd_intra_pred_chroma_ver;
+ihevc_hbd_intra_pred_chroma_mode2_ft ihevc_hbd_intra_pred_chroma_mode2;
+ihevc_hbd_intra_pred_chroma_mode_18_34_ft ihevc_hbd_intra_pred_chroma_mode_18_34;
+ihevc_hbd_intra_pred_chroma_mode_3_to_9_ft ihevc_hbd_intra_pred_chroma_mode_3_to_9;
+ihevc_hbd_intra_pred_chroma_mode_11_to_17_ft ihevc_hbd_intra_pred_chroma_mode_11_to_17;
+ihevc_hbd_intra_pred_chroma_mode_19_to_25_ft ihevc_hbd_intra_pred_chroma_mode_19_to_25;
+ihevc_hbd_intra_pred_chroma_mode_27_to_33_ft ihevc_hbd_intra_pred_chroma_mode_27_to_33;
+ihevc_hbd_intra_pred_chroma_ref_substitution_ft ihevc_hbd_intra_pred_chroma_ref_substitution;
+
+/* A9Q function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_a9q;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_a9q;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_a9q;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_a9q;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_a9q;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_a9q;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_a9q;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_a9q;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_a9q;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_a9q;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_a9q;
+
+/* NEON intrinsics function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_neonintr;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_neonintr;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_neonintr;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_neonintr;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_neonintr;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_neonintr;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_neonintr;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_neonintr;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_neonintr;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_neonintr;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_ssse3;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_ssse3;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_ssse3;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_ssse3;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_ssse3;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_ssse3;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_ssse3;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_ssse3;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_ssse3;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_ssse3;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_ssse3;
+
+/* SSE4.2 function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_sse42;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_sse42;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_sse42;
+
+ihevc_hbd_intra_pred_chroma_planar_ft ihevc_hbd_intra_pred_chroma_planar_sse42;
+ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc_sse42;
+ihevc_hbd_intra_pred_chroma_horz_ft ihevc_hbd_intra_pred_chroma_horz_sse42;
+ihevc_hbd_intra_pred_chroma_ver_ft ihevc_hbd_intra_pred_chroma_ver_sse42;
+ihevc_hbd_intra_pred_chroma_mode2_ft ihevc_hbd_intra_pred_chroma_mode2_sse42;
+ihevc_hbd_intra_pred_chroma_mode_18_34_ft ihevc_hbd_intra_pred_chroma_mode_18_34_sse42;
+ihevc_hbd_intra_pred_chroma_mode_3_to_9_ft ihevc_hbd_intra_pred_chroma_mode_3_to_9_sse42;
+ihevc_hbd_intra_pred_chroma_mode_11_to_17_ft ihevc_hbd_intra_pred_chroma_mode_11_to_17_sse42;
+ihevc_hbd_intra_pred_chroma_mode_19_to_25_ft ihevc_hbd_intra_pred_chroma_mode_19_to_25_sse42;
+ihevc_hbd_intra_pred_chroma_mode_27_to_33_ft ihevc_hbd_intra_pred_chroma_mode_27_to_33_sse42;
+ihevc_hbd_intra_pred_chroma_ref_substitution_ft ihevc_hbd_intra_pred_chroma_ref_substitution_sse42;
+
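+/* A9A function declarations */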
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_a9a;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_a9a;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_a9a;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_a9a;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_a9a;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_a9a;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_a9a;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_a9a;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_a9a;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_a9a;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_a9a;
+
+/* AVX function declarations */
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_avx;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_avx;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_avx;
+
+ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc_avx;
+ihevc_hbd_intra_pred_chroma_mode_18_34_ft ihevc_hbd_intra_pred_chroma_mode_18_34_avx;
+ihevc_hbd_intra_pred_chroma_ver_ft ihevc_hbd_intra_pred_chroma_ver_avx;
+
+/* armv8 function declarations */
+ihevc_intra_pred_chroma_planar_ft ihevc_intra_pred_chroma_planar_av8;
+ihevc_intra_pred_chroma_dc_ft ihevc_intra_pred_chroma_dc_av8;
+ihevc_intra_pred_chroma_horz_ft ihevc_intra_pred_chroma_horz_av8;
+ihevc_intra_pred_chroma_ver_ft ihevc_intra_pred_chroma_ver_av8;
+ihevc_intra_pred_chroma_mode2_ft ihevc_intra_pred_chroma_mode2_av8;
+ihevc_intra_pred_chroma_mode_18_34_ft ihevc_intra_pred_chroma_mode_18_34_av8;
+ihevc_intra_pred_chroma_mode_3_to_9_ft ihevc_intra_pred_chroma_mode_3_to_9_av8;
+ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17_av8;
+ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25_av8;
+ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33_av8;
+ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution_av8;
+#endif /* IHEVC_CHROMA_INTRA_PRED_H_ */
diff --git a/common/ihevc_chroma_intra_pred_filters.c b/common/ihevc_chroma_intra_pred_filters.c
new file mode 100644
index 0000000..8b3c992
--- /dev/null
+++ b/common/ihevc_chroma_intra_pred_filters.c
@@ -0,0 +1,1277 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_intra_pred_filters.c
+*
+* @brief
+* Contains function definitions for chroma intra prediction interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* ihevc_intra_pred_chroma_planar()
+*
+* ihevc_intra_pred_chroma_dc()
+*
+* ihevc_intra_pred_chroma_horz()
+*
+* ihevc_intra_pred_chroma_ver()
+*
+* ihevc_intra_pred_chroma_mode2()
+*
+* ihevc_intra_pred_chroma_mode_18_34()
+*
+* ihevc_intra_pred_chroma_mode_3_to_9()
+*
+* ihevc_intra_pred_chroma_mode_11_to_17()
+*
+* ihevc_intra_pred_chroma_mode_19_to_25()
+*
+* ihevc_intra_pred_chroma_mode_27_to_33()
+*
+* ihevc_intra_pred_chroma_ref_substitution()
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+
+/* Returns 1 if bit 'x' of 'y' is set, 0 otherwise */
+#define GET_BIT(y,x) (((y) >> (x)) & 1)
+
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Reference substitution process for samples unavailable for prediction
+* Refer to section 8.4.4.2.2
+*
+* @par Description:
+*
+*
+* @param[in] pu1_top_left
+* UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+* UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+* UWORD8 pointer to the left
+*
+* @param[in] src_strd
+* WORD32 Source stride
+*
+* @param[in] nt
+* WORD32 transform Block size
+*
+* @param[in] nbr_flags
+* WORD32 neighbor availability flags
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] dst_strd
+* WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+ UWORD8 pu1_ref_u, pu1_ref_v;
+ WORD32 dc_val, i, j;
+ WORD32 total_samples = (4 * nt) + 1;
+ WORD32 get_bits;
+ WORD32 next;
+ WORD32 bot_left, left, top, tp_right, tp_left;
+ WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+ WORD32 a_nbr_flag[5];
+ UNUSED(dst_strd);
+ /* Neighbor Flag Structure*/
+ /* WORD32 nbr_flags MSB-->LSB TOP LEFT | TOP-RIGHT | TOP | LEFT | BOTTOM LEFT*/
+ /* (1 bit) (4 bits) (4 bits) (4 bits) (4 bits) */
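+    /* Illustrative extraction of these fields, matching the masks used    */
+    /* below: the 4-bit LEFT field is (nbr_flags >> 4) & 0xF and the       */
+    /* single TOP-LEFT bit is (nbr_flags >> 16) & 0x1                      */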
+
+ if(nbr_flags == 0)
+ {
+        /* If no neighbor flags are present, fill the neighbor samples with the DC value */
+ /*dc_val = 1 << (BIT_DEPTH - 1);*/
+ dc_val = 1 << (8 - 1);
+ for(i = 0; i < (2 * total_samples); i++)
+ {
+ pu1_dst[i] = dc_val;
+ }
+ }
+ else
+ {
+ /* Else fill the corresponding samples */
+
+        /* Check for the neighbors' availability */
+ tp_left = (nbr_flags & 0x10000);
+ tp_right = (nbr_flags & 0x0f000);
+ top = (nbr_flags & 0x00f00);
+ left = (nbr_flags & 0x000f0);
+ bot_left = (nbr_flags & 0x0000f);
+
+        /* Fill nbrs depending on availability */
+        /* Top-left nbrs */
+ if(0 != tp_left)
+ {
+ pu1_dst[(4 * nt)] = *pu1_top_left; // U top-left sample
+ pu1_dst[(4 * nt) + 1] = *(pu1_top_left + 1); // V top-left sample
+ }
+ /* Left nbrs */
+ if(0 != left)
+ {
+ for(i = 0, j = 0; i < (2 * nt); i += 2)
+ {
+ pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+ pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+ j++;
+ }
+ }
+ /* Bottom - Left nbrs */
+ if(0 != bot_left)
+ {
+ for(i = (2 * nt), j = nt; i < (4 * nt); i += 2)
+ {
+ pu1_dst[(4 * nt) - 2 - i] = pu1_left[j * src_strd]; // U left samples
+ pu1_dst[(4 * nt) - 1 - i] = pu1_left[(j * src_strd) + 1]; // V left samples
+ j++;
+ }
+ }
+ /* Top nbrs */
+ if(0 != top)
+ {
+ ihevc_memcpy_mul_8(&pu1_dst[(4 * nt) + 2], pu1_top, 2 * nt);
+            // U-V interleaved top samples
+ }
+
+ /* Top - Right nbrs */
+ if(0 != tp_right)
+ {
+ ihevc_memcpy_mul_8(&pu1_dst[(4 * nt) + 2 + 2 * nt], pu1_top + 2 * nt, 2 * nt);
+            // U-V interleaved top-right samples
+ }
+
+ if(nt == 4)
+ {
+ /* 1 bit extraction for all the neighboring blocks */
+ tp_left = (nbr_flags & 0x10000) >> 16;
+ bot_left = (nbr_flags & 0x8) >> 3;
+ left = (nbr_flags & 0x80) >> 7;
+ top = (nbr_flags & 0x100) >> 8;
+ tp_right = (nbr_flags & 0x1000) >> 12;
+
+ next = 1;
+ a_nbr_flag[0] = bot_left;
+ a_nbr_flag[1] = left;
+ a_nbr_flag[2] = tp_left;
+ a_nbr_flag[3] = top;
+ a_nbr_flag[4] = tp_right;
+
+            /* If bottom-left is not available, apply the reverse substitution process */
+ if(bot_left == 0)
+ {
+ /* Check for the 1st available sample from bottom-left*/
+ while(!a_nbr_flag[next])
+ next++;
+
+ /* If Left, top-left are available*/
+ if(next <= 2)
+ {
+ UWORD16 *pu2_dst;
+ idx = (nt * next);
+ pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+ ihevc_memset_16bit((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+ }
+ else /* If top, top-right are available */
+ {
+ UWORD16 *pu2_dst;
+                /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+ idx = (nt * (next - 1)) + 1;
+ pu2_dst = (UWORD16 *)&pu1_dst[2 * idx];
+ ihevc_memset_16bit((UWORD16 *)pu1_dst, pu2_dst[0], idx);
+ }
+ }
+
+ if(left == 0)
+ {
+ UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(2 * nt) - 2];
+ ihevc_memset_16bit((UWORD16 *)&pu1_dst[(2 * nt)], pu2_dst[0], nt);
+
+
+ }
+ if(tp_left == 0)
+ {
+ pu1_dst[4 * nt] = pu1_dst[(4 * nt) - 2];
+ pu1_dst[(4 * nt) + 1] = pu1_dst[(4 * nt) - 1];
+ }
+ if(top == 0)
+ {
+ UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(4 * nt)];
+ ihevc_memset_16bit((UWORD16 *)&pu1_dst[(4 * nt) + 2], pu2_dst[0], nt);
+
+
+ }
+ if(tp_right == 0)
+ {
+ UWORD16 *pu2_dst = (UWORD16 *)&pu1_dst[(6 * nt)];
+ ihevc_memset_16bit((UWORD16 *)&pu1_dst[(6 * nt) + 2], pu2_dst[0], nt);
+
+
+ }
+ }
+ else if(nt == 8)
+ {
+ WORD32 nbr_flags_temp = 0;
+ nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+ + ((nbr_flags & 0x300) >> 4)
+ + ((nbr_flags & 0x3000) >> 6)
+ + ((nbr_flags & 0x10000) >> 8);
+
+            /* Compute trailing zeros based on nbr_flags for the substitution process of the below-left samples (section 8.4.4.2.2) */
+            /* In nbr_flags_temp each bit corresponds to 4 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 4; /* for bottom left and left */
+ if(nbr_id_from_bl == 32)
+ nbr_id_from_bl = 16;
+ if(nbr_id_from_bl == 16)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags_temp >> 8) & 0x1))
+ {
+ nbr_id_from_bl++;
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4; /* top and top right; 8 pels per nbr bit */
+
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+ pu1_ref_v = pu1_dst[(2 * nbr_id_from_bl) + 1];
+ for(i = 2 * (nbr_id_from_bl - 1); i >= 0; i -= 2)
+ {
+ pu1_dst[i] = pu1_ref_u;
+ pu1_dst[i + 1] = pu1_ref_v;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T8C_4NT)+1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 4 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T8C_4NT / 2))
+ {
+ get_bits = GET_BIT(nbr_flags_temp, 8);
+
+ /* only pel substitution for TL */
+ if(!get_bits)
+ {
+ pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+ pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+ }
+ }
+ else
+ {
+ get_bits = GET_BIT(nbr_flags_temp, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ UWORD16 *pu2_dst;
+                    /* 4 pel substitution (other than TL) */
+ pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+ ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT / 2)) ? 1 : 4;
+ }
+
+ }
+ else if(nt == 16)
+ {
+            /* Compute trailing zeros based on nbr_flags for the substitution process of the below-left samples (section 8.4.4.2.2) */
+            /* Each bit in nbr_flags corresponds to 4 pels for bot_left, left, top and top-right, but 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4; /* for bottom left and left */
+
+ if(nbr_id_from_bl == 32)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags >> 16) & 0x1))
+ {
+ /* top left not available */
+ nbr_id_from_bl++;
+ /* top and top right; 4 pels per nbr bit */
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref_u = pu1_dst[2 * nbr_id_from_bl];
+ pu1_ref_v = pu1_dst[2 * nbr_id_from_bl + 1];
+ for(i = (2 * (nbr_id_from_bl - 1)); i >= 0; i -= 2)
+ {
+ pu1_dst[i] = pu1_ref_u;
+ pu1_dst[i + 1] = pu1_ref_v;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T16C_4NT)+1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+                /* Divide by 4 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T16C_4NT / 2))
+ {
+ get_bits = GET_BIT(nbr_flags, 16);
+ /* only pel substitution for TL */
+ if(!get_bits)
+ {
+ pu1_dst[2 * nbr_id_from_bl] = pu1_dst[(2 * nbr_id_from_bl) - 2];
+ pu1_dst[(2 * nbr_id_from_bl) + 1] = pu1_dst[(2 * nbr_id_from_bl) - 1];
+ }
+ }
+ else
+ {
+ get_bits = GET_BIT(nbr_flags, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ UWORD16 *pu2_dst;
+ /* 4 pel substitution (other than TL) */
+ pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
+ ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT / 2)) ? 1 : 4;
+ }
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Planar Intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
+* to section 8.4.4.2.4 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_planar(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+ switch(nt)
+ {
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+ /* Planar filtering */
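+    /* The loop below implements the section 8.4.4.2.4 equation per        */
+    /* component, with x = col / 2 and y = row:                            */
+    /*   pred[x][y] = ((nt - 1 - x) * left[y] + (x + 1) * top_right        */
+    /*              + (nt - 1 - y) * top[x] + (y + 1) * bottom_left        */
+    /*              + nt) >> (log2nt + 1)                                  */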
+ for(row = 0; row < nt; row++)
+ {
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pu1_dst[row * dst_strd + col] = ((nt - 1 - col / 2)
+ * pu1_ref[2 * (two_nt - 1 - row)]
+ + (col / 2 + 1) * pu1_ref[2 * (three_nt + 1)]
+ + (nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + col]
+ + (row + 1) * pu1_ref[2 * (nt - 1)] + nt) >> (log2nt + 1);
+
+ pu1_dst[row * dst_strd + col + 1] = ((nt - 1 - col / 2)
+ * pu1_ref[2 * (two_nt - 1 - row) + 1]
+ + (col / 2 + 1) * pu1_ref[2 * (three_nt + 1) + 1]
+ + (nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + col + 1]
+ + (row + 1) * pu1_ref[2 * (nt - 1) + 1] + nt) >> (log2nt + 1);
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for DC mode with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
+* to section 8.4.4.2.5 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size (Chroma)
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_dc(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 acc_dc_u, acc_dc_v;
+ WORD32 dc_val_u, dc_val_v;
+ WORD32 i;
+ WORD32 row, col;
+ WORD32 log2nt = 5;
+ UNUSED(mode);
+ UNUSED(src_strd);
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+
+
+ acc_dc_u = 0;
+ acc_dc_v = 0;
+ /* Calculate DC value for the transform block */
+ for(i = (2 * nt); i < (4 * nt); i += 2)
+ {
+ acc_dc_u += pu1_ref[i];
+ acc_dc_v += pu1_ref[i + 1];
+ }
+ for(i = ((4 * nt) + 2); i < ((6 * nt) + 2); i += 2)
+ {
+ acc_dc_u += pu1_ref[i];
+ acc_dc_v += pu1_ref[i + 1];
+ }
+
+
+ dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
+ dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
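+    /* i.e. dc_val is the rounded average of the nt left and nt top        */
+    /* neighbour samples of each component: (acc + nt) / (2 * nt)          */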
+
+
+ /* Fill the remaining rows with DC value*/
+ for(row = 0; row < nt; row++)
+ {
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pu1_dst[(row * dst_strd) + col] = dc_val_u;
+ pu1_dst[(row * dst_strd) + col + 1] = dc_val_v;
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Horizontal intraprediction(mode 10) with reference samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
+* to section 8.4.4.2.6 in the standard (Special case)
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_horz(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ UNUSED(mode);
+ UNUSED(src_strd);
+    /* Replication to next columns*/
+ for(row = 0; row < nt; row++)
+ {
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pu1_dst[(row * dst_strd) + col] = pu1_ref[(4 * nt) - 2 - 2 * row];
+ pu1_dst[(row * dst_strd) + col + 1] = pu1_ref[(4 * nt) - 1 - 2 * row];
+ }
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Vertical intraprediction(mode 26) with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
+* to section 8.4.4.2.6 in the standard (Special case)
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_ver(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ UNUSED(mode);
+ UNUSED(src_strd);
+    /* Replication to next rows*/
+ for(row = 0; row < nt; row++)
+ {
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pu1_dst[(row * dst_strd) + col] = pu1_ref[(4 * nt) + 2 + col];
+ pu1_dst[(row * dst_strd) + col + 1] = pu1_ref[(4 * nt) + 3 + col];
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 2 (sw angle) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst' Refer to section 8.4.4.2.6 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode2(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+
+ WORD32 intra_pred_ang = 32;
+ WORD32 idx_u, idx_v;
+ UNUSED(src_strd);
+ UNUSED(mode);
+ /* For the angle 45, replication is done from the corresponding angle */
+ /* intra_pred_ang = tan(angle) in q5 format */
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ idx_u = ((col + 1) * intra_pred_ang) >> 5; /* Use idx++ */
+ idx_v = (((col + 1) + 1) * intra_pred_ang) >> 5; /* Use idx++ */
+ for(row = 0; row < nt; row++)
+ {
+ pu1_dst[col + (row * dst_strd)] = pu1_ref[(4 * nt) - 2 * row - idx_u - 3];
+ pu1_dst[(col + 1) + (row * dst_strd)] = pu1_ref[(4 * nt) - 2 * row - idx_v - 1];
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_18_34(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 intra_pred_ang;
+ WORD32 idx = 0;
+ UNUSED(src_strd);
+ intra_pred_ang = 32; /*Default value*/
+    /* For mode 18, the angle is -45 degrees */
+ if(mode == 18)
+ intra_pred_ang = -32;
+    /* For mode 34, the angle is 45 degrees */
+ else if(mode == 34)
+ intra_pred_ang = 32;
+    /* For the angles 45 and -45, replication is done from the corresponding angle */
+    /* No interpolation is done for 45 degrees */
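+    /* With intra_pred_ang = +/-32 in Q5 format, idx reduces to +/-(row + 1), */
+    /* so each row copies the reference advanced by one chroma pair along     */
+    /* the diagonal                                                           */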
+ for(row = 0; row < nt; row++)
+ {
+ idx = ((row + 1) * intra_pred_ang) >> 5;
+
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pu1_dst[col + (row * dst_strd)] = pu1_ref[(4 * nt) + col + 2 * idx + 2];
+ pu1_dst[(col + 1) + (row * dst_strd)] = pu1_ref[(4 * nt) + (col + 1) + 2 * idx + 2];
+ }
+
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_3_to_9(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+
+ WORD32 intra_pred_ang;
+ WORD32 idx_u, ref_main_idx_u;
+ WORD32 idx_v, ref_main_idx_v;
+ WORD32 pos_u, fract_u;
+ WORD32 pos_v, fract_v;
+ UNUSED(src_strd);
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain the destination sample */
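+    /* Worked example, assuming the standard HEVC angle table where mode 3     */
+    /* maps to intra_pred_ang = 26: for col = 0, pos = 26, so idx = 0 and      */
+    /* fract = 26, and each output is (6 * ref[i] + 26 * ref[i - 2] + 16) >> 5 */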
+
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pos_u = ((col / 2 + 1) * intra_pred_ang);
+ pos_v = ((col / 2 + 1) * intra_pred_ang);
+
+ idx_u = pos_u >> 5;
+ fract_u = pos_u & (31);
+
+ idx_v = pos_v >> 5;
+ fract_v = pos_v & (31);
+ // Do linear filtering
+ for(row = 0; row < nt; row++)
+ {
+ ref_main_idx_u = (4 * nt) - 2 * row - 2 * idx_u - 2;
+ ref_main_idx_v = (4 * nt) - 2 * row - 2 * idx_v - 1;
+
+ pu1_dst[col + (row * dst_strd)] = (((32 - fract_u)
+ * pu1_ref[ref_main_idx_u]
+ + fract_u * pu1_ref[ref_main_idx_u - 2] + 16) >> 5);
+
+ pu1_dst[(col + 1) + (row * dst_strd)] = (((32 - fract_v)
+ * pu1_ref[ref_main_idx_v]
+ + fract_v * pu1_ref[ref_main_idx_v - 2] + 16) >> 5);
+ }
+
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
+* with reference neighboring samples location pointed by 'pu1_ref' to the
+* TU block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_11_to_17(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+    /* This function and ihevc_intra_pred_chroma_mode_19_to_25 are the same except */
+    /* for the ref main & side samples assignment; they can be combined for */
+    /* optimization */
+
+ WORD32 row, col, k;
+ WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+ WORD32 idx_u, idx_v, ref_main_idx_u, ref_main_idx_v, ref_idx;
+ WORD32 pos_u, pos_v, fract_u, fract_v;
+
+ UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
+ UWORD8 *ref_main;
+ UNUSED(src_strd);
+ inv_ang_sum = 128;
+
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+ /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+
+ /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+
+ ref_main = ref_temp + 2 * nt;
+ for(k = 0; k < (2 * (nt + 1)); k += 2)
+ {
+ ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
+ ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
+ }
+
+ ref_main = ref_temp + (2 * (nt - 1));
+ ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+ for(k = -2; k > (2 * ref_idx); k -= 2)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
+ ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
+ }
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain the destination sample */
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ pos_u = ((col / 2 + 1) * intra_pred_ang);
+ pos_v = ((col / 2 + 1) * intra_pred_ang);
+ idx_u = pos_u >> 5;
+ idx_v = pos_v >> 5;
+ fract_u = pos_u & (31);
+ fract_v = pos_v & (31);
+
+ // Do linear filtering
+ for(row = 0; row < nt; row++)
+ {
+ ref_main_idx_u = 2 * (row + idx_u + 1);
+ ref_main_idx_v = 2 * (row + idx_v + 1) + 1;
+
+ pu1_dst[col + (dst_strd * row)] = (UWORD8)(((32 - fract_u)
+ * ref_main[ref_main_idx_u]
+ + fract_u * ref_main[ref_main_idx_u + 2] + 16) >> 5);
+ pu1_dst[(col + 1) + (dst_strd * row)] = (UWORD8)(((32 - fract_v)
+ * ref_main[ref_main_idx_v]
+ + fract_v * ref_main[ref_main_idx_v + 2] + 16) >> 5);
+
+ }
+
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_19_to_25(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col, k;
+ WORD32 intra_pred_ang, idx;
+ WORD32 inv_ang, inv_ang_sum, pos, fract;
+ WORD32 ref_main_idx_u, ref_main_idx_v, ref_idx;
+ UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
+ UWORD8 *ref_main;
+ UNUSED(src_strd);
+
+
+ intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+ inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
+
+ /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_main = ref_temp + 2 * nt;
+ for(k = 0; k < (2 * (nt + 1)); k += 2)
+ {
+ ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
+ ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
+ }
+
+
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ inv_ang_sum = 128;
+ ref_main = ref_temp + (2 * (nt - 1));
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+ for(k = -2; k > (2 * ref_idx); k -= 2)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
+ ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
+ }
+
+ for(row = 0; row < nt; row++)
+ {
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+
+ // Do linear filtering
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ ref_main_idx_u = col + 2 * idx + 2;
+ ref_main_idx_v = (col + 1) + 2 * idx + 2;
+ pu1_dst[(row * dst_strd) + col] = (UWORD8)(((32 - fract)
+ * ref_main[ref_main_idx_u]
+ + fract * ref_main[ref_main_idx_u + 2] + 16) >> 5);
+ pu1_dst[(row * dst_strd) + (col + 1)] = (UWORD8)(((32 - fract)
+ * ref_main[ref_main_idx_v]
+ + fract * ref_main[ref_main_idx_v + 2] + 16) >> 5);
+
+ }
+
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_27_to_33(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 pos, fract;
+ WORD32 intra_pred_ang;
+ WORD32 idx, ref_main_idx_u, ref_main_idx_v;
+ UNUSED(src_strd);
+
+
+ intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+
+ for(row = 0; row < nt; row++)
+ {
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+
+
+ // Do linear filtering
+ for(col = 0; col < (2 * nt); col += 2)
+ {
+ ref_main_idx_u = (4 * nt) + col + 2 * idx + 2;
+ ref_main_idx_v = (4 * nt) + (col + 1) + 2 * idx + 2;
+ pu1_dst[col + (row * dst_strd)] = (((32 - fract)
+ * pu1_ref[ref_main_idx_u]
+ + fract * pu1_ref[ref_main_idx_u + 2] + 16) >> 5);
+ pu1_dst[(col + 1) + (row * dst_strd)] = (((32 - fract)
+ * pu1_ref[ref_main_idx_v]
+ + fract * pu1_ref[ref_main_idx_v + 2] + 16) >> 5);
+
+ }
+ }
+
+}
+
diff --git a/common/ihevc_chroma_iquant_itrans_recon.c b/common/ihevc_chroma_iquant_itrans_recon.c
new file mode 100644
index 0000000..479aebd
--- /dev/null
+++ b/common/ihevc_chroma_iquant_itrans_recon.c
@@ -0,0 +1,256 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_chroma_iquant_itrans_recon.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction of chroma interleaved data.
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_chroma_iquant_itrans_recon_4x4()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions operate on the V component */
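+/* A minimal usage sketch (hypothetical buffers, qp and strides): to reconstruct */
+/* the V component of an interleaved block, pass the prediction and output       */
+/* pointers offset by one byte; the interleaved stepping inside the function     */
+/* keeps U and V independent.                                                    */
+/*                                                                               */
+/*   ihevc_chroma_iquant_itrans_recon_4x4(pi2_coeffs, pi2_tmp, pu1_pred + 1,     */
+/*                                        pi2_dequant_coeff, pu1_dst + 1,        */
+/*                                        qp / 6, qp % 6, 4, pred_strd,          */
+/*                                        dst_strd, zero_cols, zero_rows);       */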
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform and
+ * reconstruction for 4x4 input block
+ *
+ * @par Description:
+ * Performs inverse quantization , inverse transform and adds the
+ * prediction data and clips output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing inverse transform
+ * 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ * Zero Rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ UNUSED(zero_rows);
+
+ /* Inverse Transform */
+ {
+ WORD32 j;
+ WORD32 e[2], o[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 2;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
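+        /* shift_iq corresponds to the spec's bdShift = BitDepth + log2(nT) - 5.   */
+        /* IQUANT_4x4 (presumably defined in ihevc_trans_macros.h, included above) */
+        /* is assumed to implement the standard scaling, roughly:                  */
+        /*   out = ((coeff * scale << qp_div) + (1 << (shift_iq - 1))) >> shift_iq */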
+
+ trans_size = TRANS_SIZE_4;
+ pi2_tmp_orig = pi2_tmp;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
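+        /* IT_SHIFT_STAGE_1 and IT_SHIFT_STAGE_2 are the two rounding shifts of */
+        /* the inverse transform; per the spec these are 7 and 20 - BitDepth    */
+        /* (assumed values; the defines are not shown in this diff)             */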
+
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ WORD32 iq_tmp_1, iq_tmp_2;
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[1 * src_strd],
+ pi2_dequant_coeff[1 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[3 * src_strd],
+ pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+
+ o[0] = g_ai2_ihevc_trans_4[1][0] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[3][0] * iq_tmp_2;
+ o[1] = g_ai2_ihevc_trans_4[1][1] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[3][1] * iq_tmp_2;
+
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[0 * src_strd],
+ pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[2 * src_strd],
+ pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+
+ e[0] = g_ai2_ihevc_trans_4[0][0] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[2][0] * iq_tmp_2;
+ e[1] = g_ai2_ihevc_trans_4[0][1] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[2][1] * iq_tmp_2;
+
+ pi2_tmp[0] =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pi2_tmp[1] =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pi2_tmp[2] =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pi2_tmp[3] =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 itrans_out;
+
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][0]
+ * pi2_tmp[3 * trans_size];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][1]
+ * pi2_tmp[3 * trans_size];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][0]
+ * pi2_tmp[2 * trans_size];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][1]
+ * pi2_tmp[2 * trans_size];
+
+ itrans_out =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pu1_dst[0 * 2] = CLIP_U8((itrans_out + pu1_pred[0 * 2]));
+
+ itrans_out =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pu1_dst[1 * 2] = CLIP_U8((itrans_out + pu1_pred[1 * 2]));
+
+ itrans_out =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pu1_dst[2 * 2] = CLIP_U8((itrans_out + pu1_pred[2 * 2]));
+
+ itrans_out =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+ pu1_dst[3 * 2] = CLIP_U8((itrans_out + pu1_pred[3 * 2]));
+
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+
+ }
+ }
+}
diff --git a/common/ihevc_chroma_iquant_itrans_recon.h b/common/ihevc_chroma_iquant_itrans_recon.h
new file mode 100644
index 0000000..1cacfc5
--- /dev/null
+++ b/common/ihevc_chroma_iquant_itrans_recon.h
@@ -0,0 +1,135 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_iquant_itrans_recon.h
+*
+* @brief
+* Functions declarations for inverse quantization, inverse transform and
+* reconstruction of chroma interleaved data.
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_IQUANT_ITRANS_RECON_H_
+#define _IHEVC_CHROMA_IQUANT_ITRANS_RECON_H_
+
+typedef void ihevc_chroma_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_chroma_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+typedef void ihevc_chroma_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_chroma_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+typedef void ihevc_chroma_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_chroma_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+ihevc_chroma_iquant_itrans_recon_4x4_ft ihevc_chroma_iquant_itrans_recon_4x4;
+ihevc_hbd_chroma_iquant_itrans_recon_4x4_ft ihevc_hbd_chroma_iquant_itrans_recon_4x4;
+ihevc_chroma_iquant_itrans_recon_8x8_ft ihevc_chroma_iquant_itrans_recon_8x8;
+ihevc_hbd_chroma_iquant_itrans_recon_8x8_ft ihevc_hbd_chroma_iquant_itrans_recon_8x8;
+ihevc_chroma_iquant_itrans_recon_16x16_ft ihevc_chroma_iquant_itrans_recon_16x16;
+ihevc_hbd_chroma_iquant_itrans_recon_16x16_ft ihevc_hbd_chroma_iquant_itrans_recon_16x16;
+
+ihevc_chroma_iquant_itrans_recon_4x4_ft ihevc_chroma_iquant_itrans_recon_4x4_sse42;
+ihevc_hbd_chroma_iquant_itrans_recon_4x4_ft ihevc_hbd_chroma_iquant_itrans_recon_4x4_sse42;
+ihevc_chroma_iquant_itrans_recon_8x8_ft ihevc_chroma_iquant_itrans_recon_8x8_sse42;
+ihevc_hbd_chroma_iquant_itrans_recon_8x8_ft ihevc_hbd_chroma_iquant_itrans_recon_8x8_sse42;
+ihevc_chroma_iquant_itrans_recon_16x16_ft ihevc_chroma_iquant_itrans_recon_16x16_sse42;
+ihevc_hbd_chroma_iquant_itrans_recon_16x16_ft ihevc_hbd_chroma_iquant_itrans_recon_16x16_sse42;
+
+#endif /*_IHEVC_CHROMA_IQUANT_ITRANS_RECON_H_*/
diff --git a/common/ihevc_chroma_iquant_recon.c b/common/ihevc_chroma_iquant_recon.c
new file mode 100644
index 0000000..cba9eb1
--- /dev/null
+++ b/common/ihevc_chroma_iquant_recon.c
@@ -0,0 +1,398 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_chroma_iquant_recon.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization and
+ * reconstruction of chroma interleaved data.
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_chroma_iquant_recon_4x4()
+ * - ihevc_chroma_iquant_recon_8x8()
+ * - ihevc_chroma_iquant_recon_16x16()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_iquant_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions operate on the V component */
+
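+/* zero_cols is a per-column bitmask: bit i set means column i of pi2_src is */
+/* entirely zero, so the loops below skip dequantization for that column and */
+/* copy the prediction straight to the output (e.g. zero_cols = 0xE would    */
+/* leave only column 0 of a 4x4 block to be dequantized)                     */
+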
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 4x4
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_recon_4x4(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 2;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_4;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT_4x4(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred += 2;
+ pu1_dst += 2;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 8x8
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 8x8
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_recon_8x8(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 3;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_8;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred += 2;
+ pu1_dst += 2;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 16x16
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_iquant_recon_16x16(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 4;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_16;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred += 2;
+ pu1_dst += 2;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
+
diff --git a/common/ihevc_chroma_iquant_recon.h b/common/ihevc_chroma_iquant_recon.h
new file mode 100644
index 0000000..8f6a043
--- /dev/null
+++ b/common/ihevc_chroma_iquant_recon.h
@@ -0,0 +1,111 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_iquant_recon.h
+*
+* @brief
+* Function declarations for inverse quantization and reconstruction of
+* chroma interleaved data.
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_IQUANT_RECON_H_
+#define _IHEVC_CHROMA_IQUANT_RECON_H_
+
+typedef void ihevc_chroma_iquant_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_iquant_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_chroma_iquant_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_iquant_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_chroma_iquant_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_iquant_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+
+ihevc_chroma_iquant_recon_4x4_ft ihevc_chroma_iquant_recon_4x4;
+ihevc_hbd_chroma_iquant_recon_4x4_ft ihevc_hbd_chroma_iquant_recon_4x4;
+ihevc_chroma_iquant_recon_8x8_ft ihevc_chroma_iquant_recon_8x8;
+ihevc_hbd_chroma_iquant_recon_8x8_ft ihevc_hbd_chroma_iquant_recon_8x8;
+ihevc_chroma_iquant_recon_16x16_ft ihevc_chroma_iquant_recon_16x16;
+ihevc_hbd_chroma_iquant_recon_16x16_ft ihevc_hbd_chroma_iquant_recon_16x16;
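+
+/* The *_ft typedefs above fix a common signature so that the generic C     */
+/* versions declared here and any platform-specific variants can be         */
+/* dispatched through function pointers (see ihevc_func_selector.h).        */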
+
+#endif /*_IHEVC_CHROMA_IQUANT_RECON_H_*/
diff --git a/common/ihevc_chroma_itrans_recon.c b/common/ihevc_chroma_itrans_recon.c
new file mode 100644
index 0000000..bbbc476
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon.c
@@ -0,0 +1,205 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_chroma_itrans_recon.c
+ *
+ * @brief
+ * Contains function definitions for inverse transform and reconstruction
+ * of chroma interleaved data.
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_chroma_itrans_recon_4x4()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data depending upon the pointers passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */
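+
+/* For example, to reconstruct the V component of a 4x4 chroma TU one would */
+/* pass pointers offset by one byte into the interleaved prediction and     */
+/* destination buffers (a sketch; argument names are illustrative):         */
+/*                                                                          */
+/*     ihevc_chroma_itrans_recon_4x4(pi2_src_v, pi2_tmp, pu1_pred + 1,      */
+/*                                   pu1_dst + 1, src_strd, pred_strd,      */
+/*                                   dst_strd, zero_cols, zero_rows);       */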
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Inverse transform and reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ * Performs inverse transform and adds the prediction data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing inverse transform
+ * 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this variant)
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_itrans_recon_4x4(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 j;
+ WORD32 e[2], o[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ UNUSED(zero_rows);
+ trans_size = TRANS_SIZE_4;
+
+ pi2_tmp_orig = pi2_tmp;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
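+    /* Stage 1 consumes one column of pi2_src per iteration and writes the  */
+    /* result as a row of pi2_tmp, i.e. the intermediate block is stored     */
+    /* transposed; zero input columns thus become zero rows for stage 2.     */
+    /* IT_SHIFT_STAGE_1 and IT_SHIFT_STAGE_2 are the HEVC inverse-transform  */
+    /* stage shifts (7 and 12 respectively for 8-bit data).                  */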
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
+
+ pi2_tmp[0] =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pi2_tmp[1] =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pi2_tmp[2] =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pi2_tmp[3] =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 itrans_out;
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][0] * pi2_tmp[3 * trans_size];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][1] * pi2_tmp[3 * trans_size];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][0] * pi2_tmp[2 * trans_size];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][1] * pi2_tmp[2 * trans_size];
+
+ itrans_out =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pu1_dst[0 * 2] = CLIP_U8((itrans_out + pu1_pred[0 * 2]));
+ itrans_out =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pu1_dst[1 * 2] = CLIP_U8((itrans_out + pu1_pred[1 * 2]));
+ itrans_out =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pu1_dst[2 * 2] = CLIP_U8((itrans_out + pu1_pred[2 * 2]));
+ itrans_out =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+ pu1_dst[3 * 2] = CLIP_U8((itrans_out + pu1_pred[3 * 2]));
+
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+
+ }
+}
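+
+/* The two loops above implement the standard partial-butterfly inverse     */
+/* DCT: for a 4-point transform with coefficient rows c[k], the output is   */
+/* split into even and odd parts, e[n] = c[0][n]*x0 + c[2][n]*x2 and        */
+/* o[n] = c[1][n]*x1 + c[3][n]*x3, so that out[n] = e[n] + o[n] and         */
+/* out[3 - n] = e[n] - o[n], halving the multiply count.                    */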
+
diff --git a/common/ihevc_chroma_itrans_recon.h b/common/ihevc_chroma_itrans_recon.h
new file mode 100644
index 0000000..c20cebf
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_itrans_recon.h
+*
+* @brief
+* Function declarations for inverse transform and reconstruction of
+* chroma interleaved data.
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_ITRANS_RECON_H_
+#define _IHEVC_CHROMA_ITRANS_RECON_H_
+
+typedef void ihevc_chroma_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_chroma_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+typedef void ihevc_chroma_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_chroma_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+typedef void ihevc_chroma_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_chroma_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+ihevc_chroma_itrans_recon_4x4_ft ihevc_chroma_itrans_recon_4x4;
+ihevc_hbd_chroma_itrans_recon_4x4_ft ihevc_hbd_chroma_itrans_recon_4x4;
+ihevc_chroma_itrans_recon_8x8_ft ihevc_chroma_itrans_recon_8x8;
+ihevc_hbd_chroma_itrans_recon_8x8_ft ihevc_hbd_chroma_itrans_recon_8x8;
+ihevc_chroma_itrans_recon_16x16_ft ihevc_chroma_itrans_recon_16x16;
+ihevc_hbd_chroma_itrans_recon_16x16_ft ihevc_hbd_chroma_itrans_recon_16x16;
+
+ihevc_hbd_chroma_itrans_recon_4x4_ft ihevc_hbd_chroma_itrans_recon_4x4_sse42;
+ihevc_hbd_chroma_itrans_recon_8x8_ft ihevc_hbd_chroma_itrans_recon_8x8_sse42;
+ihevc_hbd_chroma_itrans_recon_16x16_ft ihevc_hbd_chroma_itrans_recon_16x16_sse42;
+
+#endif /*_IHEVC_CHROMA_ITRANS_RECON_H_*/
diff --git a/common/ihevc_chroma_itrans_recon_16x16.c b/common/ihevc_chroma_itrans_recon_16x16.c
new file mode 100644
index 0000000..35874fe
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon_16x16.c
@@ -0,0 +1,895 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_chroma_itrans_recon_16x16.c
+ *
+ * @brief
+ * Contains function definitions for 16x16 inverse transform and reconstruction
+ * of chroma interleaved data.
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_chroma_itrans_recon_16x16()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data depending upon the pointers passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Inverse transform and reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ * Performs inverse transform and adds the prediction data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 16x16 buffer for storing inverse transform
+ * 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_itrans_recon_16x16(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 j, k;
+ WORD32 e[8], o[8];
+ WORD32 ee[4], eo[4];
+ WORD32 eee[2], eeo[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols;
+
+ trans_size = TRANS_SIZE_16;
+ pi2_tmp_orig = pi2_tmp;
+
+ if((zero_cols & 0xFFF0) == 0xFFF0)
+ row_limit_2nd_stage = 4;
+ else if((zero_cols & 0xFF00) == 0xFF00)
+ row_limit_2nd_stage = 8;
+ else
+ row_limit_2nd_stage = TRANS_SIZE_16;
+
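+    /* zero_cols doubles as the zero-row mask for the 2nd stage because the */
+    /* 1st stage stores its output transposed in pi2_tmp. row_limit_2nd_stage*/
+    /* bounds the 1st-stage loop: when only the first 4 (or 8) input columns */
+    /* can be non-zero, the 2nd stage reads at most the first 4 (or 8) rows  */
+    /* of pi2_tmp, so the remaining rows need never be computed.             */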
+ if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_tmp[15 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_tmp[14 * trans_size];
+ }
+                    eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+                    eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                    + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                    eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+                    eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                    + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+ }
+ else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_src[7 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_src[6 * src_strd];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_tmp[15 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_tmp[14 * trans_size];
+ }
+                    eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+                    eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                    + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                    eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+                    eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                    + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+ }
+ else /* All rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_src[7 * src_strd]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_src[9 * src_strd]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_src[11 * src_strd]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_src[13 * src_strd]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_src[15 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_src[6 * src_strd]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_src[10 * src_strd]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_src[14 * src_strd];
+ }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][0] * pi2_src[12 * src_strd];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_src[8 * src_strd];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][1] * pi2_src[12 * src_strd];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_src[8 * src_strd];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_tmp[15 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_tmp[14 * trans_size];
+ }
+                    eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+                    eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+                                    + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                    eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                                    + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+                    eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+                                    + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+ }
+}
+
diff --git a/common/ihevc_chroma_itrans_recon_8x8.c b/common/ihevc_chroma_itrans_recon_8x8.c
new file mode 100644
index 0000000..f086387
--- /dev/null
+++ b/common/ihevc_chroma_itrans_recon_8x8.c
@@ -0,0 +1,285 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_chroma_itrans_recon_8x8.c
+ *
+ * @brief
+ * Contains function definitions for 8x8 inverse transform and reconstruction
+ * of chroma interleaved data.
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_chroma_itrans_recon_8x8()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data depending upon the pointers passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Inverse transform and reconstruction for 8x8
+ * input block
+ *
+ * @par Description:
+ * Performs inverse transform and adds the prediction data and clips output
+ * to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 8x8 buffer for storing inverse transform
+ * 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this variant)
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_itrans_recon_8x8(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 j, k;
+ WORD32 e[4], o[4];
+ WORD32 ee[2], eo[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ WORD32 zero_rows_2nd_stage = zero_cols;
+ WORD32 row_limit_2nd_stage;
+ UNUSED(zero_rows);
+ trans_size = TRANS_SIZE_8;
+
+ pi2_tmp_orig = pi2_tmp;
+
+ if((zero_cols & 0xF0) == 0xF0)
+ row_limit_2nd_stage = 4;
+ else
+ row_limit_2nd_stage = TRANS_SIZE_8;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_8x8******************************************/
+ /************************************************************************************************/
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_8[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_8[7][k]
+ * pi2_src[7 * src_strd];
+ }
+
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 4] =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ pu1_dst[(k + 4) * 2] =
+ CLIP_U8((itrans_out + pu1_pred[(k + 4) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_8[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_8[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
+ itrans_out =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ pu1_dst[(k + 4) * 2] =
+ CLIP_U8((itrans_out + pu1_pred[(k + 4) * 2]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_8x8******************************************/
+ /************************************************************************************************/
+ }
+}
diff --git a/common/ihevc_chroma_recon.c b/common/ihevc_chroma_recon.c
new file mode 100644
index 0000000..4a1e9ee
--- /dev/null
+++ b/common/ihevc_chroma_recon.c
@@ -0,0 +1,308 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_chroma_recon.c
+ *
+ * @brief
+ *  Function definitions for reconstruction of chroma interleaved data.
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_chroma_recon_4x4()
+ * - ihevc_chroma_recon_8x8()
+ * - ihevc_chroma_recon_16x16()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_chroma_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions work on one component (U or V) of interleaved data depending upon the pointers passed to them */
+/* Data visualization */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* U V U V U V U V */
+/* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */
+/* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 4x4 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of 4x4 input block by adding prediction
+ * data to input and clipping it to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_4x4(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_4;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst += 2;
+ pu1_pred += 2;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 8x8 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of 8x8 input block by adding prediction
+ * data to input and clipping it to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_8x8(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_8;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst += 2;
+ pu1_pred += 2;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 16x16 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of 16x16 input block by adding prediction
+ * data to input and clipping it to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_16x16(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_16;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst += 2;
+ pu1_pred += 2;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
diff --git a/common/ihevc_chroma_recon.h b/common/ihevc_chroma_recon.h
new file mode 100644
index 0000000..b4ece06
--- /dev/null
+++ b/common/ihevc_chroma_recon.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_recon.h
+*
+* @brief
+* Function declarations for reconstruction of chroma interleaved data.
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* - ihevc_chroma_recon_4x4()
+* - ihevc_chroma_recon_8x8()
+* - ihevc_chroma_recon_16x16()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_CHROMA_RECON_H_
+#define _IHEVC_CHROMA_RECON_H_
+
+typedef void ihevc_chroma_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_chroma_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_chroma_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_chroma_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+
+ihevc_chroma_recon_4x4_ft ihevc_chroma_recon_4x4;
+ihevc_hbd_chroma_recon_4x4_ft ihevc_hbd_chroma_recon_4x4;
+ihevc_chroma_recon_8x8_ft ihevc_chroma_recon_8x8;
+ihevc_hbd_chroma_recon_8x8_ft ihevc_hbd_chroma_recon_8x8;
+ihevc_chroma_recon_16x16_ft ihevc_chroma_recon_16x16;
+ihevc_hbd_chroma_recon_16x16_ft ihevc_hbd_chroma_recon_16x16;
+
+#endif /*_IHEVC_CHROMA_RECON_H_*/
diff --git a/common/ihevc_common_tables.c b/common/ihevc_common_tables.c
new file mode 100644
index 0000000..7927497
--- /dev/null
+++ b/common/ihevc_common_tables.c
@@ -0,0 +1,549 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_common_tables.c
+*
+* @brief
+* Contains common global tables
+*
+* @author
+* Harish M
+*
+* @par List of Tables:
+* gai4_ihevc_max_luma_pic_size
+* gai4_ihevc_max_wd_ht
+* gai4_ihevc_min_wd_ht
+* gai4_ihevc_ang_table
+* col_for_intra_luma
+* col_for_intra_chroma
+* idx_neg_vals_3_9
+* idx_neg_idx_3_9
+* idx_neg_idx_chroma_3_9
+* idx_neg_idx_11_17
+* idx_neg_idx_chroma_11_17
+* gai4_ihevc_inv_ang_table
+* gau1_ihevc_invscan8x8
+* gau1_ihevc_invscan4x4
+* gau1_ihevc_invscan2x2
+* gau1_ihevc_scan8x8
+* gau1_ihevc_scan4x4
+* gau1_ihevc_scan2x2
+* *gapv_ihevc_scan
+* *gapv_ihevc_invscan
+* gau1_ihevc_chroma_qp_scale
+* gai1_ihevc_chroma_qp_scale
+* gau1_ihevc_planar_factor
+* gau1_ihevc_planar_factor_1
+* gai4_ihevc_ang_table_chroma
+* gai4_ihevc_inv_ang_table_chroma
+* gau1_ihevc_planar_factor_chroma
+* gau1_intra_pred_ref_filter
+* gi1_table_edge_idx
+* gu1_table_band_idx
+* gu2_table_band_idx
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+
+/*****************************************************************************/
+/* Level specific tables */
+/*****************************************************************************/
+
+/**
+ * Array giving the maximum number of luma samples in a picture for a given level
+ */
+const WORD32 gai4_ihevc_max_luma_pic_size[] =
+{
+ /* Level 1 */
+ 36864,
+ /* Level 2 */
+ 122880,
+ /* Level 2.1 */
+ 245760,
+ /* Level 3 */
+ 552960,
+ /* Level 3.1 */
+ 983040,
+ /* Level 4 */
+ 2228224,
+ /* Level 4.1 */
+ 2228224,
+ /* Level 5 */
+ 8912896,
+ /* Level 5.1 */
+ 8912896,
+ /* Level 5.2 */
+ 8912896,
+ /* Level 6 */
+ 33423360,
+ /* Level 6.1 */
+ 33423360,
+ /* Level 6.2 */
+ 33423360
+};
+/** Max width and height allowed for a given level */
+/** This is derived as SQRT(8 * gai4_ihevc_max_luma_pic_size[]) */
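+/** e.g. Level 1: SQRT(8 * 36864) = 543.06, truncated to 543 */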
+const WORD32 gai4_ihevc_max_wd_ht[] =
+{
+ /* Level 1 */
+ 543,
+ /* Level 2 */
+ 991,
+ /* Level 2.1 */
+ 1402,
+ /* Level 3 */
+ 2103,
+ /* Level 3.1 */
+ 2804,
+ /* Level 4 */
+ 4222,
+ /* Level 4.1 */
+ 4222,
+ /* Level 5 */
+ 8444,
+ /* Level 5.1 */
+ 8444,
+ /* Level 5.2 */
+ 8444,
+ /* Level 6 */
+ 16888,
+ /* Level 6.1 */
+ 16888,
+ /* Level 6.2 */
+ 16888
+};
+
+/** Min width and height allowed for a given level */
+/** This is derived as gai4_ihevc_max_luma_pic_size[]/gai4_ihevc_max_wd_ht[] */
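+/** e.g. Level 1: 36864 / 543 = 67.89, truncated to 67 */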
+const WORD32 gai4_ihevc_min_wd_ht[] =
+{
+ /* Level 1 */
+ 67,
+ /* Level 2 */
+ 123,
+ /* Level 2.1 */
+ 175,
+ /* Level 3 */
+ 262,
+ /* Level 3.1 */
+ 350,
+ /* Level 4 */
+ 527,
+ /* Level 4.1 */
+ 527,
+ /* Level 5 */
+ 1055,
+ /* Level 5.1 */
+ 1055,
+ /* Level 5.2 */
+ 1055,
+ /* Level 6 */
+ 2111,
+ /* Level 6.1 */
+ 2111,
+ /* Level 6.2 */
+ 2111
+};
+/*****************************************************************************/
+/* Intra prediction tables */
+/*****************************************************************************/
+/**
+ * Intra pred angles
+ */
+/* g_ang_table = tan(actual angle) in Q5 format for all 33 modes */
+const WORD32 gai4_ihevc_ang_table[35] =
+ { 0, 0, 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+
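+/* For orientation (editor note): 32 in Q5 corresponds to tan(45 degrees) = 1.0,
+   so the pure diagonal modes 2, 18 and 34 carry +/-32 in gai4_ihevc_ang_table */
+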
+const WORD8 col_for_intra_luma[32] =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 32 };
+
+const WORD8 col_for_intra_chroma[32] =
+ { 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16 };
+
+const WORD8 idx_neg_vals_3_9[7] =
+ { 26, 21, 17, 13, 9, 5, 2 };
+
+const WORD32 idx_neg_idx_3_9[28] =
+ { 6, 13, 19, 26, 5, 10, 15, 21, 4, 8, 12, 17, 3, 6, 9, 13, 2, 4, 6, 9,
+ 1, 2, 3, 5, 0, 0, 1, 2 };
+
+
+const WORD32 idx_neg_idx_chroma_3_9[28] =
+ { 3, 6, 9, 13,
+ 2, 5, 7, 10,
+ 2, 4, 6, 8,
+ 1, 3, 4, 6,
+ 1, 2, 3, 4,
+ 0, 1, 1, 2,
+ 0, 0, 0, 1 };
+const WORD32 idx_neg_idx_11_17[28] =
+ { -1, -1, -2, -2, -2, -3, -4, -5, -3, -5, -7, -9, -4, -7, -10, -13, -5, -9, -13, -17, -6, -11,
+ -16, -21, -7, -13, -20, -26 };
+
+const WORD32 idx_neg_idx_chroma_11_17[28] =
+ { -1, -1, -1, -1,
+ -1, -2, -2, -3,
+ -2, -3, -4, -5,
+ -2, -4, -5, -7,
+ -3, -5, -7, -9,
+ -3, -6, -8, -11,
+ -4, -7, -10, -13 };
+
+/**
+ * Intra pred inverse angles
+ */
+/* g_invAngTable = Inverse angle in Q5 format, required for negative angles */
+const WORD32 gai4_ihevc_inv_ang_table[14] =
+ { 4096, 1638, 910, 630, 482, 390, 315, 315, 390, 482, 630, 910, 1638, 4096 };
+
+/*****************************************************************************/
+/* Scan matrices */
+/*****************************************************************************/
+/**
+ * Inverse Scan matrix for 8x8 Section 6.5.3
+ */
+const UWORD8 gau1_ihevc_invscan8x8[][64] =
+{
+ /* Upright diagonal */
+ {
+ 0, 8, 1, 16, 9, 2, 24, 17,
+ 10, 3, 32, 25, 18, 11, 4, 40,
+ 33, 26, 19, 12, 5, 48, 41, 34,
+ 27, 20, 13, 6, 56, 49, 42, 35,
+ 28, 21, 14, 7, 57, 50, 43, 36,
+ 29, 22, 15, 58, 51, 44, 37, 30,
+ 23, 59, 52, 45, 38, 31, 60, 53,
+ 46, 39, 61, 54, 47, 62, 55, 63
+ },
+ /* Horizontal */
+ {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ },
+ /* Vertical */
+ {
+ 0, 8, 16, 24, 32, 40, 48, 56,
+ 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58,
+ 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60,
+ 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62,
+ 7, 15, 23, 31, 39, 47, 55, 63
+ }
+};
+
+/**
+ * Inverse Scan matrix for 4x4 Section 6.5.3
+ */
+const UWORD8 gau1_ihevc_invscan4x4[][16] =
+{
+ /* Upright diagonal */
+ {
+ 0, 4, 1, 8,
+ 5, 2, 12, 9,
+ 6, 3, 13, 10,
+ 7, 14, 11, 15
+ },
+ /* Horizontal */
+ {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15
+ },
+ /* Vertical */
+ {
+ 0, 4, 8, 12,
+ 1, 5, 9, 13,
+ 2, 6, 10, 14,
+ 3, 7, 11, 15
+ }
+};
+
+/**
+ * Inverse Scan matrix for 2x2 Section 6.5.3
+ */
+const UWORD8 gau1_ihevc_invscan2x2[][4] =
+{
+ /* Upright diagonal */
+ {
+ 0, 2,
+ 1, 3
+ },
+ /* Horizontal */
+ {
+ 0, 1,
+ 2, 3
+ },
+ /* Vertical */
+ {
+ 0, 2,
+ 1, 3,
+ }
+};
+
+/**
+ * Scan matrix for 8x8 Section 6.5.3
+ */
+
+const UWORD8 gau1_ihevc_scan8x8[][64] =
+{
+ /* Upright diagonal */
+ {
+ 0, 2, 5, 9, 14, 20, 27, 35,
+ 1, 4, 8, 13, 19, 26, 34, 42,
+ 3, 7, 12, 18, 25, 33, 41, 48,
+ 6, 11, 17, 24, 32, 40, 47, 53,
+ 10, 16, 23, 31, 39, 46, 52, 57,
+ 15, 22, 30, 38, 45, 51, 56, 60,
+ 21, 29, 37, 44, 50, 55, 59, 62,
+ 28, 36, 43, 49, 54, 58, 61, 63
+ },
+ /* Horizontal */
+ {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63
+ },
+ /* Vertical */
+ {
+ 0, 8, 16, 24, 32, 40, 48, 56,
+ 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58,
+ 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60,
+ 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62,
+ 7, 15, 23, 31, 39, 47, 55, 63
+ }
+};
+
+/**
+ * Scan matrix for 4x4 Section 6.5.3
+ */
+const UWORD8 gau1_ihevc_scan4x4[][16] =
+{
+ /* Upright diagonal */
+ {
+ 0, 2, 5, 9,
+ 1, 4, 8, 12,
+ 3, 7, 11, 14,
+ 6, 10, 13, 15
+ },
+ /* Horizontal */
+ {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15
+ },
+ /* Vertical */
+ {
+ 0, 4, 8, 12,
+ 1, 5, 9, 13,
+ 2, 6, 10, 14,
+ 3, 7, 11, 15
+ }
+};
+
+/**
+ * Scan matrix for 2x2 Section 6.5.3
+ */
+const UWORD8 gau1_ihevc_scan2x2[][4] =
+{
+ /* Upright diagonal */
+ {
+ 0, 2,
+ 1, 3
+ },
+ /* Horizontal */
+ {
+ 0, 1,
+ 2, 3
+ },
+ /* Vertical */
+ {
+ 0, 2,
+ 1, 3,
+ }
+};
+
+/**
+ * Table containing all the scan matrices
+ */
+const void *gapv_ihevc_scan[] =
+{
+ gau1_ihevc_scan2x2[0],
+ gau1_ihevc_scan4x4[0],
+ gau1_ihevc_scan8x8[0],
+
+ gau1_ihevc_scan2x2[1],
+ gau1_ihevc_scan4x4[1],
+ gau1_ihevc_scan8x8[1],
+
+ gau1_ihevc_scan2x2[2],
+ gau1_ihevc_scan4x4[2],
+ gau1_ihevc_scan8x8[2],
+
+};
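+
+/* Illustrative lookup (editor addition, not from the original source): the
+ * pointer table above is laid out as three scan orders (diagonal, horizontal,
+ * vertical) of three sizes each, so an access of the assumed form
+ *
+ *     const UWORD8 *pu1_scan =
+ *         (const UWORD8 *)gapv_ihevc_scan[scan_idx * 3 + log2_size_minus1];
+ *
+ * with scan_idx in [0,2] and log2_size_minus1 in [0,2] (2x2, 4x4, 8x8) would
+ * select one matrix; the exact indexing used elsewhere in the decoder may
+ * differ.
+ */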
+
+const void *gapv_ihevc_invscan[] =
+{
+ gau1_ihevc_invscan2x2[0],
+ gau1_ihevc_invscan4x4[0],
+ gau1_ihevc_invscan8x8[0],
+
+ gau1_ihevc_invscan2x2[1],
+ gau1_ihevc_invscan4x4[1],
+ gau1_ihevc_invscan8x8[1],
+
+ gau1_ihevc_invscan2x2[2],
+ gau1_ihevc_invscan4x4[2],
+ gau1_ihevc_invscan8x8[2],
+};
+/**
+ * Table for luma to chroma qp conversion
+ */
+
+// For MAIN branch (8 bit)
+const UWORD8 gau1_ihevc_chroma_qp_scale[58] =
+{
+
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
+ 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51
+};
+
+// For HBD branch (8 and 10 bit)
+const WORD8 gai1_ihevc_chroma_qp_scale[70] = // extended for 10 bit
+{
+
+ -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
+ 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51
+};
+
+
+/** constant planar factor values table */
+const UWORD8 gau1_ihevc_planar_factor[65] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64 };
+const UWORD8 gau1_ihevc_planar_factor_1[32] = { 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9,
+ 17, 17, 17, 17, 17, 17, 17, 17, 25, 25, 25, 25, 25, 25,
+ 25, 25 };
+
+/** g_ang_table = tan(actual angle) in Q5 format for all 33 modes */
+const WORD32 gai4_ihevc_ang_table_chroma[35] = { 0, 0, 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9,
+ -13, -17, -21, -26, -32, -26, -21, -17, -13, -9, -5,
+ -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+/** g_invAngTable = Inverse angle in Q5 format, required for negative angles */
+const WORD32 gai4_ihevc_inv_ang_table_chroma[14] = { 4096, 1638, 910, 630, 482, 390, 315,
+ 315, 390, 482, 630, 910, 1638, 4096 };
+
+
+/** constant planar factor values table */
+const UWORD8 gau1_ihevc_planar_factor_chroma[33] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32 };
+
+
+
+/** Filter flag values for intra pred reference filtering - intra pred mode is the index
+* flag for nt = 4 is Bit 0, nt = 8 is Bit 1, nt = 16 is Bit 2, nt = 32 is Bit 3
+*/
+const UWORD8 gau1_intra_pred_ref_filter[] =
+{
+ 14, 0, 14, 12, 12, 12, 12,
+ 12, 12, 8, 0, 8, 12, 12,
+ 12, 12, 12, 12, 14, 12, 12,
+ 12, 12, 12, 12, 8, 0, 8,
+ 12, 12, 12, 12, 12, 12, 14
+};
+
+
+const WORD8 gi1_table_edge_idx[8] = { 1, 2, 0, 3, 4, 0, 0, 0 }; /* First 5 values are valid. Last 3 dummy values are added to help SIMD load */
+
+const UWORD8 gu1_table_band_idx[32] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+};
+
+const UWORD16 gu2_table_band_idx[32] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+};
+
+#ifdef ENABLE_SSE4_1_INTR
+/* Lookup table used to emulate the popcnt instruction on SSE4.1 platforms.
+Each entry holds the number of 1 bits in the index at which it is present
+*/
+const WORD8 gi1_popcnt_byte_table[] =
+{
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
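+
+/* Illustrative use (editor addition, not part of the original source):
+ * byte-wise population count of a 32-bit word via the table above, for
+ * builds where the hardware popcnt instruction is unavailable. */
+#if 0
+static WORD32 example_popcnt_u32(UWORD32 u4_word)
+{
+    return gi1_popcnt_byte_table[u4_word & 0xFF] +
+           gi1_popcnt_byte_table[(u4_word >> 8) & 0xFF] +
+           gi1_popcnt_byte_table[(u4_word >> 16) & 0xFF] +
+           gi1_popcnt_byte_table[(u4_word >> 24) & 0xFF];
+}
+#endif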
+#endif
diff --git a/common/ihevc_common_tables.h b/common/ihevc_common_tables.h
new file mode 100644
index 0000000..ff7e438
--- /dev/null
+++ b/common/ihevc_common_tables.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_common_tables.h
+*
+* @brief
+* Common tables
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_COMMON_TABLES_H_
+#define _IHEVC_COMMON_TABLES_H_
+
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_max_luma_pic_size[];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_max_wd_ht[];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_min_wd_ht[];
+
+
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_ang_table[35];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_inv_ang_table[14];
+
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_scan8x8[][64];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_scan4x4[][16];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_scan2x2[][4];
+
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_invscan8x8[][64];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_invscan4x4[][16];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_invscan2x2[][4];
+
+extern MEM_ALIGN16 const void *gapv_ihevc_scan[];
+extern MEM_ALIGN16 const void *gapv_ihevc_invscan[];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_chroma_qp_scale[];
+extern MEM_ALIGN16 const WORD8 gai1_ihevc_chroma_qp_scale[];
+
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_ang_table_chroma[35];
+extern MEM_ALIGN16 const WORD32 gai4_ihevc_inv_ang_table_chroma[14];
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_planar_factor_chroma[33];
+
+extern MEM_ALIGN16 const UWORD8 gau1_ihevc_planar_factor[65];
+
+extern MEM_ALIGN16 const UWORD8 gau1_intra_pred_ref_filter[];
+
+extern MEM_ALIGN16 const WORD8 gi1_table_edge_idx[8];
+
+extern MEM_ALIGN16 const UWORD8 gu1_table_band_idx[32];
+
+extern MEM_ALIGN16 const UWORD16 gu2_table_band_idx[32];
+
+#endif /*_IHEVC_COMMON_TABLES_H_*/
diff --git a/common/ihevc_deblk.h b/common/ihevc_deblk.h
new file mode 100644
index 0000000..cd4c8c8
--- /dev/null
+++ b/common/ihevc_deblk.h
@@ -0,0 +1,173 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_deblk.h
+*
+* @brief
+* Declarations for the functions defined in ihevc_deblk_edge_filter.c
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEBLK_H_
+#define _IHEVC_DEBLK_H_
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+
+typedef void ihevc_deblk_luma_vert_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q);
+
+typedef void ihevc_deblk_luma_horz_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q);
+
+typedef void ihevc_deblk_chroma_vert_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q);
+
+typedef void ihevc_deblk_chroma_horz_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q);
+
+typedef void ihevc_hbd_deblk_luma_vert_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_deblk_luma_horz_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_deblk_chroma_vert_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_deblk_chroma_horz_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth);
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_a9q;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_a9q;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_a9q;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_a9q;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_a9a;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_a9a;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_a9a;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_a9a;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_neonintr;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_neonintr;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_neonintr;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_neonintr;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_ssse3;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_ssse3;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_ssse3;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_ssse3;
+
+ihevc_hbd_deblk_luma_vert_ft ihevc_hbd_deblk_luma_vert;
+ihevc_hbd_deblk_luma_horz_ft ihevc_hbd_deblk_luma_horz;
+ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_chroma_vert;
+ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_chroma_horz;
+
+ihevc_hbd_deblk_luma_vert_ft ihevc_hbd_deblk_luma_vert_sse42;
+ihevc_hbd_deblk_luma_horz_ft ihevc_hbd_deblk_luma_horz_sse42;
+ihevc_hbd_deblk_chroma_vert_ft ihevc_hbd_deblk_chroma_vert_sse42;
+ihevc_hbd_deblk_chroma_horz_ft ihevc_hbd_deblk_chroma_horz_sse42;
+
+ihevc_deblk_luma_vert_ft ihevc_deblk_luma_vert_av8;
+ihevc_deblk_luma_horz_ft ihevc_deblk_luma_horz_av8;
+ihevc_deblk_chroma_vert_ft ihevc_deblk_chroma_vert_av8;
+ihevc_deblk_chroma_horz_ft ihevc_deblk_chroma_horz_av8;
+
+#endif /*_IHEVC_DEBLK_H_*/
diff --git a/common/ihevc_deblk_edge_filter.c b/common/ihevc_deblk_edge_filter.c
new file mode 100644
index 0000000..8b6e6ea
--- /dev/null
+++ b/common/ihevc_deblk_edge_filter.c
@@ -0,0 +1,1510 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_deblk_edge_filter.c
+*
+* @brief
+* Contains function definitions for deblocking filters
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_deblk_luma_vert()
+* - ihevc_deblk_luma_horz()
+* - ihevc_deblk_chroma_vert()
+* - ihevc_deblk_chroma_horz()
+* - ihevc_hbd_deblk_luma_vert()
+* - ihevc_hbd_deblk_luma_horz()
+* - ihevc_hbd_deblk_chroma_vert()
+* - ihevc_hbd_deblk_chroma_horz()
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevc_debug.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Decision process and filtering for the luma block vertical edge.
+*
+* @par Description:
+* The decision process for the luma block vertical edge is carried out and
+* an appropriate filter is applied. The boundary filter strength, bs should
+* be greater than 0. The pcm flags and the transquant bypass flags should
+* be taken care of by the calling function.
+*
+* @param[in] pu1_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] bs
+* Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+* beta offset divided by 2; (beta_offset_div2 << 1) is added to the QP while deriving beta
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_luma_vert(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_luma, beta_indx, tc_indx;
+ WORD32 beta, tc;
+ WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+ WORD32 d_sam0, d_sam3;
+ WORD32 de, dep, deq;
+ WORD32 row;
+ WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+ WORD32 delta, delta_p, delta_q;
+
+ ASSERT((bs > 0) && (bs <= 3));
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+ beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* BS, depending on the implementation, can take value 3 for an intra/inter edge */
+    /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2) */
+    /* this functionality is achieved by adding (2*(bs>>1)) */
+
+ tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
+
+ beta = gai4_ihevc_beta_table[beta_indx];
+ tc = gai4_ihevc_tc_table[tc_indx];
+ if(0 == tc)
+ {
+ return;
+ }
+
+ dq0 = ABS(pu1_src[2] - 2 * pu1_src[1] + pu1_src[0]);
+ dq3 = ABS(pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]
+ + pu1_src[3 * src_strd + 0]);
+ dp0 = ABS(pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1]);
+ dp3 = ABS(pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]
+ + pu1_src[3 * src_strd - 1]);
+
+ d0 = dp0 + dq0;
+ d3 = dp3 + dq3;
+
+ dp = dp0 + dp3;
+ dq = dq0 + dq3;
+
+ d = d0 + d3;
+
+ de = 0;
+ dep = 0;
+ deq = 0;
+
+ if(d < beta)
+ {
+ d_sam0 = 0;
+ if((2 * d0 < (beta >> 2))
+ && (ABS(pu1_src[3] - pu1_src[0]) + ABS(pu1_src[-1] - pu1_src[-4])
+ < (beta >> 3))
+ && ABS(pu1_src[0] - pu1_src[-1]) < ((5 * tc + 1) >> 1))
+ {
+ d_sam0 = 1;
+ }
+
+ pu1_src += 3 * src_strd;
+ d_sam3 = 0;
+ if((2 * d3 < (beta >> 2))
+ && (ABS(pu1_src[3] - pu1_src[0]) + ABS(pu1_src[-1] - pu1_src[-4])
+ < (beta >> 3))
+ && ABS(pu1_src[0] - pu1_src[-1]) < ((5 * tc + 1) >> 1))
+ {
+ d_sam3 = 1;
+ }
+ pu1_src -= 3 * src_strd;
+
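+        /* de = 2 selects the strong filter, de = 1 the normal filter; dep and
+           deq additionally enable the second p/q samples in the normal filter */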
+ de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+        dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+ if(tc <= 1)
+ {
+ dep = 0;
+ deq = 0;
+ }
+ }
+
+ if(de != 0)
+ {
+ for(row = 0; row < 4; row++)
+ {
+ tmp_p0 = pu1_src[-1];
+ tmp_p1 = pu1_src[-2];
+ tmp_p2 = pu1_src[-3];
+
+ tmp_q0 = pu1_src[0];
+ tmp_q1 = pu1_src[1];
+ tmp_q2 = pu1_src[2];
+
+ if(de == 2)
+ {
+ tmp_q0 = CLIP3((pu1_src[2] + 2 * pu1_src[1] +
+ 2 * pu1_src[0] + 2 * pu1_src[-1] +
+ pu1_src[-2] + 4) >> 3,
+ pu1_src[0] - 2 * tc,
+ pu1_src[0] + 2 * tc);
+
+ tmp_q1 = CLIP3((pu1_src[2] + pu1_src[1] + pu1_src[0] +
+ pu1_src[-1] + 2) >> 2,
+ pu1_src[1] - 2 * tc,
+ pu1_src[1] + 2 * tc);
+
+ tmp_q2 = CLIP3((2 * pu1_src[3] + 3 * pu1_src[2] +
+ pu1_src[1] + pu1_src[0] +
+ pu1_src[-1] + 4) >> 3,
+ pu1_src[2] - 2 * tc,
+ pu1_src[2] + 2 * tc);
+
+ tmp_p0 = CLIP3((pu1_src[1] + 2 * pu1_src[0] +
+ 2 * pu1_src[-1] + 2 * pu1_src[-2] +
+ pu1_src[-3] + 4) >> 3,
+ pu1_src[-1] - 2 * tc,
+ pu1_src[-1] + 2 * tc);
+
+ tmp_p1 = CLIP3((pu1_src[0] + pu1_src[-1] +
+ pu1_src[-2] + pu1_src[-3] + 2) >> 2,
+ pu1_src[-2] - 2 * tc,
+ pu1_src[-2] + 2 * tc);
+
+ tmp_p2 = CLIP3((pu1_src[0] + pu1_src[-1] +
+ pu1_src[-2] + 3 * pu1_src[-3] +
+ 2 * pu1_src[-4] + 4) >> 3,
+ pu1_src[-3] - 2 * tc,
+ pu1_src[-3] + 2 * tc);
+ }
+ else
+ {
+ delta = (9 * (pu1_src[0] - pu1_src[-1]) -
+ 3 * (pu1_src[1] - pu1_src[-2]) + 8) >> 4;
+ if(ABS(delta) < 10 * tc)
+ {
+ delta = CLIP3(delta, -tc, tc);
+
+ tmp_p0 = CLIP_U8(pu1_src[-1] + delta);
+ tmp_q0 = CLIP_U8(pu1_src[0] - delta);
+
+ if(dep == 1)
+ {
+ delta_p = CLIP3((((pu1_src[-3] + pu1_src[-1] + 1) >> 1)
+ - pu1_src[-2] + delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_p1 = CLIP_U8(pu1_src[-2] + delta_p);
+ }
+
+ if(deq == 1)
+ {
+ delta_q = CLIP3((((pu1_src[2] + pu1_src[0] + 1) >> 1)
+ - pu1_src[1] - delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_q1 = CLIP_U8(pu1_src[1] + delta_q);
+ }
+ }
+ }
+
+ if(filter_flag_p != 0)
+ {
+ pu1_src[-3] = tmp_p2;
+ pu1_src[-2] = tmp_p1;
+ pu1_src[-1] = tmp_p0;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu1_src[0] = tmp_q0;
+ pu1_src[1] = tmp_q1;
+ pu1_src[2] = tmp_q2;
+ }
+
+ pu1_src += src_strd;
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Decision process and filtering for the luma block vertical edge for high bit depth.
+*
+* @par Description:
+* The decision process for the luma block vertical edge is carried out and
+* an appropriate filter is applied. The boundary filter strength, bs should
+* be greater than 0. The pcm flags and the transquant bypass flags should
+* be taken care of by the calling function.
+*
+* @param[in] pu2_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] bs
+* Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+* beta offset divided by 2; (beta_offset_div2 << 1) is added to the QP while deriving beta
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_luma_vert(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth)
+{
+ WORD32 qp_luma, beta_indx, tc_indx;
+ WORD32 beta, tc;
+ WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+ WORD32 d_sam0, d_sam3;
+ WORD32 de, dep, deq;
+ WORD32 row;
+ WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+ WORD32 delta, delta_p, delta_q;
+
+ ASSERT((bs > 0) && (bs <= 3));
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+ beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* BS, depending on the implementation, can take value 3 for an intra/inter edge */
+    /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2) */
+    /* this functionality is achieved by adding (2*(bs>>1)) */
+
+ tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
+
+ beta = gai4_ihevc_beta_table[beta_indx] * (1 << (bit_depth - 8));
+ tc = gai4_ihevc_tc_table[tc_indx] * (1 << (bit_depth - 8));
+ if(0 == tc)
+ {
+ return;
+ }
+
+ dq0 = ABS(pu2_src[2] - 2 * pu2_src[1] + pu2_src[0]);
+ dq3 = ABS(pu2_src[3 * src_strd + 2] - 2 * pu2_src[3 * src_strd + 1]
+ + pu2_src[3 * src_strd + 0]);
+ dp0 = ABS(pu2_src[-3] - 2 * pu2_src[-2] + pu2_src[-1]);
+ dp3 = ABS(pu2_src[3 * src_strd - 3] - 2 * pu2_src[3 * src_strd - 2]
+ + pu2_src[3 * src_strd - 1]);
+
+ d0 = dp0 + dq0;
+ d3 = dp3 + dq3;
+
+ dp = dp0 + dp3;
+ dq = dq0 + dq3;
+
+ d = d0 + d3;
+
+ de = 0;
+ dep = 0;
+ deq = 0;
+
+ if(d < beta)
+ {
+ d_sam0 = 0;
+ if((2 * d0 < (beta >> 2))
+ && (ABS(pu2_src[3] - pu2_src[0]) + ABS(pu2_src[-1] - pu2_src[-4])
+ < (beta >> 3))
+ && ABS(pu2_src[0] - pu2_src[-1]) < ((5 * tc + 1) >> 1))
+ {
+ d_sam0 = 1;
+ }
+
+ pu2_src += 3 * src_strd;
+ d_sam3 = 0;
+ if((2 * d3 < (beta >> 2))
+ && (ABS(pu2_src[3] - pu2_src[0]) + ABS(pu2_src[-1] - pu2_src[-4])
+ < (beta >> 3))
+ && ABS(pu2_src[0] - pu2_src[-1]) < ((5 * tc + 1) >> 1))
+ {
+ d_sam3 = 1;
+ }
+ pu2_src -= 3 * src_strd;
+
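+        /* de = 2 selects the strong filter, de = 1 the normal filter; dep and
+           deq additionally enable the second p/q samples in the normal filter */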
+ de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+        dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+        deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+ if(tc <= 1)
+ {
+ dep = 0;
+ deq = 0;
+ }
+ }
+
+ if(de != 0)
+ {
+ for(row = 0; row < 4; row++)
+ {
+ tmp_p0 = pu2_src[-1];
+ tmp_p1 = pu2_src[-2];
+ tmp_p2 = pu2_src[-3];
+
+ tmp_q0 = pu2_src[0];
+ tmp_q1 = pu2_src[1];
+ tmp_q2 = pu2_src[2];
+
+ if(de == 2)
+ {
+ tmp_q0 = CLIP3((pu2_src[2] + 2 * pu2_src[1] +
+ 2 * pu2_src[0] + 2 * pu2_src[-1] +
+ pu2_src[-2] + 4) >> 3,
+ pu2_src[0] - 2 * tc,
+ pu2_src[0] + 2 * tc);
+
+ tmp_q1 = CLIP3((pu2_src[2] + pu2_src[1] + pu2_src[0] +
+ pu2_src[-1] + 2) >> 2,
+ pu2_src[1] - 2 * tc,
+ pu2_src[1] + 2 * tc);
+
+ tmp_q2 = CLIP3((2 * pu2_src[3] + 3 * pu2_src[2] +
+ pu2_src[1] + pu2_src[0] +
+ pu2_src[-1] + 4) >> 3,
+ pu2_src[2] - 2 * tc,
+ pu2_src[2] + 2 * tc);
+
+ tmp_p0 = CLIP3((pu2_src[1] + 2 * pu2_src[0] +
+ 2 * pu2_src[-1] + 2 * pu2_src[-2] +
+ pu2_src[-3] + 4) >> 3,
+ pu2_src[-1] - 2 * tc,
+ pu2_src[-1] + 2 * tc);
+
+ tmp_p1 = CLIP3((pu2_src[0] + pu2_src[-1] +
+ pu2_src[-2] + pu2_src[-3] + 2) >> 2,
+ pu2_src[-2] - 2 * tc,
+ pu2_src[-2] + 2 * tc);
+
+ tmp_p2 = CLIP3((pu2_src[0] + pu2_src[-1] +
+ pu2_src[-2] + 3 * pu2_src[-3] +
+ 2 * pu2_src[-4] + 4) >> 3,
+ pu2_src[-3] - 2 * tc,
+ pu2_src[-3] + 2 * tc);
+ }
+ else
+ {
+ delta = (9 * (pu2_src[0] - pu2_src[-1]) -
+ 3 * (pu2_src[1] - pu2_src[-2]) + 8) >> 4;
+ if(ABS(delta) < 10 * tc)
+ {
+ delta = CLIP3(delta, -tc, tc);
+
+ tmp_p0 = CLIP3(pu2_src[-1] + delta, 0, ((1 << bit_depth) - 1));
+ tmp_q0 = CLIP3(pu2_src[0] - delta, 0, ((1 << bit_depth) - 1));
+ if(dep == 1)
+ {
+ delta_p = CLIP3((((pu2_src[-3] + pu2_src[-1] + 1) >> 1)
+ - pu2_src[-2] + delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_p1 = CLIP3(pu2_src[-2] + delta_p, 0, ((1 << bit_depth) - 1));
+ }
+
+ if(deq == 1)
+ {
+ delta_q = CLIP3((((pu2_src[2] + pu2_src[0] + 1) >> 1)
+ - pu2_src[1] - delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_q1 = CLIP3(pu2_src[1] + delta_q, 0, ((1 << bit_depth) - 1));
+ }
+ }
+ }
+
+ if(filter_flag_p != 0)
+ {
+ pu2_src[-3] = tmp_p2;
+ pu2_src[-2] = tmp_p1;
+ pu2_src[-1] = tmp_p0;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu2_src[0] = tmp_q0;
+ pu2_src[1] = tmp_q1;
+ pu2_src[2] = tmp_q2;
+ }
+
+ pu2_src += src_strd;
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+* Decision process and filtering for the luma block horizontal edge
+*
+* @par Description:
+* The decision process for the luma block horizontal edge is carried out
+* and an appropriate filter is applied. The boundary filter strength, bs
+* should be greater than 0. The pcm flags and the transquant bypass flags
+* should be taken care of by the calling function.
+*
+* @param[in] pu1_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] bs
+* Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+* beta offset divided by 2; (beta_offset_div2 << 1) is added to the QP while deriving beta
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_luma_horz(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_luma, beta_indx, tc_indx;
+ WORD32 beta, tc;
+ WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+ WORD32 d_sam0, d_sam3;
+ WORD32 de, dep, deq;
+ WORD32 col;
+ WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+ WORD32 delta, delta_p, delta_q;
+
+ ASSERT((bs > 0));
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+ beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* BS, depending on the implementation, can take value 3 for an intra/inter edge */
+    /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2) */
+    /* this functionality is achieved by adding (2*(bs>>1)) */
+
+ tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
+
+ beta = gai4_ihevc_beta_table[beta_indx];
+ tc = gai4_ihevc_tc_table[tc_indx];
+ if(0 == tc)
+ {
+ return;
+ }
+
+ dq0 = ABS(pu1_src[2 * src_strd] - 2 * pu1_src[1 * src_strd] +
+ pu1_src[0 * src_strd]);
+
+ dq3 = ABS(pu1_src[3 + 2 * src_strd] - 2 * pu1_src[3 + 1 * src_strd] +
+ pu1_src[3 + 0 * src_strd]);
+
+ dp0 = ABS(pu1_src[-3 * src_strd] - 2 * pu1_src[-2 * src_strd] +
+ pu1_src[-1 * src_strd]);
+
+ dp3 = ABS(pu1_src[3 - 3 * src_strd] - 2 * pu1_src[3 - 2 * src_strd] +
+ pu1_src[3 - 1 * src_strd]);
+
+ d0 = dp0 + dq0;
+ d3 = dp3 + dq3;
+
+ dp = dp0 + dp3;
+ dq = dq0 + dq3;
+
+ d = d0 + d3;
+
+ de = 0;
+ dep = 0;
+ deq = 0;
+
+ if(d < beta)
+ {
+ d_sam0 = 0;
+ if((2 * d0 < (beta >> 2))
+ && (ABS(pu1_src[3 * src_strd] - pu1_src[0 * src_strd]) +
+ ABS(pu1_src[-1 * src_strd] - pu1_src[-4 * src_strd])
+ < (beta >> 3))
+ && ABS(pu1_src[0 * src_strd] - pu1_src[-1 * src_strd])
+ < ((5 * tc + 1) >> 1))
+ {
+ d_sam0 = 1;
+ }
+
+ pu1_src += 3;
+ d_sam3 = 0;
+ if((2 * d3 < (beta >> 2))
+ && (ABS(pu1_src[3 * src_strd] - pu1_src[0 * src_strd]) +
+ ABS(pu1_src[-1 * src_strd] - pu1_src[-4 * src_strd])
+ < (beta >> 3))
+ && ABS(pu1_src[0 * src_strd] - pu1_src[-1 * src_strd])
+ < ((5 * tc + 1) >> 1))
+ {
+ d_sam3 = 1;
+ }
+ pu1_src -= 3;
+
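+        /* de = 2 selects the strong filter, de = 1 the normal filter; dep and
+           deq additionally enable the second p/q samples in the normal filter */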
+ de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+ dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+ deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+ if(tc <= 1)
+ {
+ dep = 0;
+ deq = 0;
+ }
+ }
+
+ if(de != 0)
+ {
+ for(col = 0; col < 4; col++)
+ {
+ tmp_p0 = pu1_src[-1 * src_strd];
+ tmp_p1 = pu1_src[-2 * src_strd];
+ tmp_p2 = pu1_src[-3 * src_strd];
+
+ tmp_q0 = pu1_src[0 * src_strd];
+ tmp_q1 = pu1_src[1 * src_strd];
+ tmp_q2 = pu1_src[2 * src_strd];
+ if(de == 2)
+ {
+ tmp_q0 = CLIP3((pu1_src[2 * src_strd] +
+ 2 * pu1_src[1 * src_strd] +
+ 2 * pu1_src[0 * src_strd] +
+ 2 * pu1_src[-1 * src_strd] +
+ pu1_src[-2 * src_strd] + 4) >> 3,
+ pu1_src[0 * src_strd] - 2 * tc,
+ pu1_src[0 * src_strd] + 2 * tc);
+
+ tmp_q1 = CLIP3((pu1_src[2 * src_strd] +
+ pu1_src[1 * src_strd] +
+ pu1_src[0 * src_strd] +
+ pu1_src[-1 * src_strd] + 2) >> 2,
+ pu1_src[1 * src_strd] - 2 * tc,
+ pu1_src[1 * src_strd] + 2 * tc);
+
+ tmp_q2 = CLIP3((2 * pu1_src[3 * src_strd] +
+ 3 * pu1_src[2 * src_strd] +
+ pu1_src[1 * src_strd] +
+ pu1_src[0 * src_strd] +
+ pu1_src[-1 * src_strd] + 4) >> 3,
+ pu1_src[2 * src_strd] - 2 * tc,
+ pu1_src[2 * src_strd] + 2 * tc);
+
+ tmp_p0 = CLIP3((pu1_src[1 * src_strd] +
+ 2 * pu1_src[0 * src_strd] +
+ 2 * pu1_src[-1 * src_strd] +
+ 2 * pu1_src[-2 * src_strd] +
+ pu1_src[-3 * src_strd] + 4) >> 3,
+ pu1_src[-1 * src_strd] - 2 * tc,
+ pu1_src[-1 * src_strd] + 2 * tc);
+
+ tmp_p1 = CLIP3((pu1_src[0 * src_strd] +
+ pu1_src[-1 * src_strd] +
+ pu1_src[-2 * src_strd] +
+ pu1_src[-3 * src_strd] + 2) >> 2,
+ pu1_src[-2 * src_strd] - 2 * tc,
+ pu1_src[-2 * src_strd] + 2 * tc);
+
+ tmp_p2 = CLIP3((pu1_src[0 * src_strd] +
+ pu1_src[-1 * src_strd] +
+ pu1_src[-2 * src_strd] +
+ 3 * pu1_src[-3 * src_strd] +
+ 2 * pu1_src[-4 * src_strd] + 4) >> 3,
+ pu1_src[-3 * src_strd] - 2 * tc,
+ pu1_src[-3 * src_strd] + 2 * tc);
+ }
+ else
+ {
+ delta = (9 * (pu1_src[0 * src_strd] - pu1_src[-1 * src_strd]) -
+ 3 * (pu1_src[1 * src_strd] - pu1_src[-2 * src_strd]) +
+ 8) >> 4;
+ if(ABS(delta) < 10 * tc)
+ {
+ delta = CLIP3(delta, -tc, tc);
+
+ tmp_p0 = CLIP_U8(pu1_src[-1 * src_strd] + delta);
+ tmp_q0 = CLIP_U8(pu1_src[0 * src_strd] - delta);
+
+ if(dep == 1)
+ {
+ delta_p = CLIP3((((pu1_src[-3 * src_strd] +
+ pu1_src[-1 * src_strd] + 1) >> 1) -
+ pu1_src[-2 * src_strd] + delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_p1 = CLIP_U8(pu1_src[-2 * src_strd] + delta_p);
+ }
+
+ if(deq == 1)
+ {
+ delta_q = CLIP3((((pu1_src[2 * src_strd] +
+ pu1_src[0 * src_strd] + 1) >> 1) -
+ pu1_src[1 * src_strd] - delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_q1 = CLIP_U8(pu1_src[1 * src_strd] + delta_q);
+ }
+ }
+ }
+
+ if(filter_flag_p != 0)
+ {
+ pu1_src[-3 * src_strd] = tmp_p2;
+ pu1_src[-2 * src_strd] = tmp_p1;
+ pu1_src[-1 * src_strd] = tmp_p0;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu1_src[0 * src_strd] = tmp_q0;
+ pu1_src[1 * src_strd] = tmp_q1;
+ pu1_src[2 * src_strd] = tmp_q2;
+ }
+
+ pu1_src += 1;
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+* Decision process and filtering for the luma block horizontal edge for high bit depth
+*
+* @par Description:
+* The decision process for the luma block horizontal edge is carried out
+* and an appropriate filter is applied. The boundary filter strength, bs
+* should be greater than 0. The pcm flags and the transquant bypass flags
+* should be taken care of by the calling function.
+*
+* @param[in] pu2_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] bs
+* Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+* beta offset divided by 2; (beta_offset_div2 << 1) is added to the QP while deriving beta
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_luma_horz(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth)
+{
+ WORD32 qp_luma, beta_indx, tc_indx;
+ WORD32 beta, tc;
+ WORD32 dp0, dp3, dq0, dq3, d0, d3, dp, dq, d;
+ WORD32 d_sam0, d_sam3;
+ WORD32 de, dep, deq;
+ WORD32 col;
+ WORD32 tmp_p0, tmp_p1, tmp_p2, tmp_q0, tmp_q1, tmp_q2;
+ WORD32 delta, delta_p, delta_q;
+
+ ASSERT((bs > 0));
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+ beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+    /* BS, depending on the implementation, can take value 3 for an intra/inter edge */
+    /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+    /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2) */
+    /* this functionality is achieved by adding (2*(bs>>1)) */
+
+ tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
+
+ beta = gai4_ihevc_beta_table[beta_indx] * (1 << (bit_depth - 8));
+ tc = gai4_ihevc_tc_table[tc_indx] * (1 << (bit_depth - 8));
+ if(0 == tc)
+ {
+ return;
+ }
+
+ dq0 = ABS(pu2_src[2 * src_strd] - 2 * pu2_src[1 * src_strd] +
+ pu2_src[0 * src_strd]);
+
+ dq3 = ABS(pu2_src[3 + 2 * src_strd] - 2 * pu2_src[3 + 1 * src_strd] +
+ pu2_src[3 + 0 * src_strd]);
+
+ dp0 = ABS(pu2_src[-3 * src_strd] - 2 * pu2_src[-2 * src_strd] +
+ pu2_src[-1 * src_strd]);
+
+ dp3 = ABS(pu2_src[3 - 3 * src_strd] - 2 * pu2_src[3 - 2 * src_strd] +
+ pu2_src[3 - 1 * src_strd]);
+
+ d0 = dp0 + dq0;
+ d3 = dp3 + dq3;
+
+ dp = dp0 + dp3;
+ dq = dq0 + dq3;
+
+ d = d0 + d3;
+
+ de = 0;
+ dep = 0;
+ deq = 0;
+
+ if(d < beta)
+ {
+ d_sam0 = 0;
+ if((2 * d0 < (beta >> 2))
+ && (ABS(pu2_src[3 * src_strd] - pu2_src[0 * src_strd]) +
+ ABS(pu2_src[-1 * src_strd] - pu2_src[-4 * src_strd])
+ < (beta >> 3))
+ && ABS(pu2_src[0 * src_strd] - pu2_src[-1 * src_strd])
+ < ((5 * tc + 1) >> 1))
+ {
+ d_sam0 = 1;
+ }
+
+ pu2_src += 3;
+ d_sam3 = 0;
+ if((2 * d3 < (beta >> 2))
+ && (ABS(pu2_src[3 * src_strd] - pu2_src[0 * src_strd]) +
+ ABS(pu2_src[-1 * src_strd] - pu2_src[-4 * src_strd])
+ < (beta >> 3))
+ && ABS(pu2_src[0 * src_strd] - pu2_src[-1 * src_strd])
+ < ((5 * tc + 1) >> 1))
+ {
+ d_sam3 = 1;
+ }
+ pu2_src -= 3;
+
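+        /* de = 2 selects the strong filter, de = 1 the normal filter; dep and
+           deq additionally enable the second p/q samples in the normal filter */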
+ de = (d_sam0 == 1 && d_sam3 == 1) ? 2 : 1;
+ dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+ deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
+ if(tc <= 1)
+ {
+ dep = 0;
+ deq = 0;
+ }
+ }
+
+ if(de != 0)
+ {
+ for(col = 0; col < 4; col++)
+ {
+ tmp_p0 = pu2_src[-1 * src_strd];
+ tmp_p1 = pu2_src[-2 * src_strd];
+ tmp_p2 = pu2_src[-3 * src_strd];
+
+ tmp_q0 = pu2_src[0 * src_strd];
+ tmp_q1 = pu2_src[1 * src_strd];
+ tmp_q2 = pu2_src[2 * src_strd];
+ if(de == 2)
+ {
+ tmp_q0 = CLIP3((pu2_src[2 * src_strd] +
+ 2 * pu2_src[1 * src_strd] +
+ 2 * pu2_src[0 * src_strd] +
+ 2 * pu2_src[-1 * src_strd] +
+ pu2_src[-2 * src_strd] + 4) >> 3,
+ pu2_src[0 * src_strd] - 2 * tc,
+ pu2_src[0 * src_strd] + 2 * tc);
+
+ tmp_q1 = CLIP3((pu2_src[2 * src_strd] +
+ pu2_src[1 * src_strd] +
+ pu2_src[0 * src_strd] +
+ pu2_src[-1 * src_strd] + 2) >> 2,
+ pu2_src[1 * src_strd] - 2 * tc,
+ pu2_src[1 * src_strd] + 2 * tc);
+
+ tmp_q2 = CLIP3((2 * pu2_src[3 * src_strd] +
+ 3 * pu2_src[2 * src_strd] +
+ pu2_src[1 * src_strd] +
+ pu2_src[0 * src_strd] +
+ pu2_src[-1 * src_strd] + 4) >> 3,
+ pu2_src[2 * src_strd] - 2 * tc,
+ pu2_src[2 * src_strd] + 2 * tc);
+
+ tmp_p0 = CLIP3((pu2_src[1 * src_strd] +
+ 2 * pu2_src[0 * src_strd] +
+ 2 * pu2_src[-1 * src_strd] +
+ 2 * pu2_src[-2 * src_strd] +
+ pu2_src[-3 * src_strd] + 4) >> 3,
+ pu2_src[-1 * src_strd] - 2 * tc,
+ pu2_src[-1 * src_strd] + 2 * tc);
+
+ tmp_p1 = CLIP3((pu2_src[0 * src_strd] +
+ pu2_src[-1 * src_strd] +
+ pu2_src[-2 * src_strd] +
+ pu2_src[-3 * src_strd] + 2) >> 2,
+ pu2_src[-2 * src_strd] - 2 * tc,
+ pu2_src[-2 * src_strd] + 2 * tc);
+
+ tmp_p2 = CLIP3((pu2_src[0 * src_strd] +
+ pu2_src[-1 * src_strd] +
+ pu2_src[-2 * src_strd] +
+ 3 * pu2_src[-3 * src_strd] +
+ 2 * pu2_src[-4 * src_strd] + 4) >> 3,
+ pu2_src[-3 * src_strd] - 2 * tc,
+ pu2_src[-3 * src_strd] + 2 * tc);
+ }
+ else
+ {
+ delta = (9 * (pu2_src[0 * src_strd] - pu2_src[-1 * src_strd]) -
+ 3 * (pu2_src[1 * src_strd] - pu2_src[-2 * src_strd]) +
+ 8) >> 4;
+ if(ABS(delta) < 10 * tc)
+ {
+ delta = CLIP3(delta, -tc, tc);
+ tmp_p0 = CLIP3(pu2_src[-1 * src_strd] + delta, 0, ((1 << bit_depth) - 1));
+ tmp_q0 = CLIP3(pu2_src[0 * src_strd] - delta, 0, ((1 << bit_depth) - 1));
+ if(dep == 1)
+ {
+ delta_p = CLIP3((((pu2_src[-3 * src_strd] +
+ pu2_src[-1 * src_strd] + 1) >> 1) -
+ pu2_src[-2 * src_strd] + delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_p1 = CLIP3(pu2_src[-2 * src_strd] + delta_p, 0, ((1 << bit_depth) - 1));
+ }
+
+ if(deq == 1)
+ {
+ delta_q = CLIP3((((pu2_src[2 * src_strd] +
+ pu2_src[0 * src_strd] + 1) >> 1) -
+ pu2_src[1 * src_strd] - delta) >> 1,
+ -(tc >> 1),
+ (tc >> 1));
+ tmp_q1 = CLIP3(pu2_src[1 * src_strd] + delta_q, 0, ((1 << bit_depth) - 1));
+ }
+ }
+ }
+
+ if(filter_flag_p != 0)
+ {
+ pu2_src[-3 * src_strd] = tmp_p2;
+ pu2_src[-2 * src_strd] = tmp_p1;
+ pu2_src[-1 * src_strd] = tmp_p0;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu2_src[0 * src_strd] = tmp_q0;
+ pu2_src[1 * src_strd] = tmp_q1;
+ pu2_src[2 * src_strd] = tmp_q2;
+ }
+
+ pu2_src += 1;
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Filtering for the chroma block vertical edge.
+*
+* @par Description:
+* Filter for chroma vertical edge. The boundary filter strength, bs
+* should be greater than 1. The pcm flags and the transquant bypass flags
+* should be taken care of by the calling function.
+*
+* @param[in] pu1_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] qp_offset_u
+* chroma QP offset for the U component
+*
+* @param[in] qp_offset_v
+* chroma QP offset for the V component
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_chroma_vert(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_indx_u, qp_chroma_u;
+ WORD32 qp_indx_v, qp_chroma_v;
+ WORD32 tc_indx_u, tc_u;
+ WORD32 tc_indx_v, tc_v;
+ WORD32 delta_u, tmp_p0_u, tmp_q0_u;
+ WORD32 delta_v, tmp_p0_v, tmp_q0_v;
+ WORD32 row;
+
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ /* chroma processing is done only if BS is 2 */
+ /* this function is assumed to be called only if BS is 2 */
+ qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+ qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+ tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+ tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+ if(0 == tc_u && 0 == tc_v)
+ {
+ return;
+ }
+
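+    /* samples are interleaved CbCr: even horizontal offsets are U, odd are V */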
+ for(row = 0; row < 4; row++)
+ {
+ delta_u = CLIP3((((pu1_src[0] - pu1_src[-2]) << 2) +
+ pu1_src[-4] - pu1_src[2] + 4) >> 3,
+ -tc_u, tc_u);
+
+ tmp_p0_u = CLIP_U8(pu1_src[-2] + delta_u);
+ tmp_q0_u = CLIP_U8(pu1_src[0] - delta_u);
+
+ delta_v = CLIP3((((pu1_src[1] - pu1_src[-1]) << 2) +
+ pu1_src[-3] - pu1_src[3] + 4) >> 3,
+ -tc_v, tc_v);
+
+ tmp_p0_v = CLIP_U8(pu1_src[-1] + delta_v);
+ tmp_q0_v = CLIP_U8(pu1_src[1] - delta_v);
+
+ if(filter_flag_p != 0)
+ {
+ pu1_src[-2] = tmp_p0_u;
+ pu1_src[-1] = tmp_p0_v;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu1_src[0] = tmp_q0_u;
+ pu1_src[1] = tmp_q0_v;
+ }
+
+ pu1_src += src_strd;
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Filtering for the chroma block vertical edge for high bit depth.
+*
+* @par Description:
+* Filter for chroma vertical edge. The boundary filter strength, bs
+* should be greater than 1. The pcm flags and the transquant bypass flags
+* should be taken care of by the calling function.
+*
+* @param[in] pu2_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] qp_offset_u
+* chroma QP offset for the U component
+*
+* @param[in] qp_offset_v
+* chroma QP offset for the V component
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_chroma_vert(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth)
+{
+ WORD32 qp_indx_u, qp_chroma_u;
+ WORD32 qp_indx_v, qp_chroma_v;
+ WORD32 tc_indx_u, tc_u;
+ WORD32 tc_indx_v, tc_v;
+ WORD32 delta_u, tmp_p0_u, tmp_q0_u;
+ WORD32 delta_v, tmp_p0_v, tmp_q0_v;
+ WORD32 row;
+
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ /* chroma processing is done only if BS is 2 */
+ /* this function is assumed to be called only if BS is 2 */
+ qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+ qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+ tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_u = gai4_ihevc_tc_table[tc_indx_u] * (1 << (bit_depth - 8));
+
+ tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_v = gai4_ihevc_tc_table[tc_indx_v] * (1 << (bit_depth - 8));
+
+ if(0 == tc_u && 0 == tc_v)
+ {
+ return;
+ }
+
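+    /* samples are interleaved CbCr: even horizontal offsets are U, odd are V */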
+ for(row = 0; row < 4; row++)
+ {
+ delta_u = CLIP3((((pu2_src[0] - pu2_src[-2]) << 2) +
+ pu2_src[-4] - pu2_src[2] + 4) >> 3,
+ -tc_u, tc_u);
+ tmp_p0_u = CLIP3(pu2_src[-2] + delta_u, 0, ((1 << bit_depth) - 1));
+ tmp_q0_u = CLIP3(pu2_src[0] - delta_u, 0, ((1 << bit_depth) - 1));
+
+ delta_v = CLIP3((((pu2_src[1] - pu2_src[-1]) << 2) +
+ pu2_src[-3] - pu2_src[3] + 4) >> 3,
+ -tc_v, tc_v);
+ tmp_p0_v = CLIP3(pu2_src[-1] + delta_v, 0, ((1 << bit_depth) - 1));
+ tmp_q0_v = CLIP3(pu2_src[1] - delta_v, 0, ((1 << bit_depth) - 1));
+ if(filter_flag_p != 0)
+ {
+ pu2_src[-2] = tmp_p0_u;
+ pu2_src[-1] = tmp_p0_v;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu2_src[0] = tmp_q0_u;
+ pu2_src[1] = tmp_q0_v;
+ }
+
+ pu2_src += src_strd;
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Filtering for the chroma block horizontal edge.
+*
+* @par Description:
+* Filter for chroma horizontal edge. The boundary filter strength, bs
+* should be greater than 1. The pcm flags and the transquant bypass flags
+* should be taken care of by the calling function.
+*
+* @param[in] pu1_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] qp_offset_u
+* chroma QP offset for the U component
+*
+* @param[in] qp_offset_v
+* chroma QP offset for the V component
+*
+* @param[in] tc_offset_div2
+* tc offset divided by 2; (tc_offset_div2 << 1) is added to the QP while deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_chroma_horz(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_indx_u, qp_chroma_u;
+ WORD32 qp_indx_v, qp_chroma_v;
+ WORD32 tc_indx_u, tc_u;
+ WORD32 tc_indx_v, tc_v;
+ WORD32 tc;
+
+ WORD32 delta, tmp_p0, tmp_q0;
+ WORD32 col;
+
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ /* chroma processing is done only if BS is 2 */
+ /* this function is assumed to be called only if BS is 2 */
+ qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+ qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+ tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+ tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+ if(0 == tc_u && 0 == tc_v)
+ {
+ return;
+ }
+
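+    /* the 8 columns cover 4 interleaved CbCr pairs: even columns are U, odd are V */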
+ for(col = 0; col < 8; col++)
+ {
+ tc = (col & 1) ? tc_v : tc_u;
+ delta = CLIP3((((pu1_src[0 * src_strd] -
+ pu1_src[-1 * src_strd]) << 2) +
+ pu1_src[-2 * src_strd] -
+ pu1_src[1 * src_strd] + 4) >> 3,
+ -tc, tc);
+
+ tmp_p0 = CLIP_U8(pu1_src[-1 * src_strd] + delta);
+ tmp_q0 = CLIP_U8(pu1_src[0 * src_strd] - delta);
+
+ if(filter_flag_p != 0)
+ {
+ pu1_src[-1 * src_strd] = tmp_p0;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu1_src[0 * src_strd] = tmp_q0;
+ }
+
+ pu1_src += 1;
+ }
+
+}
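+
+/* Illustrative worked example, assuming 8-bit samples: with p1 = 60, p0 = 70,
+ * q0 = 100, q1 = 90 and tc = 4, the weak filter above computes
+ *     delta = CLIP3((((100 - 70) << 2) + 60 - 90 + 4) >> 3, -4, 4)
+ *           = CLIP3(94 >> 3, -4, 4) = CLIP3(11, -4, 4) = 4
+ * so p0' = CLIP_U8(70 + 4) = 74 and q0' = CLIP_U8(100 - 4) = 96; the step
+ * across the edge is reduced by at most tc on each side. */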
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Filtering for the chroma block horizontal edge.
+*
+* @par Description:
+* Filter for chroma horizontal edge. The boundary filter strength, bs
+* should be greater than 1. The pcm flags and the transquant bypass flags
+* should be taken care of by the calling function.
+*
+* @param[in] pu2_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+* quantization parameter of q block
+*
+* @param[in] qp_offset_u
+* chroma QP offset for the U (Cb) component
+*
+* @param[in] qp_offset_v
+* chroma QP offset for the V (Cr) component
+*
+* @param[in] tc_offset_div2
+* tc offset, divided by 2, signalled in the slice header
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @param[in] bit_depth
+* bit depth of the source samples
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_hbd_deblk_chroma_horz(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q,
+ UWORD8 bit_depth)
+{
+ WORD32 qp_indx_u, qp_chroma_u;
+ WORD32 qp_indx_v, qp_chroma_v;
+ WORD32 tc_indx_u, tc_u;
+ WORD32 tc_indx_v, tc_v;
+ WORD32 tc;
+
+ WORD32 delta, tmp_p0, tmp_q0;
+ WORD32 col;
+
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ /* chroma processing is done only if BS is 2 */
+ /* this function is assumed to be called only if BS is 2 */
+ qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+ qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+ tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_u = gai4_ihevc_tc_table[tc_indx_u] * (1 << (bit_depth - 8));
+
+ tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_v = gai4_ihevc_tc_table[tc_indx_v] * (1 << (bit_depth - 8));
+
+ if(0 == tc_u && 0 == tc_v)
+ {
+ return;
+ }
+
+ for(col = 0; col < 8; col++)
+ {
+ tc = (col & 1) ? tc_v : tc_u;
+ delta = CLIP3((((pu2_src[0 * src_strd] -
+ pu2_src[-1 * src_strd]) << 2) +
+ pu2_src[-2 * src_strd] -
+ pu2_src[1 * src_strd] + 4) >> 3,
+ -tc, tc);
+ tmp_p0 = CLIP3(pu2_src[-1 * src_strd] + delta, 0, ((1 << bit_depth) - 1));
+ tmp_q0 = CLIP3(pu2_src[0 * src_strd] - delta, 0, ((1 << bit_depth) - 1));
+
+ if(filter_flag_p != 0)
+ {
+ pu2_src[-1 * src_strd] = tmp_p0;
+ }
+
+ if(filter_flag_q != 0)
+ {
+ pu2_src[0 * src_strd] = tmp_q0;
+ }
+
+ pu2_src += 1;
+ }
+
+}
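+
+/* Illustrative note on the high-bit-depth variant above: for bit_depth = 10
+ * the table tc is scaled by 1 << (10 - 8) = 4, and the filtered samples are
+ * clipped to [0, (1 << 10) - 1] = [0, 1023] instead of the 8-bit CLIP_U8. */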
diff --git a/common/ihevc_deblk_tables.c b/common/ihevc_deblk_tables.c
new file mode 100644
index 0000000..6fd9e58
--- /dev/null
+++ b/common/ihevc_deblk_tables.c
@@ -0,0 +1,78 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_deblk_tables.c
+*
+* @brief
+* Contains tables used for deblock filters
+*
+* @author
+* Srinivas T
+*
+* @par List of Tables:
+* gai4_ihevc_beta_table
+* gai4_ihevc_tc_table
+* gai4_ihevc_qp_table
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include "ihevc_typedefs.h"
+#include "ihevc_deblk_tables.h"
+
+/**
+ * Beta table for deblocking
+ * Table 8-10 - Derivation of threshold variables beta and tc from input Q
+ */
+const WORD32 gai4_ihevc_beta_table[52] =
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
+ 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64
+};
+
+
+/**
+ * Tc table for deblocking
+ * Table 8-10 - Derivation of threshold variables beta and tc from input Q
+ */
+const WORD32 gai4_ihevc_tc_table[54] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 3, 3, 3, 3, 4,
+ 4, 4, 5, 5, 6, 6, 7, 8, 9,
+ 10, 11, 13, 14, 16, 18, 20, 22, 24
+};
+
+/**
+ * QP table for deblocking
+ * Table 8-9 Specification of QPC as a function of qPi
+ */
+const WORD32 gai4_ihevc_qp_table[58] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32,
+ 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51
+};
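+
+/*
+ * Illustrative sketch (hypothetical helper, not used by the codec) of how the
+ * deblocking functions combine gai4_ihevc_qp_table and gai4_ihevc_tc_table to
+ * derive tc for a chroma edge with BS = 2; CLIP3 comes from ihevc_macros.h,
+ * which is why the block is compiled out here.
+ */
+#if 0
+static WORD32 example_chroma_tc(WORD32 quant_param_p,
+                                WORD32 quant_param_q,
+                                WORD32 qp_offset,
+                                WORD32 tc_offset_div2)
+{
+    WORD32 qp_indx, qp_chroma, tc_indx;
+
+    /* Average the QPs of the p and q blocks and add the chroma QP offset */
+    qp_indx = qp_offset + ((quant_param_p + quant_param_q + 1) >> 1);
+
+    /* Map luma QP to chroma QP via Table 8-9; indices above 57 saturate */
+    qp_chroma = qp_indx < 0 ? qp_indx :
+                    (qp_indx > 57 ? qp_indx - 6 : gai4_ihevc_qp_table[qp_indx]);
+
+    /* Index Table 8-10 with QPc + 2 (BS = 2) plus the signalled tc offset */
+    tc_indx = CLIP3(qp_chroma + 2 + (tc_offset_div2 << 1), 0, 53);
+
+    return gai4_ihevc_tc_table[tc_indx];
+}
+#endif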
diff --git a/common/ihevc_deblk_tables.h b/common/ihevc_deblk_tables.h
new file mode 100644
index 0000000..6387881
--- /dev/null
+++ b/common/ihevc_deblk_tables.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_deblk_tables.h
+*
+* @brief
+* Tables used for deblock filters
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEBLK_TABLES_H_
+#define _IHEVC_DEBLK_TABLES_H_
+
+extern const WORD32 gai4_ihevc_beta_table[52];
+
+extern const WORD32 gai4_ihevc_tc_table[54];
+
+extern const WORD32 gai4_ihevc_qp_table[58];
+
+#endif /*_IHEVC_DEBLK_TABLES_H_*/
diff --git a/common/ihevc_debug.h b/common/ihevc_debug.h
new file mode 100644
index 0000000..4b87e47
--- /dev/null
+++ b/common/ihevc_debug.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_debug.h
+*
+* @brief
+* Definitions for codec debugging
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEBUG_H_
+#define _IHEVC_DEBUG_H_
+
+/* printf() is used by DEBUG, assert() by ASSERT */
+#include <stdio.h>
+#include <assert.h>
+
+#ifdef DEBUG_PRINT
+
+#define DEBUG(...) \
+{ \
+ printf("\n[HEVC DBG] %s/%d:: ", __FUNCTION__, __LINE__); \
+ printf(__VA_ARGS__); \
+}
+
+#else //DEBUG_PRINT
+
+#define DEBUG(...) {}
+
+#endif //DEBUG_PRINT
+
+#if 1
+
+#define ASSERT(x) assert((x))
+//#define ASSERT(x) ihevcd_debug_assert((x))
+
+#else
+#define ASSERT(x) \
+{ \
+ if (!(x)) \
+ { \
+ printf("ASSERT %s %d\n", __FILE__, __LINE__); \
+ exit(-1); \
+ } \
+}
+#endif
+
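+/* Illustrative usage of the macros above (the function below is
+ * hypothetical): DEBUG compiles away unless DEBUG_PRINT is defined, so it
+ * may be left in hot paths, while ASSERT maps onto the standard assert().
+ *
+ *     void example(WORD32 size)
+ *     {
+ *         DEBUG("allocating %d bytes", size);
+ *         ASSERT(size > 0);
+ *     }
+ */
+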
+#endif /* _IHEVC_DEBUG_H_ */
+
diff --git a/common/ihevc_defs.h b/common/ihevc_defs.h
new file mode 100644
index 0000000..bd92d7d
--- /dev/null
+++ b/common/ihevc_defs.h
@@ -0,0 +1,457 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_defs.h
+*
+* @brief
+* Definitions used in the codec
+*
+* @author
+* Ittiam
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_DEFS_H_
+#define _IHEVC_DEFS_H_
+
+/*****************************************************************************/
+/* Profile and Levels */
+/*****************************************************************************/
+enum
+{
+ IHEVC_PROFILE_MAIN = 0,
+};
+
+enum
+{
+ IHEVC_TIER_MAIN,
+ IHEVC_TIER_HIGH,
+};
+
+
+/* Slice type enums - Do not change the values */
+
+enum
+{
+ BSLICE = 0,
+ PSLICE = 1,
+ ISLICE = 2,
+};
+
+/** Enum for Level : The spec defines level_idc as the level multiplied by 30;
+ * the API takes the level multiplied by 10, which is then multiplied by 3 internally
+ */
+enum
+{
+ IHEVC_LEVEL_10 = 10 * 3,
+ IHEVC_LEVEL_20 = 20 * 3,
+ IHEVC_LEVEL_21 = 21 * 3,
+ IHEVC_LEVEL_30 = 30 * 3,
+ IHEVC_LEVEL_31 = 31 * 3,
+ IHEVC_LEVEL_40 = 40 * 3,
+ IHEVC_LEVEL_41 = 41 * 3,
+ IHEVC_LEVEL_50 = 50 * 3,
+ IHEVC_LEVEL_51 = 51 * 3,
+ IHEVC_LEVEL_52 = 52 * 3,
+ IHEVC_LEVEL_60 = 60 * 3,
+ IHEVC_LEVEL_61 = 61 * 3,
+ IHEVC_LEVEL_62 = 62 * 3,
+};
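+
+/* Example: for level 4.1 a caller passes 41, and IHEVC_LEVEL_41 = 41 * 3 = 123,
+ * which equals the spec's level_idc of 4.1 * 30 */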
+
+
+enum
+{
+ NAL_TRAIL_N = 0,
+ NAL_TRAIL_R,
+ NAL_TSA_N,
+ NAL_TSA_R,
+ NAL_STSA_N,
+ NAL_STSA_R,
+ NAL_RADL_N,
+ NAL_RADL_R,
+ NAL_RASL_N,
+ NAL_RASL_R,
+ NAL_RSV_VCL_N10 = 10,
+ NAL_RSV_VCL_N12 = 12,
+ NAL_RSV_VCL_N14 = 14,
+ NAL_RSV_VCL_R11 = 11,
+ NAL_RSV_VCL_R13 = 13,
+ NAL_RSV_VCL_R15 = 15,
+
+ NAL_BLA_W_LP = 16,
+ NAL_BLA_W_DLP,
+ NAL_BLA_N_LP,
+ NAL_IDR_W_LP,
+ NAL_IDR_N_LP,
+ NAL_CRA,
+ NAL_RSV_RAP_VCL22 = 22,
+ NAL_RSV_RAP_VCL23 = 23,
+ NAL_RSV_VCL24 = 24,
+ NAL_RSV_VCL31 = 31,
+ NAL_VPS = 32,
+ NAL_SPS,
+ NAL_PPS,
+ NAL_AUD,
+ NAL_EOS,
+ NAL_EOB,
+ NAL_FD,
+ NAL_PREFIX_SEI = 39,
+ NAL_SUFFIX_SEI = 40,
+ NAL_RSV_NVCL41 = 41,
+ NAL_RSV_NVCL47 = 47 ,
+ NAL_UNSPEC48 = 48 ,
+    NAL_UNSPEC63 = 63,
+};
+
+enum
+{
+ CHROMA_FMT_IDC_MONOCHROME = 0,
+ CHROMA_FMT_IDC_YUV420 = 1,
+ CHROMA_FMT_IDC_YUV422 = 2,
+ CHROMA_FMT_IDC_YUV444 = 3,
+ CHROMA_FMT_IDC_YUV444_PLANES = 4,
+};
+
+/* Pred Modes */
+/* Do not change enum values */
+enum
+{
+ PRED_MODE_INTER = 0,
+ PRED_MODE_INTRA = 1,
+ PRED_MODE_SKIP = 2
+};
+
+/* Partition Modes */
+/* Do not change enum values */
+enum
+{
+ PART_2Nx2N = 0,
+ PART_2NxN = 1,
+ PART_Nx2N = 2,
+ PART_NxN = 3,
+ PART_2NxnU = 4,
+ PART_2NxnD = 5,
+ PART_nLx2N = 6,
+ PART_nRx2N = 7
+};
+
+/* Prediction list */
+/* Do not change enum values */
+enum
+{
+ PRED_L0 = 0,
+ PRED_L1 = 1,
+ PRED_BI = 2
+};
+
+/**
+ * Scan types
+ */
+enum
+{
+ SCAN_DIAG_UPRIGHT,
+ SCAN_HORZ,
+ SCAN_VERT
+};
+
+/**
+ * VUI aspect ratio indicator
+ */
+enum
+{
+ SAR_UNUSED = 0,
+ SAR_1_1 = 1,
+ SAR_12_11,
+ SAR_10_11,
+ SAR_16_11,
+ SAR_40_33,
+ SAR_24_11,
+ SAR_20_11,
+ SAR_32_11,
+ SAR_80_33,
+ SAR_18_11,
+ SAR_15_11,
+ SAR_64_33,
+ SAR_160_99,
+ SAR_4_3,
+ SAR_3_2,
+ SAR_2_1,
+ EXTENDED_SAR = 255
+};
+
+enum
+{
+ VID_FMT_COMPONENT = 0,
+ VID_FMT_PAL,
+ VID_FMT_NTSC,
+ VID_FMT_SECAM,
+ VID_FMT_MAC,
+ VID_FMT_UNSPECIFIED
+};
+
+#define BIT_DEPTH 8
+#define BIT_DEPTH_LUMA BIT_DEPTH
+#define BIT_DEPTH_CHROMA BIT_DEPTH
+/*****************************************************************************/
+/* Profile tier level defs */
+/*****************************************************************************/
+#define MAX_PROFILE_COMPATBLTY 32
+
+/*****************************************************************************/
+/* Reference frame defs */
+/*****************************************************************************/
+/* Maximum DPB size */
+#define MAX_DPB_SIZE 16
+
+
+/*****************************************************************************/
+/* VPS restrictions */
+/*****************************************************************************/
+
+/* Number of VPS allowed in Main Profile */
+#define MAX_VPS_CNT 16
+
+/* Max sub layers in VPS */
+#define VPS_MAX_SUB_LAYERS 7
+
+/* Max number of HRD parameters */
+#define VPS_MAX_HRD_PARAMS 2
+
+/* Maximum number of operation point layers */
+#define VPS_MAX_OP_LAYERS 2
+
+
+/*****************************************************************************/
+/* Tile restrictions */
+/*****************************************************************************/
+/* Minimum tile width in Main Profile */
+#define MIN_TILE_WD MIN_CTB_SIZE
+
+/* Minimum tile height in Main Profile */
+#define MIN_TILE_HT MIN_CTB_SIZE
+
+/*****************************************************************************/
+/* SPS restrictions */
+/*****************************************************************************/
+
+/* Number of SPS allowed in Main Profile*/
+/* An extra buffer is allocated to write the parsed data
+ * It is copied to the appropriate location later */
+#define MAX_SPS_CNT (16 + 1)
+
+/* Max sub layers in PPS */
+#define SPS_MAX_SUB_LAYERS 7
+
+/* Maximum long term reference pics */
+#define MAX_LTREF_PICS_SPS 16
+
+#define MAX_STREF_PICS_SPS 64
+
+/*****************************************************************************/
+/* PPS restrictions */
+/*****************************************************************************/
+
+/* Number of PPS allowed in Main Profile */
+/* An extra buffer is allocated to write the parsed data
+ * It is copied to the appropriate location later */
+#define MAX_PPS_CNT (64 + 1)
+
+/*****************************************************************************/
+/* Macro definitions for sizes of CTB, PU, TU, CU */
+/*****************************************************************************/
+
+/* CTB Size Range */
+#define MAX_CTB_SIZE 64
+#define MIN_CTB_SIZE 16
+
+/* TU Size Range */
+#define MAX_TU_SIZE 32
+#define MIN_TU_SIZE 4
+
+/* Max Transform Size */
+#define MAX_TRANS_SIZE (MAX_TU_SIZE*MAX_TU_SIZE)
+
+/* PU Size Range */
+#define MAX_PU_SIZE 64
+#define MIN_PU_SIZE 4
+
+/* CU Size Range */
+#define MAX_CU_SIZE 64
+#define MIN_CU_SIZE 8
+
+
+/* Number of max TU in a CTB row */
+#define MAX_TU_IN_CTB_ROW ((MAX_CTB_SIZE / MIN_TU_SIZE))
+
+/* Number of max CU in a CTB row */
+#define MAX_CU_IN_CTB_ROW ((MAX_CTB_SIZE / MIN_CU_SIZE))
+
+/* Number of max PU in a CTB row */
+#define MAX_PU_IN_CTB_ROW ((MAX_CTB_SIZE / MIN_PU_SIZE))
+
+/* Number of max CU in a CTB */
+#define MAX_CU_IN_CTB ((MAX_CTB_SIZE / MIN_CU_SIZE) * \
+ (MAX_CTB_SIZE / MIN_CU_SIZE))
+
+/* Number of max PU in a CTB */
+/*****************************************************************************/
+/* Note: though for a 64 x 64 CTB the max number of PUs is 128, 256 entries */
+/* are needed in order to store intra pred info                             */
+/*****************************************************************************/
+#define MAX_PU_IN_CTB ((MAX_CTB_SIZE / MIN_PU_SIZE) * \
+ (MAX_CTB_SIZE / MIN_PU_SIZE))
+
+/* Number of max TU in a CTB */
+#define MAX_TU_IN_CTB ((MAX_CTB_SIZE / MIN_TU_SIZE) * \
+ (MAX_CTB_SIZE / MIN_TU_SIZE))
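+
+/* Example: with MAX_CTB_SIZE = 64 and MIN_TU_SIZE = 4, MAX_TU_IN_CTB is
+ * (64 / 4) * (64 / 4) = 256 */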
+
+
+
+/**
+ * Maximum transform depths
+ */
+#define MAX_TRAFO_DEPTH 5
+
+
+
+
+/* Max number of deblocking edges */
+#define MAX_VERT_DEBLK_EDGES ((MAX_CTB_SIZE/8) * (MAX_CTB_SIZE/4))
+#define MAX_HORZ_DEBLK_EDGES ((MAX_CTB_SIZE/4) * (MAX_CTB_SIZE/8))
+
+/* Qp can not change below 8x8 level */
+#define MAX_DEBLK_QP_CNT ((MAX_CTB_SIZE/8) * (MAX_CTB_SIZE/8))
+
+/*****************************************************************************/
+/* Parsing related macros */
+/*****************************************************************************/
+#define SUBBLK_COEFF_CNT 16
+
+/* Quant and Trans defs */
+
+/*****************************************************************************/
+/* Sizes for Transform functions */
+/*****************************************************************************/
+#define TRANS_SIZE_4 4
+#define TRANS_SIZE_8 8
+#define TRANS_SIZE_16 16
+#define TRANS_SIZE_32 32
+
+
+#define IT_SHIFT_STAGE_1 7
+#define IT_SHIFT_STAGE_2 12
+
+/**
+ * @brief Maximum transform dynamic range (excluding sign bit)
+ */
+#define MAX_TR_DYNAMIC_RANGE 15
+
+/**
+ * @brief Q(QP%6) * IQ(QP%6) = 2^20
+ */
+#define QUANT_IQUANT_SHIFT 20
+
+/**
+ * @brief Q factor for Qp%6 multiplication
+ */
+#define QUANT_SHIFT 14
+
+/**
+ * @brief Q shift factor for flat rescale matrix weights
+ */
+#define FLAT_RESCALE_MAT_Q_SHIFT 11
+
+/**
+ * @brief Scaling matrix is represented in Q15 format
+ */
+#define SCALING_Q_SHIFT 15
+
+/**
+ * @brief rounding factor for quantization represented in Q9 format
+ */
+#define QUANT_ROUND_FACTOR_Q 9
+
+/**
+ * @brief Minimum qp supported in HEVC spec
+ */
+#define MIN_HEVC_QP 0
+
+/**
+ * @brief Maximum qp supported in HEVC spec
+ */
+#define MAX_HEVC_QP 51 //FOR MAIN Branch Encoder
+
+#define MAX_HEVC_QP_10bit 63 //FOR HBD Branch Encoder
+
+
+/**
+ * @brief Total number of transform sizes
+ * used for sizeID while getting scale matrix
+ */
+#define NUM_UNIQUE_TRANS_SIZE 4
+
+/*****************************************************************************/
+/* Number of scaling matrices for each transform size */
+/*****************************************************************************/
+#define SCALE_MAT_CNT_TRANS_SIZE_4 6
+#define SCALE_MAT_CNT_TRANS_SIZE_8 6
+#define SCALE_MAT_CNT_TRANS_SIZE_16 6
+#define SCALE_MAT_CNT_TRANS_SIZE_32 2
+
+/* Maximum number of scale matrices for a given transform size */
+#define SCALE_MAT_CNT_MAX_PER_TRANS_SIZE 6
+
+/* Total number of scale matrices */
+#define TOTAL_SCALE_MAT_COUNT (SCALE_MAT_CNT_TRANS_SIZE_4 + \
+ SCALE_MAT_CNT_TRANS_SIZE_8 + \
+ SCALE_MAT_CNT_TRANS_SIZE_16 + \
+ SCALE_MAT_CNT_TRANS_SIZE_32)
+
+
+/*****************************************************************************/
+/* Intra pred Macros */
+/*****************************************************************************/
+/** Planar Intra prediction mode */
+#define INTRA_PLANAR 0
+
+/** DC Intra prediction mode */
+#define INTRA_DC 1
+
+/** Gives angular mode for intra prediction */
+#define INTRA_ANGULAR(x) (x)
+
+/** Following is used to signal no intra prediction in case of pcm blocks
+ */
+#define INTRA_PRED_NONE 63
+
+
+/** Following is used to signal no intra prediction is needed for first three
+ * 4x4 luma blocks in case of 4x4 TU sizes
+ * Also used in pcm cases
+ */
+#define INTRA_PRED_CHROMA_IDX_NONE 7
+
+
+
+#endif /*_IHEVC_DEFS_H_*/
diff --git a/common/ihevc_disp_mgr.c b/common/ihevc_disp_mgr.c
new file mode 100644
index 0000000..e52b2fc
--- /dev/null
+++ b/common/ihevc_disp_mgr.c
@@ -0,0 +1,188 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_disp_mgr.c
+*
+* @brief
+* Contains function definitions for display management
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_disp_mgr_init()
+* - ihevc_disp_mgr_add()
+* - ihevc_disp_mgr_get()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_disp_mgr.h"
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Initialization function for display buffer manager
+*
+* @par Description:
+* Initializes the display buffer management structure
+*
+* @param[in] ps_disp_mgr
+* Pointer to the display buffer management structure
+*
+* @returns none
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ihevc_disp_mgr_init(
+ disp_mgr_t *ps_disp_mgr)
+{
+ WORD32 id;
+
+ ps_disp_mgr->u4_last_abs_poc = DEFAULT_POC;
+
+ for(id = 0; id < DISP_MGR_MAX_CNT; id++)
+ {
+ ps_disp_mgr->ai4_abs_poc[id] = DEFAULT_POC;
+ ps_disp_mgr->apv_ptr[id] = NULL;
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Adds a buffer to the display manager
+*
+* @par Description:
+* Adds a buffer to the display buffer manager
+*
+* @param[in] ps_disp_mgr
+* Pointer to the display buffer management structure
+*
+* @param[in] buf_id
+* ID of the display buffer
+*
+* @param[in] abs_poc
+* Absolute POC of the display buffer
+*
+* @param[in] pv_ptr
+* Pointer to the display buffer
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+WORD32 ihevc_disp_mgr_add(disp_mgr_t *ps_disp_mgr,
+ WORD32 buf_id,
+ WORD32 abs_poc,
+ void *pv_ptr)
+{
+ if(buf_id >= DISP_MGR_MAX_CNT)
+ {
+ return (-1);
+ }
+
+ if(ps_disp_mgr->apv_ptr[buf_id] != NULL)
+ {
+ return (-1);
+ }
+
+ ps_disp_mgr->apv_ptr[buf_id] = pv_ptr;
+ ps_disp_mgr->ai4_abs_poc[buf_id] = abs_poc;
+ return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the next buffer
+*
+* @par Description:
+* Gets the next display buffer
+*
+* @param[in] ps_disp_mgr
+* Pointer to the display buffer structure
+*
+* @param[out] pi4_buf_id
+* Pointer to hold buffer id of the display buffer being returned
+*
+* @returns Pointer to the next display buffer
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void* ihevc_disp_mgr_get(
+ disp_mgr_t *ps_disp_mgr,
+ WORD32 *pi4_buf_id)
+{
+ WORD32 id;
+ void *pv_ret_ptr;
+ WORD32 i4_min_poc;
+ WORD32 min_poc_id;
+
+
+ pv_ret_ptr = NULL;
+ i4_min_poc = 0x7FFFFFFF;
+ min_poc_id = -1;
+
+ /* Find minimum POC */
+ for(id = 0; id < DISP_MGR_MAX_CNT; id++)
+ {
+ if((DEFAULT_POC != ps_disp_mgr->ai4_abs_poc[id]) &&
+ (ps_disp_mgr->ai4_abs_poc[id] <= i4_min_poc))
+ {
+ i4_min_poc = ps_disp_mgr->ai4_abs_poc[id];
+ min_poc_id = id;
+ }
+ }
+ *pi4_buf_id = min_poc_id;
+ /* If all pocs are still default_poc then return NULL */
+ if(-1 == min_poc_id)
+ {
+ return NULL;
+ }
+
+ pv_ret_ptr = ps_disp_mgr->apv_ptr[min_poc_id];
+
+ /* Set abs poc to default and apv_ptr to null so that the buffer is not returned again */
+ ps_disp_mgr->apv_ptr[min_poc_id] = NULL;
+ ps_disp_mgr->ai4_abs_poc[min_poc_id] = DEFAULT_POC;
+ return pv_ret_ptr;
+}
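+
+/*
+ * Illustrative usage sketch (hypothetical function, compiled out): decoded
+ * pictures are added as they are reconstructed and drained in POC order.
+ */
+#if 0
+static void disp_mgr_example(void)
+{
+    disp_mgr_t s_disp_mgr;
+    UWORD8 au1_pic0[1], au1_pic1[1];
+    void *pv_disp_buf;
+    WORD32 buf_id;
+
+    ihevc_disp_mgr_init(&s_disp_mgr);
+
+    /* Add two decoded pictures out of display order; buf_id indexes apv_ptr[] */
+    ihevc_disp_mgr_add(&s_disp_mgr, 0, 8, au1_pic0); /* POC 8 */
+    ihevc_disp_mgr_add(&s_disp_mgr, 1, 4, au1_pic1); /* POC 4 */
+
+    /* The minimum-POC buffer comes out first: buf_id == 1 (POC 4) */
+    pv_disp_buf = ihevc_disp_mgr_get(&s_disp_mgr, &buf_id);
+
+    /* Then buf_id == 0 (POC 8); a further call returns NULL with buf_id == -1 */
+    pv_disp_buf = ihevc_disp_mgr_get(&s_disp_mgr, &buf_id);
+    pv_disp_buf = ihevc_disp_mgr_get(&s_disp_mgr, &buf_id);
+}
+#endif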
diff --git a/common/ihevc_disp_mgr.h b/common/ihevc_disp_mgr.h
new file mode 100644
index 0000000..aa5bd29
--- /dev/null
+++ b/common/ihevc_disp_mgr.h
@@ -0,0 +1,71 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_disp_mgr.h
+*
+* @brief
+* Function declarations used for display management
+*
+* @author
+* Srinivas T
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _DISP_MGR_H_
+#define _DISP_MGR_H_
+
+#define DISP_MGR_MAX_CNT 64
+#define DEFAULT_POC 0x7FFFFFFF
+
+typedef struct
+{
+ /**
+ * last_abs_poc
+ */
+ UWORD32 u4_last_abs_poc;
+
+ /**
+     * ai4_abs_poc[DISP_MGR_MAX_CNT]
+ */
+ WORD32 ai4_abs_poc[DISP_MGR_MAX_CNT];
+
+ /**
+ * apv_ptr[DISP_MGR_MAX_CNT]
+ */
+ void *apv_ptr[DISP_MGR_MAX_CNT];
+}disp_mgr_t;
+
+void ihevc_disp_mgr_init(
+ disp_mgr_t *ps_disp_mgr);
+
+WORD32 ihevc_disp_mgr_add(
+ disp_mgr_t *ps_disp_mgr,
+ WORD32 id,
+ WORD32 abs_poc,
+ void *pv_ptr);
+
+void* ihevc_disp_mgr_get(disp_mgr_t *ps_disp_mgr,
+ WORD32 *pi4_buf_id);
+
+#endif //_DISP_MGR_H_
diff --git a/common/ihevc_dpb_mgr.c b/common/ihevc_dpb_mgr.c
new file mode 100644
index 0000000..7a2e032
--- /dev/null
+++ b/common/ihevc_dpb_mgr.c
@@ -0,0 +1,506 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_dpb_mgr.c
+ *
+ * @brief
+ * Function definitions used for decoded picture buffer management
+ *
+ * @author
+ * Srinivas T
+ *
+ * @par List of Functions:
+ * - ihevc_dpb_mgr_init()
+ * - ihevc_dpb_mgr_insert_ref()
+ * - ihevc_dpb_mgr_del_ref()
+ * - ihevc_dpb_mgr_get_ref_by_nearest_poc()
+ * - ihevc_dpb_mgr_get_ref_by_poc()
+ * - ihevc_dpb_mgr_get_ref_by_poc_lsb()
+ * - ihevc_dpb_mgr_reset()
+ * - ihevc_dpb_mgr_release_pics()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * DPB manager initializer
+ *
+ * @par Description:
+ * Initialises the DPB manager structure
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr)
+{
+ UWORD32 i;
+ dpb_info_t *ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ ps_dpb_info[i].ps_prev_dpb = NULL;
+ ps_dpb_info[i].ps_pic_buf = NULL;
+
+ }
+
+ ps_dpb_mgr->u1_num_ref_bufs = 0;
+ ps_dpb_mgr->ps_dpb_head = NULL;
+
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Adds a reference picture into the linked list
+ *
+ * @par Description:
+ * Adds the reference buffer with the given buffer id into the DPB manager
+ *
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to the DPB manager structure
+ *
+ * @param[in] ps_pic_buf
+ * Pointer to the picture buffer
+ *
+ * @param[in] buf_id
+ * buffer id of the picture buffer
+ *
+ * @returns 0 if successful, -1 otherwise
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+WORD32 ihevc_dpb_mgr_insert_ref(dpb_mgr_t *ps_dpb_mgr,
+ pic_buf_t *ps_pic_buf,
+ WORD32 buf_id)
+{
+ int i;
+ dpb_info_t *ps_dpb_info;
+
+ ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+ /* Return error if buffer is already present in the DPB */
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if((ps_dpb_info[i].ps_pic_buf == ps_pic_buf)
+ && (ps_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+ {
+ return (-1);
+ }
+
+
+ }
+
+ /* Find an unused DPB location */
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if(NULL == ps_dpb_info[i].ps_pic_buf)
+ {
+ break;
+ }
+ }
+ if(i == MAX_DPB_BUFS)
+ {
+ return (-1);
+ }
+
+ /* Create DPB info */
+ ps_dpb_info[i].ps_pic_buf = ps_pic_buf;
+ ps_dpb_info[i].ps_prev_dpb = ps_dpb_mgr->ps_dpb_head;
+ ps_dpb_info[i].ps_pic_buf->u1_buf_id = buf_id;
+ ps_dpb_info[i].ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+ /* update the head node of linked list to point to the current picture */
+ ps_dpb_mgr->ps_dpb_head = ps_dpb_info + i;
+
+ /* Increment Short term buffer count */
+ ps_dpb_mgr->u1_num_ref_bufs++;
+
+ return 0;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Deletes a reference buffer from the dpb manager
+ *
+ * @par Description:
+ * Delete short term reference with a given POC from the linked
+ * list
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to DPB Manager structure
+ *
+ * @param[in] ps_buf_mgr
+ * Pointer to buffer manager structure
+ *
+ * @param[in] i4_abs_poc
+ * Node's absolute poc
+ *
+ * @returns none
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_del_ref(dpb_mgr_t *ps_dpb_mgr,
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 i4_abs_poc)
+{
+ int i;
+ dpb_info_t *ps_next_dpb;
+
+ dpb_info_t *ps_unmark_node;
+ UWORD8 u1_del_node;
+ UNUSED(u1_del_node);
+ u1_del_node = 0;
+
+ /* Find the node with matching absolute POC */
+    ps_next_dpb = ps_dpb_mgr->ps_dpb_head;
+    if(NULL == ps_next_dpb)
+    {
+        return;
+    }
+
+    if(ps_next_dpb->ps_pic_buf->i4_abs_poc == i4_abs_poc)
+ {
+ ps_unmark_node = ps_next_dpb;
+ }
+ else
+ {
+ for(i = 1; i < ps_dpb_mgr->u1_num_ref_bufs; i++)
+ {
+ if(ps_next_dpb->ps_prev_dpb->ps_pic_buf->i4_abs_poc == i4_abs_poc)
+ break;
+ ps_next_dpb = ps_next_dpb->ps_prev_dpb;
+ }
+
+ if(i == ps_dpb_mgr->u1_num_ref_bufs)
+ {
+ return;
+ }
+ else
+ ps_unmark_node = ps_next_dpb->ps_prev_dpb;
+ }
+
+ if(ps_unmark_node == ps_dpb_mgr->ps_dpb_head)
+ {
+ ps_dpb_mgr->ps_dpb_head = ps_unmark_node->ps_prev_dpb;
+ }
+ else
+ {
+ ps_next_dpb->ps_prev_dpb = ps_unmark_node->ps_prev_dpb; //update link
+ ps_unmark_node->ps_prev_dpb = NULL;
+ }
+ ps_dpb_mgr->u1_num_ref_bufs--; //decrement buffer count
+
+ /* Release the physical buffer */
+ ihevc_buf_mgr_release((buf_mgr_t *)ps_buf_mgr, ps_unmark_node->ps_pic_buf->u1_buf_id,
+ BUF_MGR_REF);
+ ps_unmark_node->ps_prev_dpb = NULL;
+ ps_unmark_node->ps_pic_buf = NULL;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Gets a buffer with abs_poc closest to the current poc
+ *
+ * @par Description:
+ * Returns the pointer to the picture buffer whose POC is nearest to
+ * cur_abs_poc, preferring the closest past (smaller POC) reference
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to DPB Manager structure
+ *
+ * @param[in] cur_abs_poc
+ * Absolute POC of the current picture
+ *
+ * @returns
+ * Pointer to the picture buffer if found, NULL otherwise
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_nearest_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 cur_abs_poc)
+{
+ WORD32 i;
+ WORD32 min_diff = 0x7FFFFFFF;
+ pic_buf_t *ps_pic_buf = NULL;
+
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) &&
+ (ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref != UNUSED_FOR_REF))
+ {
+ WORD32 poc_diff = cur_abs_poc - ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->i4_abs_poc;
+ if((poc_diff > 0) && (poc_diff < min_diff))
+ {
+ min_diff = poc_diff;
+ ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+ }
+ }
+ }
+
+ if(NULL == ps_pic_buf)
+ {
+ min_diff = 0x7FFFFFFF;
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) &&
+ (ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref != UNUSED_FOR_REF))
+ {
+ WORD32 poc_diff = cur_abs_poc - ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->i4_abs_poc;
+ if(ABS(poc_diff) < min_diff)
+ {
+ min_diff = ABS(poc_diff);
+ ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+ }
+ }
+ }
+ }
+
+ return ps_pic_buf;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Gets a buffer with abs_poc
+ *
+ * @par Description:
+ * Returns the pointer to the picture buffer whose poc is equal to abs_poc
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to DPB Manager structure
+ *
+ * @param[in] abs_poc
+ * POC of the buffer to be returned
+ *
+ * @returns
+ * Pointer to the picture buffer if found, NULL otherwise
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 abs_poc)
+{
+ UWORD32 i;
+ dpb_info_t *ps_next_ref;
+ pic_buf_t *ps_pic_buf = NULL;
+
+
+ ps_next_ref = ps_dpb_mgr->ps_dpb_head;
+ for(i = 0; i < ps_dpb_mgr->u1_num_ref_bufs; i++)
+ {
+ if(ps_next_ref->ps_pic_buf->i4_abs_poc == abs_poc)
+ {
+ ps_pic_buf = ps_next_ref->ps_pic_buf;
+ break;
+ }
+
+ ps_next_ref = ps_next_ref->ps_prev_dpb;
+ }
+
+ if(i == ps_dpb_mgr->u1_num_ref_bufs)
+ {
+ ps_pic_buf = NULL;
+ }
+
+ return ps_pic_buf;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Gets a buffer with poc_lsb
+ *
+ * @par Description:
+ * Returns the pointer to the picture buffer whose poc is equal to poc_lsb
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to DPB Manager structure
+ *
+ * @param[in] poc_lsb
+ * poc_lsb of the buffer to be returned
+ *
+ * @returns
+ * Pointer to the picture buffer if found, NULL otherwise
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc_lsb(dpb_mgr_t *ps_dpb_mgr, WORD32 poc_lsb)
+{
+ pic_buf_t *ps_pic_buf = NULL;
+ UWORD32 i;
+ dpb_info_t *ps_next_ref;
+
+ ps_next_ref = ps_dpb_mgr->ps_dpb_head;
+ for(i = 0; i < ps_dpb_mgr->u1_num_ref_bufs; i++)
+ {
+ if(ps_next_ref->ps_pic_buf->i4_poc_lsb == poc_lsb)
+ {
+ ps_pic_buf = ps_next_ref->ps_pic_buf;
+ break;
+ }
+
+ ps_next_ref = ps_next_ref->ps_prev_dpb;
+ }
+
+ if(i == ps_dpb_mgr->u1_num_ref_bufs)
+ {
+ ps_pic_buf = NULL;
+ }
+
+ return ps_pic_buf;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Resets the DPB manager
+ *
+ * @par Description:
+ * Re-initialises the DPB manager structure
+ *
+ * @param[in] ps_dpb_mgr
+ * Pointer to DPB Manager structure
+ *
+ * @param[in] ps_buf_mgr
+ * Pointer to buffer manager structure
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr)
+{
+ int i;
+ dpb_info_t *ps_dpb_info;
+
+ ps_dpb_info = ps_dpb_mgr->as_dpb_info;
+
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+        if((NULL != ps_dpb_info[i].ps_pic_buf) &&
+           (ps_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+ {
+ ps_dpb_info[i].ps_pic_buf->u1_used_as_ref = UNUSED_FOR_REF;
+ ps_dpb_info[i].ps_prev_dpb = NULL;
+ //Release physical buffer
+ ihevc_buf_mgr_release(ps_buf_mgr, ps_dpb_info[i].ps_pic_buf->u1_buf_id,
+ BUF_MGR_REF);
+
+ ps_dpb_info[i].ps_pic_buf = NULL;
+ }
+ }
+ ps_dpb_mgr->u1_num_ref_bufs = 0;
+ ps_dpb_mgr->ps_dpb_head = NULL;
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * deletes all pictures from DPB
+ *
+ * @par Description:
+ * Deletes all pictures present in the DPB manager
+ *
+ * @param[in] ps_buf_mgr
+ * Pointer to buffer manager structure
+ *
+ * @param[in] u1_disp_bufs
+ * Number of buffers to be deleted
+ *
+ * @returns
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+void ihevc_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs)
+{
+ WORD8 i;
+ UWORD32 buf_status;
+
+ for(i = 0; i < u1_disp_bufs; i++)
+ {
+ buf_status = ihevc_buf_mgr_get_status(ps_buf_mgr, i);
+ if(0 != buf_status)
+ {
+ ihevc_buf_mgr_release((buf_mgr_t *)ps_buf_mgr, i, BUF_MGR_REF);
+ }
+ }
+}
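+
+/*
+ * Illustrative usage sketch (hypothetical function, compiled out): reference
+ * bookkeeping for a picture assumed to have absolute POC 16 and buffer id 0.
+ */
+#if 0
+static void dpb_mgr_example(dpb_mgr_t *ps_dpb_mgr,
+                            buf_mgr_t *ps_buf_mgr,
+                            pic_buf_t *ps_pic_buf)
+{
+    pic_buf_t *ps_ref;
+
+    ihevc_dpb_mgr_init(ps_dpb_mgr);
+
+    /* Mark the reconstructed picture as a short term reference */
+    ihevc_dpb_mgr_insert_ref(ps_dpb_mgr, ps_pic_buf, 0);
+
+    /* Look it up by POC while predicting a subsequent picture */
+    ps_ref = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, 16);
+
+    /* Remove it from the DPB; this also releases the BUF_MGR_REF hold */
+    ihevc_dpb_mgr_del_ref(ps_dpb_mgr, ps_buf_mgr, 16);
+}
+#endif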
diff --git a/common/ihevc_dpb_mgr.h b/common/ihevc_dpb_mgr.h
new file mode 100644
index 0000000..bf60413
--- /dev/null
+++ b/common/ihevc_dpb_mgr.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_dpb_mgr.h
+ *
+ * @brief
+ * Function declarations used for decoded picture buffer management
+ *
+ * @author
+ * Srinivas T
+ *
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#ifndef _DPB_MANAGER_H
+#define _DPB_MANAGER_H
+
+/* Temporary definitions. Have to be defined later */
+
+#define MAX_DPB_BUFS (MAX_DPB_SIZE * 4)
+
+#define MARK_ST_PICNUM_AS_NONREF 1
+#define MARK_LT_INDEX_AS_NONREF 2
+#define MARK_ST_PICNUM_AS_LT_INDEX 3
+#define RESET_REF_PICTURES 5
+
+typedef struct dpb_info_t dpb_info_t;
+
+enum
+{
+ UNUSED_FOR_REF = 0,
+ LONG_TERM_REF,
+ SHORT_TERM_REF,
+};
+struct dpb_info_t
+{
+ /**
+ * Pointer to picture buffer structure
+ */
+ pic_buf_t *ps_pic_buf;
+
+ /**
+ * Link to the DPB buffer with previous pic Num
+ */
+ dpb_info_t *ps_prev_dpb;
+
+};
+
+typedef struct
+{
+ /**
+ * Pointer to the most recent pic Num
+ */
+ dpb_info_t *ps_dpb_head;
+
+ /**
+ * Physical storage for dpbInfo for ref bufs
+ */
+ dpb_info_t as_dpb_info[MAX_DPB_BUFS];
+
+ /**
+ * Number of reference buffers
+ */
+ UWORD8 u1_num_ref_bufs;
+
+}dpb_mgr_t;
+
+void ihevc_dpb_mgr_init(dpb_mgr_t *ps_dpb_mgr);
+
+WORD32 ihevc_dpb_mgr_insert_ref(dpb_mgr_t *ps_dpb_mgr,
+ pic_buf_t *ps_pic_buf,
+ WORD32 buf_id);
+
+void ihevc_dpb_mgr_del_ref(dpb_mgr_t *ps_dpb_mgr,
+ buf_mgr_t *ps_buf_mgr,
+                          WORD32 i4_abs_poc);
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_nearest_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 cur_abs_poc);
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc(dpb_mgr_t *ps_dpb_mgr, WORD32 abs_poc);
+
+pic_buf_t* ihevc_dpb_mgr_get_ref_by_poc_lsb(dpb_mgr_t *ps_dpb_mgr, WORD32 poc_lsb);
+
+void ihevc_dpb_mgr_reset(dpb_mgr_t *ps_dpb_mgr, buf_mgr_t *ps_buf_mgr);
+
+void ihevc_dpb_mgr_release_pics(buf_mgr_t *ps_buf_mgr, UWORD8 u1_disp_bufs);
+
+#endif /* _DPB_MANAGER_H */
diff --git a/common/ihevc_error.h b/common/ihevc_error.h
new file mode 100644
index 0000000..38eeccd
--- /dev/null
+++ b/common/ihevc_error.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_error.h
+*
+* @brief
+* Definitions related to error handling for common modules
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_ERROR_H_
+#define _IHEVC_ERROR_H_
+
+/**
+ * Enumerations for error codes used in the codec.
+ * Not all these are expected to be returned to the application.
+ * Only a select few will be exported
+ */
+typedef enum
+{
+ /**
+ * No error
+ */
+ IHEVC_SUCCESS = 0,
+ /**
+ * Start error code for decoder
+ */
+ IHEVC_DEC_ERROR_START = 0x100,
+
+ /**
+ * Start error code for encoder
+ */
+ IHEVC_ENC_ERROR_START = 0x200,
+ /**
+ * Generic failure
+ */
+ IHEVC_FAIL = 0x7FFFFFFF
+}IHEVC_ERROR_T;
+#endif /* _IHEVC_ERROR_H_ */
diff --git a/common/ihevc_func_types.h b/common/ihevc_func_types.h
new file mode 100644
index 0000000..f5a2c44
--- /dev/null
+++ b/common/ihevc_func_types.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_func_types.h
+*
+* @brief
+* Defines different types of function implementations, e.g. C, Cortex-A8
+* intrinsics, Neon assembly, etc.
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_FUNC_TYPES_H_
+#define _IHEVC_FUNC_TYPES_H_
+
+
+/* C Model : No platform specific intrinsics or inline assemblies */
+#define C 0
+
+/* Cortex Ax intrinsics */
+#define CXAINTR 10
+
+/* Neon intrinsics */
+#define NEONINTR 11
+
+/* X86 intrinsics */
+#define X86INTR 12
+
+/* X64 intrinsics */
+#define X64INTR 13
+
+/* Atom intrinsics */
+#define ATOMINTR 14
+
+/* Cortex Ax assembly */
+#define CXAASM 20
+
+/* Neon assembly */
+#define NEONASM 21
+
+/* X86 assembly */
+#define X86ASM 22
+
+
+#endif /* _IHEVC_FUNC_TYPES_H_ */
diff --git a/common/ihevc_inter_pred.h b/common/ihevc_inter_pred.h
new file mode 100644
index 0000000..b6cca71
--- /dev/null
+++ b/common/ihevc_inter_pred.h
@@ -0,0 +1,403 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_inter_pred.h
+*
+* @brief
+* Declarations for the functions defined in ihevc_inter_pred_ft.c
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_INTER_PRED_H_
+#define _IHEVC_INTER_PRED_H_
+
+#define NTAPS_LUMA 8
+#define NTAPS_CHROMA 4
+#define SHIFT_14_MINUS_BIT_DEPTH (14 - BIT_DEPTH)
+#define OFFSET_14_MINUS_BIT_DEPTH (1 << (SHIFT_14_MINUS_BIT_DEPTH - 1))
+#define OFFSET14 (1 << (14 - 1))
+#define FILTER_PREC 6
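+
+/* Worked example, assuming BIT_DEPTH = 8: SHIFT_14_MINUS_BIT_DEPTH = 6 and
+ * OFFSET_14_MINUS_BIT_DEPTH = 1 << 5 = 32, so a 14-bit intermediate x is
+ * rounded back to pixel range as (x + 32) >> 6; OFFSET14 = 8192 is the
+ * matching rounding term for 14-bit accumulators, and FILTER_PREC = 6
+ * corresponds to interpolation filter taps that sum to 1 << 6 = 64. */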
+
+#define REF_WIDTH 1280
+#define REF_HEIGHT 720
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+
+typedef void ihevc_inter_pred_ft(
+ UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_inter_pred_w16out_ft(
+ UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_inter_pred_w16inp_ft(
+ WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_inter_pred_w16inp_w16out_ft(
+ WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd);
+
+
+typedef void ihevc_hbd_inter_pred_ft(UWORD16 *pu2_src,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_inter_pred_w16out_ft(
+ UWORD16 *pu2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_inter_pred_w16inp_ft(
+ WORD16 *pi2_src,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_inter_pred_w16inp_w16out_ft(
+ WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_weighted_pred_uni_ft(
+ WORD16 *pi2_src,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_weighted_pred_bi_ft(
+ WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 wgt1,
+ WORD32 off1,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_weighted_pred_bi_default_ft(
+ WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+typedef void ihevc_hbd_weighted_pred_chroma_uni_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_weighted_pred_chroma_bi_ft(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 wgt1_cb,
+ WORD32 wgt1_cr,
+ WORD32 off1_cb,
+ WORD32 off1_cr,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_weighted_pred_chroma_bi_default_ft(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd,
+ UWORD8 bit_depth);
+/* C function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_copy;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_horz;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_vert;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_copy_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_horz_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16out;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_luma_vert_w16inp;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16inp_w16out;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_copy;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_horz;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_vert;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_copy_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_horz_w16out;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16out;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_chroma_vert_w16inp;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_w16out;
+ihevc_hbd_weighted_pred_uni_ft ihevc_hbd_weighted_pred_uni;
+ihevc_hbd_weighted_pred_bi_ft ihevc_hbd_weighted_pred_bi;
+ihevc_hbd_weighted_pred_bi_default_ft ihevc_hbd_weighted_pred_bi_default;
+ihevc_hbd_weighted_pred_chroma_uni_ft ihevc_hbd_weighted_pred_chroma_uni;
+ihevc_hbd_weighted_pred_chroma_bi_ft ihevc_hbd_weighted_pred_chroma_bi;
+ihevc_hbd_weighted_pred_chroma_bi_default_ft ihevc_hbd_weighted_pred_chroma_bi_default;
+
+/* A9 Q function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_a9q;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_a9q;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_a9q;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_a9q;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_a9q;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_a9q;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q;
+
+/* A9 A function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_a9a;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_a9a;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_a9a;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_a9a;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_a9a;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_a9a;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_a9a;
+
+/* NEONINTR function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_neonintr;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_neonintr;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_neonintr;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_neonintr;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_neonintr;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_neonintr;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_ssse3;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_ssse3;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_ssse3;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_ssse3;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_ssse3;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_ssse3;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3;
+
+/* SSE42 function declarations */
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_sse42;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_sse42;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_sse42;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_copy_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_horz_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_luma_vert_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_copy_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_horz_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16out_sse42;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_luma_vert_w16inp_sse42;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_luma_vert_w16inp_w16out_sse42;
+
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_copy_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_horz_sse42;
+ihevc_hbd_inter_pred_ft ihevc_hbd_inter_pred_chroma_vert_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_copy_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_horz_w16out_sse42;
+ihevc_hbd_inter_pred_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16out_sse42;
+ihevc_hbd_inter_pred_w16inp_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_sse42;
+ihevc_hbd_inter_pred_w16inp_w16out_ft ihevc_hbd_inter_pred_chroma_vert_w16inp_w16out_sse42;
+
+ihevc_hbd_weighted_pred_uni_ft ihevc_hbd_weighted_pred_uni_sse42;
+ihevc_hbd_weighted_pred_bi_ft ihevc_hbd_weighted_pred_bi_sse42;
+ihevc_hbd_weighted_pred_bi_default_ft ihevc_hbd_weighted_pred_bi_default_sse42;
+ihevc_hbd_weighted_pred_chroma_uni_ft ihevc_hbd_weighted_pred_chroma_uni_sse42;
+ihevc_hbd_weighted_pred_chroma_bi_ft ihevc_hbd_weighted_pred_chroma_bi_sse42;
+ihevc_hbd_weighted_pred_chroma_bi_default_ft ihevc_hbd_weighted_pred_chroma_bi_default_sse42;
+
+#ifndef DISABLE_AVX2
+/* AVX2 function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_avx2;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_avx2;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_avx2;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_avx2;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_avx2;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_avx2;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_avx2;
+#endif
+
+/* armv8 function declarations */
+ihevc_inter_pred_ft ihevc_inter_pred_luma_copy_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_horz_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_luma_vert_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_copy_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_horz_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_luma_vert_w16out_av8;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_luma_vert_w16inp_av8;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_luma_vert_w16inp_w16out_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_copy_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_horz_av8;
+ihevc_inter_pred_ft ihevc_inter_pred_chroma_vert_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_copy_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_horz_w16out_av8;
+ihevc_inter_pred_w16out_ft ihevc_inter_pred_chroma_vert_w16out_av8;
+ihevc_inter_pred_w16inp_ft ihevc_inter_pred_chroma_vert_w16inp_av8;
+ihevc_inter_pred_w16inp_w16out_ft ihevc_inter_pred_chroma_vert_w16inp_w16out_av8;
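+
+/* Illustrative sketch (editor's addition, not part of the original header):
+ * each group above declares per-ISA variants of the same function typedef,
+ * and ihevc_func_selector.h is included by the C files, which suggests the
+ * variants are bound to function pointers once at init time. A hypothetical
+ * selection could look like:
+ *
+ *     ihevc_inter_pred_ft *pf_luma_horz = ihevc_inter_pred_luma_horz;
+ *     if(cpu_has_neon)  // hypothetical capability flag
+ *         pf_luma_horz = ihevc_inter_pred_luma_horz_a9q;
+ *     pf_luma_horz(pu1_src, pu1_dst, src_strd, dst_strd, pi1_coeff, ht, wd);
+ */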
+#endif /*_IHEVC_INTER_PRED_H_*/
diff --git a/common/ihevc_inter_pred_filters.c b/common/ihevc_inter_pred_filters.c
new file mode 100644
index 0000000..717bb53
--- /dev/null
+++ b/common/ihevc_inter_pred_filters.c
@@ -0,0 +1,1214 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_inter_pred_filters.c
+*
+* @brief
+* Contains function definitions for inter prediction interpolation filters
+*
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_inter_pred_luma_copy()
+* - ihevc_inter_pred_luma_horz()
+* - ihevc_inter_pred_luma_vert()
+* - ihevc_inter_pred_luma_copy_w16out()
+* - ihevc_inter_pred_luma_horz_w16out()
+* - ihevc_inter_pred_luma_vert_w16out()
+* - ihevc_inter_pred_luma_vert_w16inp()
+* - ihevc_inter_pred_luma_vert_w16inp_w16out()
+* - ihevc_inter_pred_chroma_copy()
+* - ihevc_inter_pred_chroma_horz()
+* - ihevc_inter_pred_chroma_vert()
+* - ihevc_inter_pred_chroma_copy_w16out()
+* - ihevc_inter_pred_chroma_horz_w16out()
+* - ihevc_inter_pred_chroma_vert_w16out()
+* - ihevc_inter_pred_chroma_vert_w16inp()
+* - ihevc_inter_pred_chroma_vert_w16inp_w16out()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+
+#include "ihevc_inter_pred.h"
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma function for copy
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst'
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_copy(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UNUSED(pi1_coeff);
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ pu1_dst[col] = pu1_src[col];
+ }
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+}
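+
+/* Editor's sketch (hypothetical buffers, not source material): the copy
+ * kernel serves the full-pel case where no interpolation is needed, and
+ * since pi1_coeff is explicitly UNUSED above, NULL is an acceptable
+ * argument for it:
+ *
+ *     UWORD8 au1_ref[64 * 64], au1_pred[64 * 64];
+ *     ihevc_inter_pred_luma_copy(au1_ref, au1_pred, 64, 64, NULL, 8, 8);
+ */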
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for horizontal input
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed by 'pu1_src' and writes to the location pointed
+* by 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_luma_horz(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i2_tmp = 0;
+ for(i = 0; i < NTAPS_LUMA; i++)
+ i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3)];
+
+ i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i2_tmp = CLIP_U8(i2_tmp);
+
+ pu1_dst[col] = (UWORD8)i2_tmp;
+ }
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+
+}
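+
+/* Worked example (editor's illustration; the filter taps come from the HEVC
+ * spec, not this file): for 8-bit video the downshift is 6 and the rounding
+ * offset is 32. With the standard half-sample luma filter
+ * {-1, 4, -11, 40, 40, -11, 4, -1}, whose taps sum to 64, a flat run of
+ * 128s accumulates to 128 * 64 = 8192, and (8192 + 32) >> 6 = 128, so flat
+ * regions pass through the filter unchanged.
+ */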
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for vertical input
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pu1_src' and writes to the location pointed by
+* 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_vert(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i2_tmp = 0;
+ for(i = 0; i < NTAPS_LUMA; i++)
+ i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
+
+ i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i2_tmp = CLIP_U8(i2_tmp);
+
+ pu1_dst[col] = (UWORD8)i2_tmp;
+ }
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for copy 16bit output
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst' The output is upshifted by 6
+* bits and is used as input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_copy_w16out(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UNUSED(pi1_coeff);
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+ }
+
+ pu1_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+
+}
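+
+/* Editor's note (illustrative): for 8-bit input SHIFT_14_MINUS_BIT_DEPTH is
+ * 6, so the copied samples become 14-bit intermediates; the largest value,
+ * 255 << 6 = 16320, still fits comfortably in the signed 16-bit destination
+ * until a later vertical-filter or weighted-prediction stage shifts it back
+ * down.
+ */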
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for horizontal 16bit output
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed by 'pu1_src' and writes to the location pointed
+* by 'pu1_dst' No downshifting or clipping is done and the output is used
+* as an input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_horz_w16out(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i2_tmp = 0;
+ for(i = 0; i < NTAPS_LUMA; i++)
+ i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3)];
+
+ pi2_dst[col] = i2_tmp;
+ }
+
+ pu1_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for vertical 16bit output
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pu1_src' and writes to the location pointed by
+* 'pu1_dst' No downshifting or clipping is done and the output is used as
+* an input for weighted prediction
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_vert_w16out(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i2_tmp = 0;
+ for(i = 0; i < NTAPS_LUMA; i++)
+ i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
+
+ pi2_dst[col] = i2_tmp;
+ }
+
+ pu1_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+* Luma vertical filter for 16bit input.
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pi2_src' and writes to the location pointed by
+* 'pu1_dst'. Input is 16 bits. The filter output is downshifted by 12 and
+* clipped to lie between 0 and 255
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_luma_vert_w16inp(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i4_tmp = 0;
+ for(i = 0; i < NTAPS_LUMA; i++)
+ i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
+
+ i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i4_tmp = CLIP_U8(i4_tmp);
+
+ pu1_dst[col] = i4_tmp;
+ }
+
+ pi2_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+
+}
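+
+/* Editor's illustration of the "downshifted by 12" note above: the 16-bit
+ * input already carries the horizontal stage's gain of 64, and the vertical
+ * taps contribute another factor of 64, so the code shifts right by 6, adds
+ * the rounding offset 32, then shifts right by 6 again (a total downshift
+ * of 12) before clipping to the 0..255 pixel range.
+ */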
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Luma prediction filter for vertical 16bit input & output
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pi2_src' and writes to the location pointed by
+* 'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6 and
+* 8192 is subtracted to store it as a 16 bit number. The output is used as
+* an input to weighted prediction
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_luma_vert_w16inp_w16out(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i4_tmp = 0;
+ for(i = 0; i < NTAPS_LUMA; i++)
+ i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
+
+ i4_tmp = (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14;
+
+ pi2_dst[col] = i4_tmp;
+ }
+
+ pi2_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+
+}
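+
+/* Editor's note (illustrative, assuming OFFSET14 == 1 << 13 == 8192 to match
+ * the "8192 is subtracted" description above): the subtraction recentres the
+ * intermediate around zero so that sums of two such values in bi-prediction
+ * still fit in signed 16-bit storage; the weighted-prediction stage is then
+ * expected to add the offset back before final rounding.
+ */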
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for copy
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst'
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_copy(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UNUSED(pi1_coeff);
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ pu1_dst[col] = pu1_src[col];
+ }
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+}
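+
+/* Editor's note (illustrative): the chroma kernels step over 2 * wd bytes
+ * per row because Cb and Cr are stored interleaved (a semi-planar layout),
+ * so wd counts chroma sample pairs; a 4x4 chroma block therefore occupies
+ * 8 bytes per row as CbCrCbCrCbCrCbCr.
+ */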
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for horizontal input
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed by 'pu1_src' and writes to the location pointed
+* by 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_horz(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp_u, i2_tmp_v;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col += 2)
+ {
+ i2_tmp_u = 0;
+ i2_tmp_v = 0;
+ for(i = 0; i < NTAPS_CHROMA; i++)
+ {
+ i2_tmp_u += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
+ i2_tmp_v += pi1_coeff[i] * pu1_src[col + 1 + (i - 1) * 2];
+ }
+
+ i2_tmp_u = (i2_tmp_u + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i2_tmp_u = CLIP_U8(i2_tmp_u);
+ i2_tmp_v = (i2_tmp_v + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i2_tmp_v = CLIP_U8(i2_tmp_v);
+
+
+ pu1_dst[col] = (UWORD8)i2_tmp_u;
+ pu1_dst[col + 1] = (UWORD8)i2_tmp_v;
+ }
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+}
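+
+/* Worked example (editor's illustration; the taps come from the HEVC spec,
+ * not this file): chroma uses the 4-tap NTAPS_CHROMA filter, and the
+ * standard half-sample chroma filter {-4, 36, 36, -4} also sums to 64, so a
+ * flat run of 128s yields (128 * 64 + 32) >> 6 = 128 for both the Cb phase
+ * (col) and the interleaved Cr phase (col + 1).
+ */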
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for vertical input
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pu1_src' and writes to the location pointed by
+* 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_vert(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ i2_tmp = 0;
+ for(i = 0; i < NTAPS_CHROMA; i++)
+ {
+ i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * src_strd];
+ }
+
+ i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i2_tmp = CLIP_U8(i2_tmp);
+
+ pu1_dst[col] = (UWORD8)i2_tmp;
+ }
+
+ pu1_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* chroma interprediction filter for copying 16bit output
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst' The output is upshifted by 6
+* bits and is used as input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_copy_w16out(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UNUSED(pi1_coeff);
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+ }
+
+ pu1_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* chroma interprediction filter to store horizontal 16bit output
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed by 'pu1_src' and writes to the location pointed
+* by 'pu1_dst' No downshifting or clipping is done and the output is used
+* as an input for vertical filtering or weighted prediction
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_horz_w16out(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp_u, i2_tmp_v;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col += 2)
+ {
+ i2_tmp_u = 0;
+ i2_tmp_v = 0;
+ for(i = 0; i < NTAPS_CHROMA; i++)
+ {
+ i2_tmp_u += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
+ i2_tmp_v += pi1_coeff[i] * pu1_src[col + 1 + (i - 1) * 2];
+ }
+
+ pi2_dst[col] = i2_tmp_u;
+ pi2_dst[col + 1] = i2_tmp_v;
+ }
+
+ pu1_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction chroma filter to store vertical 16bit output
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pu1_src' and writes to the location pointed by
+* 'pu1_dst' No downshifting or clipping is done and the output is used as
+* an input for weighted prediction
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_inter_pred_chroma_vert_w16out(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD16 i2_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ i2_tmp = 0;
+ for(i = 0; i < NTAPS_CHROMA; i++)
+ {
+ i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 1) * src_strd];
+ }
+
+ pi2_dst[col] = i2_tmp;
+ }
+
+ pu1_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* chroma interprediction filter for vertical 16bit input
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pi2_src' and writes to the location pointed by
+* 'pu1_dst'. Input is 16 bits. The filter output is downshifted by 12 and
+* clipped to lie between 0 and 255
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_chroma_vert_w16inp(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ i4_tmp = 0;
+ for(i = 0; i < NTAPS_CHROMA; i++)
+ {
+ i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 1) * src_strd];
+ }
+
+ i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+ i4_tmp = CLIP_U8(i4_tmp);
+
+ pu1_dst[col] = i4_tmp;
+ }
+
+ pi2_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+* Chroma interprediction filter for 16bit vertical input and output.
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pi2_src' and writes to the location pointed by
+* 'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6 and
+* 8192 is subtracted to store it as a 16 bit number. The output is used as
+* an input to weighted prediction
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_chroma_vert_w16inp_w16out(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, i;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ i4_tmp = 0;
+ for(i = 0; i < NTAPS_CHROMA; i++)
+ {
+ i4_tmp += pi1_coeff[i] * pi2_src[col + (i - 1) * src_strd];
+ }
+
+ i4_tmp = (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH);
+
+ pi2_dst[col] = i4_tmp;
+ }
+
+ pi2_src += src_strd;
+ pi2_dst += dst_strd;
+ }
+
+}
+
+
diff --git a/common/ihevc_intra_pred.h b/common/ihevc_intra_pred.h
new file mode 100644
index 0000000..a29e99d
--- /dev/null
+++ b/common/ihevc_intra_pred.h
@@ -0,0 +1,410 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_intra_pred.h
+*
+* @brief
+* Declarations for the functions defined in ihevc_intra_pred_filters
+*
+* @author
+* Mamatha
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_INTRA_PRED_H_
+#define IHEVC_INTRA_PRED_H_
+
+
+/*****************************************************************************/
+/* Macro definitions */
+/*****************************************************************************/
+#define look_up_trailing_zeros(x) (0 == (x) ? 8 : CTZ(x))
+
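+/* Editor's note (illustrative): CTZ presumably maps to a count-trailing-zeros
+ * intrinsic defined in ihevc_platform_macros.h; such intrinsics are typically
+ * undefined for an input of 0, hence the explicit check that returns 8 (no
+ * set bit in the flag group). For example, look_up_trailing_zeros(0x10)
+ * evaluates to 4.
+ */
+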
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+typedef void ihevc_intra_pred_luma_planar_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+
+typedef void ihevc_intra_pred_luma_dc_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_luma_horz_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_ver_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode2_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_18_34_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_3_to_9_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_11_to_17_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_19_to_25_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+
+typedef void ihevc_intra_pred_luma_mode_27_to_33_ft(
+ UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void ihevc_intra_pred_luma_ref_substitution_ft(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd);
+
+
+typedef void ihevc_intra_pred_luma_ref_subst_all_avlble_ft(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd);
+
+typedef void ihevc_intra_pred_ref_filtering_ft(UWORD8 *pu1_src,
+ WORD32 nt,
+ UWORD8 *pu1_dst,
+ WORD32 mode,
+ WORD32 strong_intra_smoothing_enable_flag);
+
+typedef void ihevc_hbd_intra_pred_luma_planar_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_dc_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_horz_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_ver_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode2_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_18_34_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_3_to_9_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_11_to_17_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_intra_pred_luma_mode_19_to_25_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+
+
+typedef void ihevc_hbd_intra_pred_luma_mode_27_to_33_ft(
+ UWORD16 *pu2_ref,
+ WORD32 src_strd,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode,
+ UWORD8 bit_depth);
+
+typedef void ihevc_hbd_intra_pred_luma_ref_substitution_ft(UWORD16 *pu2_top_left,
+ UWORD16 *pu2_top,
+ UWORD16 *pu2_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD16 *pu2_dst,
+ WORD32 dst_strd,
+ UWORD8 bit_depth);
+
+
+
+typedef void ihevc_hbd_intra_pred_ref_filtering_ft(UWORD16 *pu2_src,
+ WORD32 nt,
+ UWORD16 *pu2_dst,
+ WORD32 mode,
+ WORD32 strong_intra_smoothing_enable_flag,
+ UWORD8 bit_depth);
+
+/* C function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution;
+ihevc_intra_pred_luma_ref_subst_all_avlble_ft ihevc_intra_pred_luma_ref_subst_all_avlble;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering;
+
+ihevc_hbd_intra_pred_luma_planar_ft ihevc_hbd_intra_pred_luma_planar;
+ihevc_hbd_intra_pred_luma_dc_ft ihevc_hbd_intra_pred_luma_dc;
+ihevc_hbd_intra_pred_luma_horz_ft ihevc_hbd_intra_pred_luma_horz;
+ihevc_hbd_intra_pred_luma_ver_ft ihevc_hbd_intra_pred_luma_ver;
+ihevc_hbd_intra_pred_luma_mode2_ft ihevc_hbd_intra_pred_luma_mode2;
+ihevc_hbd_intra_pred_luma_mode_18_34_ft ihevc_hbd_intra_pred_luma_mode_18_34;
+ihevc_hbd_intra_pred_luma_mode_3_to_9_ft ihevc_hbd_intra_pred_luma_mode_3_to_9;
+ihevc_hbd_intra_pred_luma_mode_11_to_17_ft ihevc_hbd_intra_pred_luma_mode_11_to_17;
+ihevc_hbd_intra_pred_luma_mode_19_to_25_ft ihevc_hbd_intra_pred_luma_mode_19_to_25;
+ihevc_hbd_intra_pred_luma_mode_27_to_33_ft ihevc_hbd_intra_pred_luma_mode_27_to_33;
+ihevc_hbd_intra_pred_luma_ref_substitution_ft ihevc_hbd_intra_pred_luma_ref_substitution;
+ihevc_hbd_intra_pred_ref_filtering_ft ihevc_hbd_intra_pred_ref_filtering;
+
+
+/* A9Q function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_a9q;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_a9q;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_a9q;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_a9q;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_a9q;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_a9q;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_a9q;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_a9q;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_a9q;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_a9q;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_a9q;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_a9q;
+
+/* A9 A function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_a9a;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_a9a;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_a9a;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_a9a;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_a9a;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_a9a;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_a9a;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_a9a;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_a9a;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_a9a;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_a9a;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_a9a;
+
+/* NEONINTR function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_neonintr;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_neonintr;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_neonintr;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_neonintr;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_neonintr;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_neonintr;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_neonintr;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_neonintr;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_neonintr;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_neonintr;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_neonintr;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_ssse3;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_ssse3;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_ssse3;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_ssse3;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_ssse3;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_ssse3;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_ssse3;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_ssse3;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_ssse3;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_ssse3;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_ssse3;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_ssse3;
+
+/* SSE42 function declarations */
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_sse42;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_sse42;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_sse42;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_sse42;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_sse42;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_sse42;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_sse42;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_sse42;
+ihevc_hbd_intra_pred_luma_planar_ft ihevc_hbd_intra_pred_luma_planar_sse42;
+ihevc_hbd_intra_pred_luma_dc_ft ihevc_hbd_intra_pred_luma_dc_sse42;
+ihevc_hbd_intra_pred_luma_horz_ft ihevc_hbd_intra_pred_luma_horz_sse42;
+ihevc_hbd_intra_pred_luma_ver_ft ihevc_hbd_intra_pred_luma_ver_sse42;
+ihevc_hbd_intra_pred_luma_mode2_ft ihevc_hbd_intra_pred_luma_mode2_sse42;
+ihevc_hbd_intra_pred_luma_mode_18_34_ft ihevc_hbd_intra_pred_luma_mode_18_34_sse42;
+ihevc_hbd_intra_pred_luma_mode_3_to_9_ft ihevc_hbd_intra_pred_luma_mode_3_to_9_sse42;
+ihevc_hbd_intra_pred_luma_mode_11_to_17_ft ihevc_hbd_intra_pred_luma_mode_11_to_17_sse42;
+ihevc_hbd_intra_pred_luma_mode_19_to_25_ft ihevc_hbd_intra_pred_luma_mode_19_to_25_sse42;
+ihevc_hbd_intra_pred_luma_mode_27_to_33_ft ihevc_hbd_intra_pred_luma_mode_27_to_33_sse42;
+ihevc_hbd_intra_pred_luma_ref_substitution_ft ihevc_hbd_intra_pred_luma_ref_substitution_sse42;
+ihevc_hbd_intra_pred_ref_filtering_ft ihevc_hbd_intra_pred_ref_filtering_sse42;
+
+/* AVX function declarations */
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_avx;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_avx;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_avx;
+
+ihevc_hbd_intra_pred_luma_dc_ft ihevc_hbd_intra_pred_luma_dc_avx;
+ihevc_hbd_intra_pred_luma_mode_18_34_ft ihevc_hbd_intra_pred_luma_mode_18_34_avx;
+ihevc_hbd_intra_pred_luma_ver_ft ihevc_hbd_intra_pred_luma_ver_avx;
+ihevc_hbd_intra_pred_ref_filtering_ft ihevc_hbd_intra_pred_ref_filtering_avx;
+
+/* armv8 function declarations */
+ihevc_intra_pred_luma_planar_ft ihevc_intra_pred_luma_planar_av8;
+ihevc_intra_pred_luma_dc_ft ihevc_intra_pred_luma_dc_av8;
+ihevc_intra_pred_luma_horz_ft ihevc_intra_pred_luma_horz_av8;
+ihevc_intra_pred_luma_ver_ft ihevc_intra_pred_luma_ver_av8;
+ihevc_intra_pred_luma_mode2_ft ihevc_intra_pred_luma_mode2_av8;
+ihevc_intra_pred_luma_mode_18_34_ft ihevc_intra_pred_luma_mode_18_34_av8;
+ihevc_intra_pred_luma_mode_3_to_9_ft ihevc_intra_pred_luma_mode_3_to_9_av8;
+ihevc_intra_pred_luma_mode_11_to_17_ft ihevc_intra_pred_luma_mode_11_to_17_av8;
+ihevc_intra_pred_luma_mode_19_to_25_ft ihevc_intra_pred_luma_mode_19_to_25_av8;
+ihevc_intra_pred_luma_mode_27_to_33_ft ihevc_intra_pred_luma_mode_27_to_33_av8;
+ihevc_intra_pred_luma_ref_substitution_ft ihevc_intra_pred_luma_ref_substitution_av8;
+ihevc_intra_pred_ref_filtering_ft ihevc_intra_pred_ref_filtering_av8;
+#endif /* IHEVC_INTRA_PRED_H_ */
diff --git a/common/ihevc_intra_pred_filters.c b/common/ihevc_intra_pred_filters.c
new file mode 100644
index 0000000..d6bc2ab
--- /dev/null
+++ b/common/ihevc_intra_pred_filters.c
@@ -0,0 +1,1553 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_intra_pred_filters.c
+*
+* @brief
+* Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_intra_pred_luma_planar()
+* - ihevc_intra_pred_luma_dc()
+* - ihevc_intra_pred_luma_horz()
+* - ihevc_intra_pred_luma_ver()
+* - ihevc_intra_pred_luma_mode2()
+* - ihevc_intra_pred_luma_mode_18_34()
+* - ihevc_intra_pred_luma_mode_3_to_9()
+* - ihevc_intra_pred_luma_mode_11_to_17()
+* - ihevc_intra_pred_luma_mode_19_to_25()
+* - ihevc_intra_pred_luma_mode_27_to_33()
+*  - ihevc_intra_pred_luma_ref_substitution()
+*  - ihevc_intra_pred_luma_ref_subst_all_avlble()
+*  - ihevc_intra_pred_ref_filtering()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_debug.h"
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+#define GET_BITS(y,x) (((y) & (1 << (x))) && (1 << (x)))
+
+/*****************************************************************************/
+/* global tables Definition */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for pu1_ref substitution
+*
+*
+* @par Description:
+* Reference substitution process for samples unavailable for prediction
+* Refer to section 8.4.4.2.2
+*
+* @param[in] pu1_top_left
+* UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+* UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+* UWORD8 pointer to the left
+*
+* @param[in] src_strd
+* WORD32 Source stride
+*
+* @param[in] nbr_flags
+* WORD32 neighbor availability flags
+*
+* @param[in] nt
+* WORD32 transform Block size
+*
+* @param[in] dst_strd
+* WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ihevc_intra_pred_luma_ref_subst_all_avlble(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+
+ WORD32 i;
+ WORD32 two_nt = 2 * nt;
+ UNUSED(nbr_flags);
+ UNUSED(dst_strd);
+
+ /* Neighbor Flag Structure*/
+ /* MSB ---> LSB */
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ 1 4 4 4 4
+ */
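+    /* Editor's worked example (illustrative): under this packing, 0x11188
+       sets one bit in every field (top-left plus one bit each of top-right,
+       top, left and bottom-left), 0x133CC sets two bits per 4-bit field, and
+       0x1FFFF sets all of them; these are the "all neighbours available"
+       patterns accepted by the ASSERT below. */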
+ ASSERT((nbr_flags == 0x11188) || (nbr_flags == 0x133CC) || (nbr_flags == 0x1FFFF));
+ {
+
+        if(nt == 4)
+        {
+            /* All neighbours are available here (see the ASSERT above), so
+               each segment of the reference array is filled unconditionally */
+            pu1_dst[two_nt] = *pu1_top_left;
+
+            /* left */
+            for(i = 0; i < nt; i++)
+                pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+            /* bot_left */
+            for(i = nt; i < two_nt; i++)
+                pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+            /* top */
+            ihevc_memcpy(&pu1_dst[two_nt + 1], pu1_top, nt);
+
+            /* tp_right */
+            ihevc_memcpy(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+        }
+        else
+        {
+            /* All neighbours are available: fill the corresponding samples */
+            ASSERT((nt == 8) || (nt == 16) || (nt == 32));
+            pu1_dst[two_nt] = *pu1_top_left;
+
+            for(i = 0; i < nt; i++)
+                pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+            for(i = nt; i < two_nt; i++)
+                pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+
+            ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1], pu1_top, nt);
+
+            ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+        }
+
+ }
+}
+
+
+void ihevc_intra_pred_luma_ref_substitution(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+ UWORD8 pu1_ref;
+ WORD32 dc_val, i;
+ WORD32 total_samples = (4 * nt) + 1;
+ WORD32 two_nt = 2 * nt;
+
+ WORD32 three_nt = 3 * nt;
+ WORD32 get_bits;
+ WORD32 next;
+ WORD32 bot_left, left, top, tp_right, tp_left;
+
+ WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+ UNUSED(dst_strd);
+ /*dc_val = 1 << (BIT_DEPTH - 1);*/
+ dc_val = 1 << (8 - 1);
+
+
+ /* Neighbor Flag Structure*/
+ /* MSB ---> LSB */
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ 1 4 4 4 4
+ */
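+    /* Editor's note on the reference-array layout implied by the fills below
+       (illustrative): pu1_dst[0 .. nt-1] holds the below-left column (bottom
+       to top), pu1_dst[nt .. 2*nt-1] the left column, pu1_dst[2*nt] the
+       top-left pixel, pu1_dst[2*nt+1 .. 3*nt] the top row, and
+       pu1_dst[3*nt+1 .. 4*nt] the top-right row: 4*nt + 1 samples in all,
+       matching total_samples above. */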
+ /* If no neighbor flags are present, fill the neighbor samples with DC value */
+ if(nbr_flags == 0)
+ {
+ for(i = 0; i < total_samples; i++)
+ {
+ pu1_dst[i] = dc_val;
+ }
+ }
+ else
+ {
+ if(nt <= 8)
+ {
+ /* 1 bit extraction for all the neighboring blocks */
+ tp_left = (nbr_flags & 0x10000) >> 16;
+ bot_left = (nbr_flags & 0x8) >> 3;
+ left = (nbr_flags & 0x80) >> 7;
+ top = (nbr_flags & 0x100) >> 8;
+ tp_right = (nbr_flags & 0x1000) >> 12;
+
+            /* Fill each segment of the reference array from the
+               corresponding neighbour when it is available */
+ if(tp_left)
+ pu1_dst[two_nt] = *pu1_top_left;
+ else
+ pu1_dst[two_nt] = 0;
+
+
+ if(left)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+ }
+
+
+ if(bot_left)
+ {
+ for(i = nt; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+ }
+
+
+ if(top)
+ {
+ ihevc_memcpy(&pu1_dst[two_nt + 1], pu1_top, nt);
+ }
+ else
+ {
+ ihevc_memset(&pu1_dst[two_nt + 1], 0, nt);
+ }
+
+ if(tp_right)
+ {
+ ihevc_memcpy(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+ }
+ else
+ {
+ ihevc_memset(&pu1_dst[two_nt + 1 + nt], 0, nt);
+ }
+ next = 1;
+
+            /* If bottom-left is not available, reverse substitution process */
+ if(bot_left == 0)
+ {
+ WORD32 a_nbr_flag[5];
+ a_nbr_flag[0] = bot_left;
+ a_nbr_flag[1] = left;
+ a_nbr_flag[2] = tp_left;
+ a_nbr_flag[3] = top;
+ a_nbr_flag[4] = tp_right;
+
+ /* Check for the 1st available sample from bottom-left*/
+ while(!a_nbr_flag[next])
+ next++;
+
+ /* If Left, top-left are available*/
+ if(next <= 2)
+ {
+ idx = nt * next;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ else /* If top, top-right are available */
+ {
+                    /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
+ idx = (nt * (next - 1)) + 1;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* Forward Substitution Process */
+ /* If left is Unavailable, copy the last bottom-left value */
+ if(left == 0)
+ {
+ ihevc_memset(&pu1_dst[nt], pu1_dst[nt - 1], nt);
+
+ }
+ /* If top-left is Unavailable, copy the last left value */
+ if(tp_left == 0)
+ pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+ /* If top is Unavailable, copy the last top-left value */
+ if(top == 0)
+ {
+ ihevc_memset(&pu1_dst[two_nt + 1], pu1_dst[two_nt], nt);
+ }
+            /* If top-right is Unavailable, copy the last top value */
+ if(tp_right == 0)
+ {
+ ihevc_memset(&pu1_dst[three_nt + 1], pu1_dst[three_nt], nt);
+
+ }
+ }
+
+ if(nt == 16)
+ {
+ WORD32 nbr_flags_temp = 0;
+ nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+ + ((nbr_flags & 0x300) >> 4)
+ + ((nbr_flags & 0x3000) >> 6)
+ + ((nbr_flags & 0x10000) >> 8);
+
+        /* Fill the corresponding samples based on the nt == 16 flag fields */
+ if(nbr_flags & 0x10000)
+ pu1_dst[two_nt] = *pu1_top_left;
+ else
+ pu1_dst[two_nt] = 0;
+
+ if(nbr_flags & 0xC0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+ }
+
+ if(nbr_flags & 0xC)
+ {
+ for(i = nt; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+ }
+
+
+ if(nbr_flags & 0x300)
+ {
+ ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1], pu1_top, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt + 1], 0, nt);
+ }
+
+ if(nbr_flags & 0x3000)
+ {
+ ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt + 1 + nt], 0, nt);
+ }
+        /* Compute trailing zeros based on nbr_flag for the substitution process of below-left; see section 8.4.4.2.2 */
+ /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ nbr_id_from_bl = 32;
+
+ if(nbr_id_from_bl == 32)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags_temp >> 8) & 0x1))
+ {
+ nbr_id_from_bl++;
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ {
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T16_4NT)+1))
+ {
+ /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
+            /* Divide by 8 to obtain the original index */
+            frwd_nbr_flag = (nbr_id_from_bl >> 3);
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T16_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags_temp, 8);
+
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ ihevc_memset_mul_8(pu1_dst + nbr_id_from_bl, pu1_ref, 8);
+
+
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+ }
+
+
+ }
+
+ if(nt == 32)
+ {
+ /* Else fill the corresponding samples */
+ if(nbr_flags & 0x10000)
+ pu1_dst[two_nt] = *pu1_top_left;
+ else
+ pu1_dst[two_nt] = 0;
+
+ if(nbr_flags & 0xF0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (nt - 1)], 0, nt);
+ }
+
+ if(nbr_flags & 0xF)
+ {
+ for(i = nt; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt - 1 - (two_nt - 1)], 0, nt);
+ }
+
+
+ if(nbr_flags & 0xF00)
+ {
+ ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1], pu1_top, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt + 1], 0, nt);
+ }
+
+ if(nbr_flags & 0xF000)
+ {
+ ihevc_memcpy_mul_8(&pu1_dst[two_nt + 1 + nt], pu1_top + nt, nt);
+ }
+ else
+ {
+ ihevc_memset_mul_8(&pu1_dst[two_nt + 1 + nt], 0, nt);
+ }
+        /* Compute trailing zeros based on nbr_flags for the substitution process of below-left */
+        /* as each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right but 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags >> 16) & 0x1))
+ {
+ /* top left not available */
+ nbr_id_from_bl++;
+ /* top and top right; 8 pels per nbr bit */
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T32_4NT)+1))
+ {
+            /* Obtain the next unavailable idx flag after reverse neighbor substitution */
+            /* Divide by 8 to obtain the original index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T32_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags, 16);
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ ihevc_memset_mul_8(&pu1_dst[nbr_id_from_bl], pu1_ref, 8);
+
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+ }
+ }
+
+ }
+}
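+
+/* Illustrative walk-through of the substitution above (not part of the
+ * decoder): for nt = 32, each bit of nbr_flags[7:0] covers 8 below-left/left
+ * pels. If, say, (nbr_flags & 0xFF) == 0xFC, look_up_trailing_zeros()
+ * returns 2, so nbr_id_from_bl = 16 and the reverse substitution replicates
+ * pu1_dst[16] into pu1_dst[0..15]. The forward loop then walks the
+ * remaining flags 8 pels at a time, copying the last available pel into
+ * each unavailable 8-pel group (1 pel only at the top-left position).
+ */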
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for ref_filtering
+*
+*
+* @par Description:
+* Reference DC filtering for neighboring samples dependent on TU size and
+* mode. Refer to section 8.4.4.2.3 in the standard
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_ref_filtering(UWORD8 *pu1_src,
+ WORD32 nt,
+ UWORD8 *pu1_dst,
+ WORD32 mode,
+ WORD32 strong_intra_smoothing_enable_flag)
+{
+ WORD32 filter_flag;
+ WORD32 i; /* Generic indexing variable */
+ WORD32 four_nt = 4 * nt;
+ UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
+ WORD32 bi_linear_int_flag = 0;
+ WORD32 abs_cond_left_flag = 0;
+ WORD32 abs_cond_top_flag = 0;
+ /*WORD32 dc_val = 1 << (BIT_DEPTH - 5);*/
+ WORD32 dc_val = 1 << (8 - 5);
+ //WORD32 strong_intra_smoothing_enable_flag = 1;
+
+ filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+ if(0 == filter_flag)
+ {
+ if(pu1_src == pu1_dst)
+ {
+ return;
+ }
+ else
+ {
+ for(i = 0; i < (four_nt + 1); i++)
+ pu1_dst[i] = pu1_src[i];
+ }
+ }
+
+ else
+ {
+        /* If strong intra smoothing is enabled and transform size is 32 */
+ if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+ {
+ /* Strong Intra Filtering */
+ abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
+ - (2 * pu1_src[3 * nt]))) < dc_val;
+ abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
+ - (2 * pu1_src[nt]))) < dc_val;
+
+ bi_linear_int_flag = ((1 == abs_cond_left_flag)
+ && (1 == abs_cond_top_flag));
+ }
+ /* Extremities Untouched*/
+ au1_flt[0] = pu1_src[0];
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+
+ /* Strong filtering of reference samples */
+ if(1 == bi_linear_int_flag)
+ {
+ au1_flt[2 * nt] = pu1_src[2 * nt];
+
+ for(i = 1; i < (2 * nt); i++)
+ au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
+
+ for(i = 1; i < (2 * nt); i++)
+ au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
+
+ }
+ else
+ {
+            /* Perform default [1 2 1]/4 smoothing of the reference samples */
+ for(i = 0; i < (four_nt - 1); i++)
+ {
+ au1_flt[i + 1] = (pu1_src[i] + 2 * pu1_src[i + 1]
+ + pu1_src[i + 2] + 2) >> 2;
+ }
+ }
+
+
+ for(i = 0; i < (four_nt + 1); i++)
+ pu1_dst[i] = au1_flt[i];
+ }
+
+}
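+
+/* Worked example of the default smoothing above (illustrative): with
+ * consecutive reference samples {100, 104, 108}, the 3-tap filter gives
+ * (100 + 2 * 104 + 108 + 2) >> 2 = 104, i.e. a [1 2 1]/4 low-pass with
+ * round-to-nearest. The strong branch instead interpolates linearly
+ * between the corner samples pu1_src[0], pu1_src[2 * nt] and
+ * pu1_src[4 * nt] when both gradient conditions are below dc_val.
+ */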
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma planar
+*
+* @par Description:
+* Planar Intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.4 in the standard
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_planar(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+
+ WORD32 row, col;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+ /* Planar filtering */
+ for(row = 0; row < nt; row++)
+ {
+ for(col = 0; col < nt; col++)
+ {
+ pu1_dst[row * dst_strd + col] = ((nt - 1 - col)
+ * pu1_ref[two_nt - 1 - row]
+ + (col + 1) * pu1_ref[three_nt + 1]
+ + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
+ + (row + 1) * pu1_ref[nt - 1] + nt) >> (log2nt + 1);
+ }
+ }
+}
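+
+/* Worked example of the planar equation above (illustrative): for nt = 4
+ * and (row, col) = (0, 0), log2nt = 2, the output is
+ * (3 * pu1_ref[7] + 1 * pu1_ref[13] + 3 * pu1_ref[9] + 1 * pu1_ref[3] + 4) >> 3,
+ * a distance-weighted average of the left, top-right, top and bottom-left
+ * references with rounding offset nt.
+ */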
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+* Intraprediction for DC mode with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.5 in the standard
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_dc(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 acc_dc;
+ WORD32 dc_val, two_dc_val, three_dc_val;
+ WORD32 i;
+ WORD32 row, col;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+ UNUSED(mode);
+ UNUSED(src_strd);
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+
+ acc_dc = 0;
+ /* Calculate DC value for the transform block */
+ for(i = nt; i < two_nt; i++)
+ acc_dc += pu1_ref[i];
+
+ for(i = (two_nt + 1); i <= three_nt; i++)
+ acc_dc += pu1_ref[i];
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ two_dc_val = 2 * dc_val;
+ three_dc_val = 3 * dc_val;
+
+
+ if(nt == 32)
+ {
+ for(row = 0; row < nt; row++)
+ for(col = 0; col < nt; col++)
+ pu1_dst[(row * dst_strd) + col] = dc_val;
+ }
+ else
+ {
+ /* DC filtering for the first top row and first left column */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(col = 1; col < nt; col++)
+ pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2;
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+
+ /* Fill the remaining rows with DC value*/
+ for(row = 1; row < nt; row++)
+ for(col = 1; col < nt; col++)
+ pu1_dst[(row * dst_strd) + col] = dc_val;
+ }
+}
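+
+/* Illustrative check of the DC path above: for nt = 4 with all 2 * nt
+ * contributing reference samples equal to 100, acc_dc = 800 and
+ * dc_val = (800 + 4) >> 3 = 100. The first row and column are then
+ * smoothed towards the references, e.g.
+ * pu1_dst[0] = (100 + 2 * 100 + 100 + 2) >> 2 = 100.
+ */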
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for the luma horizontal mode.
+*
+* @par Description:
+* Horizontal intraprediction (mode 10) with reference samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.6 in the standard (special case)
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_horz(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 two_nt;
+ WORD16 s2_predpixel;
+ UNUSED(mode);
+ UNUSED(src_strd);
+ two_nt = 2 * nt;
+
+ if(nt == 32)
+ {
+ for(row = 0; row < nt; row++)
+ for(col = 0; col < nt; col++)
+ pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
+ }
+ else
+ {
+ /*Filtering done for the 1st row */
+ for(col = 0; col < nt; col++)
+ {
+ s2_predpixel = pu1_ref[two_nt - 1]
+ + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
+ pu1_dst[col] = CLIP_U8(s2_predpixel);
+ }
+
+ /* Replication to next rows*/
+ for(row = 1; row < nt; row++)
+ for(col = 0; col < nt; col++)
+ pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
+ }
+}
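+
+/* Illustrative example of the first-row filtering above: with
+ * pu1_ref[two_nt - 1] = 100 (left), pu1_ref[two_nt] = 96 (top-left) and
+ * pu1_ref[two_nt + 1 + col] = 104 (top), the prediction is
+ * 100 + ((104 - 96) >> 1) = 104, clipped to [0, 255] by CLIP_U8. The
+ * vertical mode below applies the same gradient filter along the first
+ * column.
+ */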
+
+
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for the luma vertical mode.
+*
+* @par Description:
+* Vertical intraprediction (mode 26) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard (special case)
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_ver(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD16 s2_predpixel;
+ WORD32 two_nt = 2 * nt;
+ UNUSED(mode);
+ UNUSED(src_strd);
+
+ if(nt == 32)
+ {
+ /* Replication to next columns*/
+ for(row = 0; row < nt; row++)
+ for(col = 0; col < nt; col++)
+ pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];
+ }
+ else
+ {
+ /*Filtering done for the 1st column */
+ for(row = 0; row < nt; row++)
+ {
+ s2_predpixel = pu1_ref[two_nt + 1]
+ + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+ pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
+ }
+
+ /* Replication to next columns*/
+ for(row = 0; row < nt; row++)
+ for(col = 1; col < nt; col++)
+ pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];
+ }
+}
+
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode2.
+*
+* @par Description:
+* Intraprediction for mode 2 (SW angle) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode2(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt = 2 * nt;
+ WORD32 intra_pred_ang = 32;
+ WORD32 idx = 0;
+ UNUSED(mode);
+ UNUSED(src_strd);
+    /* For the 45-degree angle, samples are replicated along the corresponding diagonal */
+    /* intra_pred_ang = tan(angle) in Q5 format */
+ for(col = 0; col < nt; col++)
+ {
+ idx = ((col + 1) * intra_pred_ang) >> 5; /* Use idx++ */
+
+ for(row = 0; row < nt; row++)
+ pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt - row - idx - 1];
+ }
+
+}
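+
+/* Note (illustrative): with intra_pred_ang = 32, ((col + 1) * 32) >> 5
+ * reduces to col + 1, so each destination sample is a pure copy of
+ * pu1_ref[two_nt - row - col - 2]; no fractional interpolation is needed
+ * for the exact 45-degree angle.
+ */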
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 18 & mode 34.
+*
+* @par Description:
+* Intraprediction for mode 34 (NE angle) and mode 18 (NW angle) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_18_34(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 intra_pred_ang;
+ WORD32 idx = 0;
+ WORD32 two_nt = 2 * nt;
+ UNUSED(src_strd);
+ intra_pred_ang = 32; /*Default value*/
+
+    /* For mode 18, the angle is -45 degrees */
+    if(mode == 18)
+        intra_pred_ang = -32;
+    /* For mode 34, the angle is 45 degrees */
+    else if(mode == 34)
+        intra_pred_ang = 32;
+    /* For the angles 45 and -45, samples are replicated along the corresponding diagonal */
+    /* No interpolation is done for 45 degrees */
+ for(row = 0; row < nt; row++)
+ {
+ idx = ((row + 1) * intra_pred_ang) >> 5;
+#if OPT
+ if(mode == 18)
+ idx--;
+ if(mode == 34)
+ idx++;
+#endif
+ for(col = 0; col < nt; col++)
+ pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
+
+ }
+
+}
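+
+/* Note (illustrative): since |intra_pred_ang| = 32 here, idx = +/-(row + 1)
+ * exactly, so every sample is a pure copy of
+ * pu1_ref[two_nt + col + idx + 1]; mode 18 steps the reference index
+ * backwards, mode 34 forwards, again with no interpolation.
+ */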
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 3 to mode 9
+*
+* @par Description:
+* Intraprediction for modes 3 to 9 (positive angle, horizontal modes) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_3_to_9(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt = 2 * nt;
+ WORD32 intra_pred_ang;
+ WORD32 idx, ref_main_idx;
+ WORD32 pos, fract;
+ UNUSED(src_strd);
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain the destination sample */
+
+ for(col = 0; col < nt; col++)
+ {
+ pos = ((col + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+
+ // Do linear filtering
+ for(row = 0; row < nt; row++)
+ {
+ ref_main_idx = two_nt - row - idx - 1;
+ pu1_dst[col + (row * dst_strd)] = (((32 - fract)
+ * pu1_ref[ref_main_idx]
+ + fract * pu1_ref[ref_main_idx - 1] + 16) >> 5);
+ }
+
+ }
+
+}
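+
+/* Worked example of the fractional blend above (illustrative): mode 3 has
+ * intra_pred_ang = 26 per the HEVC angle table, so for col = 0, pos = 26,
+ * idx = 0 and fract = 26, and the output is
+ * (6 * pu1_ref[ref_main_idx] + 26 * pu1_ref[ref_main_idx - 1] + 16) >> 5.
+ */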
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 11 to mode 17
+*
+* @par Description:
+* Intraprediction for modes 11 to 17 (negative angle, horizontal modes)
+* with reference neighboring samples location pointed by 'pu1_ref' to the
+* TU block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_11_to_17(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+    /* This function and ihevc_intra_pred_luma_mode_19_to_25 are the same  */
+    /* except for the ref main & side sample assignment; they can be       */
+    /* combined for optimization */
+
+ WORD32 row, col, k;
+ WORD32 two_nt;
+ WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+ WORD32 idx, ref_main_idx, ref_idx;
+ WORD32 pos, fract;
+
+ UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
+ UWORD8 *ref_main;
+ UNUSED(src_strd);
+ inv_ang_sum = 128;
+ two_nt = 2 * nt;
+
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+ /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+ /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+ ref_main = ref_temp + nt - 1;
+ for(k = 0; k < nt + 1; k++)
+ ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
+
+ ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+ }
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain the destination sample */
+ for(col = 0; col < nt; col++)
+ {
+ pos = ((col + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+
+ // Do linear filtering
+ for(row = 0; row < nt; row++)
+ {
+ ref_main_idx = row + idx + 1;
+ pu1_dst[col + (dst_strd * row)] = (UWORD8)(((32 - fract)
+ * ref_main[ref_main_idx]
+ + fract * ref_main[ref_main_idx + 1] + 16) >> 5);
+
+ }
+
+ }
+
+}
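+
+/* Note on the projection loop above (illustrative): inv_ang_sum starts at
+ * 128 so that (inv_ang_sum >> 8) rounds the inverse-angle accumulation to
+ * the nearest side sample. Assuming inv_ang = 4096 (mode 11, angle -2),
+ * the first projected sample is ref_main[-1] = pu1_ref[two_nt + 16].
+ */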
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 19 to mode 25
+*
+* @par Description:
+* Intraprediction for modes 19 to 25 (negative angle, vertical modes) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_19_to_25(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col, k;
+ WORD32 two_nt, intra_pred_ang, idx;
+ WORD32 inv_ang, inv_ang_sum, pos, fract;
+ WORD32 ref_main_idx, ref_idx;
+ UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
+ UWORD8 *ref_main;
+ UNUSED(src_strd);
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+ /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_main = ref_temp + nt - 1;
+ for(k = 0; k < (nt + 1); k++)
+ ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
+
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ inv_ang_sum = 128;
+
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the side */
+    /* reference samples; refer to section 8.4.4.2.6 */
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+ }
+
+ for(row = 0; row < nt; row++)
+ {
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+
+ // Do linear filtering
+ for(col = 0; col < nt; col++)
+ {
+ ref_main_idx = col + idx + 1;
+ pu1_dst[(row * dst_strd) + col] = (UWORD8)(((32 - fract)
+ * ref_main[ref_main_idx]
+ + fract * ref_main[ref_main_idx + 1] + 16) >> 5);
+
+ }
+
+ }
+
+}
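+
+/* Note (illustrative): this is the vertical-mode mirror of
+ * ihevc_intra_pred_luma_mode_11_to_17; the projection reads the left
+ * references via pu1_ref[two_nt - (inv_ang_sum >> 8)] and the row/column
+ * roles in the interpolation loop are swapped.
+ */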
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 27 to mode 33
+*
+* @par Description:
+* Intraprediction for modes 27 to 33 (positive angle, vertical modes) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_27_to_33(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt, pos, fract;
+ WORD32 intra_pred_ang;
+ WORD32 idx, ref_main_idx;
+ UNUSED(src_strd);
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ for(row = 0; row < nt; row++)
+ {
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+
+ // Do linear filtering
+ for(col = 0; col < nt; col++)
+ {
+ ref_main_idx = two_nt + col + idx + 1;
+ pu1_dst[col + (row * dst_strd)] = (((32 - fract)
+ * pu1_ref[ref_main_idx]
+ + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5);
+ }
+
+ }
+
+}
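+
+/* Note (illustrative): for the positive vertical angles (modes 27 to 33)
+ * all references come from the top row, so no side-sample projection is
+ * needed; the fractional blend matches the mode 3-to-9 case with rows and
+ * columns swapped.
+ */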
+
diff --git a/common/ihevc_iquant_itrans_recon.c b/common/ihevc_iquant_itrans_recon.c
new file mode 100644
index 0000000..249aa56
--- /dev/null
+++ b/common/ihevc_iquant_itrans_recon.c
@@ -0,0 +1,456 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_iquant_itrans_recon.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_iquant_itrans_recon_4x4_ttype1()
+ * - ihevc_iquant_itrans_recon_4x4()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc_itrans.c and modified to */
+/* include reconstruction */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform
+ * type1(DST) and reconstruction for 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse quantization, inverse transform type 1, adds the
+ *  prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ * Zero Rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_itrans_recon_4x4_ttype1(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ UNUSED(zero_rows);
+ /* Inverse Quant and Inverse Transform and Reconstruction */
+ {
+ WORD32 i, c[4];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 2;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_4;
+ pi2_tmp_orig = pi2_tmp;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ WORD32 iq_tmp_1, iq_tmp_2, iq_tmp_3;
+ // Intermediate Variables
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[0 * src_strd],
+ pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[2 * src_strd],
+ pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ c[0] = iq_tmp_1 + iq_tmp_2;
+
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[2 * src_strd],
+ pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[3 * src_strd],
+ pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ c[1] = iq_tmp_1 + iq_tmp_2;
+
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[0 * src_strd],
+ pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[3 * src_strd],
+ pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ c[2] = iq_tmp_1 - iq_tmp_2;
+
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[1 * src_strd],
+ pi2_dequant_coeff[1 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ c[3] = 74 * iq_tmp_1;
+
+ pi2_tmp[0] =
+ CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+ pi2_tmp[1] =
+ CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[0 * src_strd],
+ pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[2 * src_strd],
+ pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_3,
+ pi2_src[3 * src_strd],
+ pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+
+ pi2_tmp[2] =
+ CLIP_S16((74 * (iq_tmp_1 - iq_tmp_2 + iq_tmp_3) + add) >> shift);
+ pi2_tmp[3] =
+ CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ for(i = 0; i < trans_size; i++)
+ {
+ WORD32 itrans_out;
+
+ // Intermediate Variables
+ c[0] = pi2_tmp[0] + pi2_tmp[2 * trans_size];
+ c[1] = pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size];
+ c[2] = pi2_tmp[0] - pi2_tmp[3 * trans_size];
+ c[3] = 74 * pi2_tmp[trans_size];
+
+ itrans_out =
+ CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+ pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+
+ itrans_out =
+ CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+ pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+
+ itrans_out =
+ CLIP_S16((74 * (pi2_tmp[0] - pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size]) + add) >> shift);
+ pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+
+ itrans_out =
+ CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+ pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+}
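+
+/* Note on zero_cols (illustrative): bit i of zero_cols marks column i of
+ * pi2_src as all-zero. For example, zero_cols = 0xE lets the first stage
+ * memset columns 1..3 of the intermediate buffer and dequantize only
+ * column 0, a common case for sparse residual blocks. */
+#if 0
+/* Usage sketch (illustrative, compiled out): invoking the 4x4 DST path on
+ * a block whose columns 1..3 are known to be zero. Buffer contents and the
+ * dequant matrix are caller-supplied; the names here are hypothetical. */
+{
+    WORD16 ai2_coeffs[4 * 4];          /* dequantizer input, raster order  */
+    WORD16 ai2_tmp[4 * 4];             /* first-stage scratch              */
+    UWORD8 au1_pred[4 * 4];            /* intra/inter prediction block     */
+    UWORD8 au1_out[4 * 4];             /* reconstructed output             */
+    WORD16 ai2_dequant[4 * 4];         /* scaling matrix                   */
+
+    ihevc_iquant_itrans_recon_4x4_ttype1(ai2_coeffs, ai2_tmp, au1_pred,
+                                         ai2_dequant, au1_out,
+                                         5, 2,    /* qp = 32: div 6, mod 6 */
+                                         4, 4, 4, /* src/pred/dst strides  */
+                                         0xE,     /* columns 1..3 all zero */
+                                         0);      /* zero_rows unused here */
+}
+#endif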
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform and
+ * reconstruction for 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse quantization, inverse transform, adds the
+ *  prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ * Zero Rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_itrans_recon_4x4(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ UNUSED(zero_rows);
+ /* Inverse Transform */
+ {
+ WORD32 j;
+ WORD32 e[2], o[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 2;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_4;
+ pi2_tmp_orig = pi2_tmp;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ WORD32 iq_tmp_1, iq_tmp_2;
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[1 * src_strd],
+ pi2_dequant_coeff[1 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[3 * src_strd],
+ pi2_dequant_coeff[3 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+
+ o[0] = g_ai2_ihevc_trans_4[1][0] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[3][0] * iq_tmp_2;
+ o[1] = g_ai2_ihevc_trans_4[1][1] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[3][1] * iq_tmp_2;
+
+ IQUANT_4x4(iq_tmp_1,
+ pi2_src[0 * src_strd],
+ pi2_dequant_coeff[0 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ IQUANT_4x4(iq_tmp_2,
+ pi2_src[2 * src_strd],
+ pi2_dequant_coeff[2 * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+
+ e[0] = g_ai2_ihevc_trans_4[0][0] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[2][0] * iq_tmp_2;
+ e[1] = g_ai2_ihevc_trans_4[0][1] * iq_tmp_1
+ + g_ai2_ihevc_trans_4[2][1] * iq_tmp_2;
+
+ pi2_tmp[0] =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pi2_tmp[1] =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pi2_tmp[2] =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pi2_tmp[3] =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 itrans_out;
+
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][0]
+ * pi2_tmp[3 * trans_size];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][1]
+ * pi2_tmp[3 * trans_size];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][0]
+ * pi2_tmp[2 * trans_size];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][1]
+ * pi2_tmp[2 * trans_size];
+
+ itrans_out =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+
+ itrans_out =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+
+ itrans_out =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+
+ itrans_out =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+ pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+
+ }
+ }
+}
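+
+/* Note (illustrative): the even/odd (e[] / o[]) split above is the standard
+ * 4-point partial butterfly: outputs 0 and 3 are e[0] +/- o[0], outputs 1
+ * and 2 are e[1] +/- o[1], halving the multiply count compared with a
+ * direct 4x4 matrix product.
+ */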
diff --git a/common/ihevc_iquant_itrans_recon.h b/common/ihevc_iquant_itrans_recon.h
new file mode 100644
index 0000000..33055b4
--- /dev/null
+++ b/common/ihevc_iquant_itrans_recon.h
@@ -0,0 +1,197 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_iquant_itrans_recon.h
+*
+* @brief
+* Functions declarations for inverse quantization, inverse transform and
+* reconstruction
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_IQUANT_ITRANS_RECON_H_
+#define _IHEVC_IQUANT_ITRANS_RECON_H_
+
+typedef void ihevc_iquant_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+typedef void ihevc_iquant_itrans_recon_32x32_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void ihevc_hbd_iquant_itrans_recon_32x32_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+ihevc_iquant_itrans_recon_4x4_ttype1_ft ihevc_iquant_itrans_recon_4x4_ttype1;
+ihevc_hbd_iquant_itrans_recon_4x4_ttype1_ft ihevc_hbd_iquant_itrans_recon_4x4_ttype1;
+ihevc_iquant_itrans_recon_4x4_ft ihevc_iquant_itrans_recon_4x4;
+ihevc_hbd_iquant_itrans_recon_4x4_ft ihevc_hbd_iquant_itrans_recon_4x4;
+ihevc_iquant_itrans_recon_8x8_ft ihevc_iquant_itrans_recon_8x8;
+ihevc_hbd_iquant_itrans_recon_8x8_ft ihevc_hbd_iquant_itrans_recon_8x8;
+ihevc_iquant_itrans_recon_16x16_ft ihevc_iquant_itrans_recon_16x16;
+ihevc_hbd_iquant_itrans_recon_16x16_ft ihevc_hbd_iquant_itrans_recon_16x16;
+ihevc_iquant_itrans_recon_32x32_ft ihevc_iquant_itrans_recon_32x32;
+ihevc_hbd_iquant_itrans_recon_32x32_ft ihevc_hbd_iquant_itrans_recon_32x32;
+
+ihevc_iquant_itrans_recon_4x4_ttype1_ft ihevc_iquant_itrans_recon_4x4_ttype1_sse42;
+ihevc_iquant_itrans_recon_4x4_ft ihevc_iquant_itrans_recon_4x4_sse42;
+ihevc_iquant_itrans_recon_8x8_ft ihevc_iquant_itrans_recon_8x8_sse42;
+ihevc_iquant_itrans_recon_16x16_ft ihevc_iquant_itrans_recon_16x16_sse42;
+ihevc_iquant_itrans_recon_32x32_ft ihevc_iquant_itrans_recon_32x32_sse42;
+
+ihevc_hbd_iquant_itrans_recon_4x4_ttype1_ft ihevc_hbd_iquant_itrans_recon_4x4_ttype1_sse42;
+ihevc_hbd_iquant_itrans_recon_4x4_ft ihevc_hbd_iquant_itrans_recon_4x4_sse42;
+ihevc_hbd_iquant_itrans_recon_8x8_ft ihevc_hbd_iquant_itrans_recon_8x8_sse42;
+ihevc_hbd_iquant_itrans_recon_16x16_ft ihevc_hbd_iquant_itrans_recon_16x16_sse42;
+ihevc_hbd_iquant_itrans_recon_32x32_ft ihevc_hbd_iquant_itrans_recon_32x32_sse42;
+#endif /*_IHEVC_IQUANT_ITRANS_RECON_H_*/
+
diff --git a/common/ihevc_iquant_recon.c b/common/ihevc_iquant_recon.c
new file mode 100644
index 0000000..de5ff53
--- /dev/null
+++ b/common/ihevc_iquant_recon.c
@@ -0,0 +1,612 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_iquant_recon.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization and
+ * reconstruction
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_iquant_recon_4x4_ttype1()
+ * - ihevc_iquant_recon_4x4()
+ * - ihevc_iquant_recon_8x8()
+ * - ihevc_iquant_recon_16x16()
+ * - ihevc_iquant_recon_32x32()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc_iquant_itrans_recon.c and modified to */
+/* omit the inverse transform stage */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse quantization and reconstruction for a
+ *  4x4 transform-type-1 input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 4x4
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_4x4_ttype1(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 2;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_4;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT_4x4(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred++;
+ pu1_dst++;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
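+
+/* Note (illustrative): this path has no inverse transform;
+ * (iquant_out + 16) >> 5 is a round-to-nearest divide by 32 that scales
+ * the dequantized coefficient into the pixel domain before the prediction
+ * is added and CLIP_U8 clamps the result. The plain 4x4, 8x8, 16x16 and
+ * 32x32 variants below follow the identical structure.
+ */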
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 4x4
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_4x4(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 2;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_4;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT_4x4(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred++;
+ pu1_dst++;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 8x8
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 8x8
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_8x8(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 3;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_8;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred++;
+ pu1_dst++;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 16x16
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_16x16(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 4;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_16;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred++;
+ pu1_dst++;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization and reconstruction for 32x32
+ * input block
+ *
+ * @par Description:
+ * This function performs inverse quantization and reconstruction for 32x32
+ * input block
+ *
+ * @param[in] pi2_src
+ * Input 32x32 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 32x32 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 32x32 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_iquant_recon_32x32(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+
+ {
+ /* Inverse Quant and recon */
+ {
+ WORD32 i, j;
+ WORD32 shift_iq;
+ WORD32 trans_size;
+ /* Inverse Quantization constants */
+ {
+ WORD32 log2_trans_size, bit_depth;
+
+ log2_trans_size = 5;
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+
+ trans_size = TRANS_SIZE_32;
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 iquant_out;
+ IQUANT(iquant_out,
+ pi2_src[j * src_strd],
+ pi2_dequant_coeff[j * trans_size] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ iquant_out = (iquant_out + 16) >> 5;
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(iquant_out + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pi2_dequant_coeff++;
+ pu1_pred++;
+ pu1_dst++;
+
+ zero_cols = zero_cols >> 1;
+ }
+ }
+ }
+}
+
diff --git a/common/ihevc_iquant_recon.h b/common/ihevc_iquant_recon.h
new file mode 100644
index 0000000..c732b04
--- /dev/null
+++ b/common/ihevc_iquant_recon.h
@@ -0,0 +1,154 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_iquant_recon.h
+*
+* @brief
+* Functions declarations for inverse quantization and reconstruction
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_IQUANT_RECON_H_
+#define _IHEVC_IQUANT_RECON_H_
+
+typedef void ihevc_iquant_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_iquant_recon_32x32_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD8 *pu1_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_iquant_recon_32x32_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ WORD16 *pi2_dequant_coeff,
+ UWORD16 *pu2_dst,
+ WORD32 qp_div, /* qpscaled / 6 */
+ WORD32 qp_rem, /* qpscaled % 6 */
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+
+ihevc_iquant_recon_4x4_ttype1_ft ihevc_iquant_recon_4x4_ttype1;
+ihevc_hbd_iquant_recon_4x4_ttype1_ft ihevc_hbd_iquant_recon_4x4_ttype1;
+ihevc_iquant_recon_4x4_ft ihevc_iquant_recon_4x4;
+ihevc_hbd_iquant_recon_4x4_ft ihevc_hbd_iquant_recon_4x4;
+ihevc_iquant_recon_8x8_ft ihevc_iquant_recon_8x8;
+ihevc_hbd_iquant_recon_8x8_ft ihevc_hbd_iquant_recon_8x8;
+ihevc_iquant_recon_16x16_ft ihevc_iquant_recon_16x16;
+ihevc_hbd_iquant_recon_16x16_ft ihevc_hbd_iquant_recon_16x16;
+ihevc_iquant_recon_32x32_ft ihevc_iquant_recon_32x32;
+ihevc_hbd_iquant_recon_32x32_ft ihevc_hbd_iquant_recon_32x32;
+
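+/* Usage sketch (illustrative only; the pointer name below is hypothetical): */
+/* the *_ft typedefs above are function types, so a decoder can select the   */
+/* generic C routine or a SIMD variant of each kernel through one function   */
+/* pointer at init time, e.g.:                                               */
+/*                                                                           */
+/*   ihevc_iquant_recon_32x32_ft *pf_iq_recon_32 = ihevc_iquant_recon_32x32; */
+/*   pf_iq_recon_32(pi2_src, pu1_pred, pi2_dequant_coeff, pu1_dst,           */
+/*                  qp / 6, qp % 6, src_strd, pred_strd, dst_strd,           */
+/*                  zero_cols);                                              */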
+#endif /*_IHEVC_IQUANT_RECON_H_*/
diff --git a/common/ihevc_itrans.c b/common/ihevc_itrans.c
new file mode 100644
index 0000000..741c2ab
--- /dev/null
+++ b/common/ihevc_itrans.c
@@ -0,0 +1,974 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans.c
+ *
+ * @brief
+ * Contains function definitions for single stage inverse transform
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_4x4_ttype1()
+ * - ihevc_itrans_4x4()
+ * - ihevc_itrans_8x8()
+ * - ihevc_itrans_16x16()
+ * - ihevc_itrans_32x32()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#define NON_OPTIMIZED 1
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Single stage Inverse transform type 1 (DST) for
+ * 4x4 input block
+ *
+ * @par Description:
+ *  Performs single stage 4x4 inverse transform type 1 by utilizing the
+ *  symmetry of the transformation matrix and reducing the number of
+ *  multiplications wherever possible, while keeping the number of operations
+ *  (addition, multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[out] pi2_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] i4_shift
+ * Output shift
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_4x4_ttype1(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 i, c[4];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
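+    /***************************************************************************/
+    /* DST Matrix 4x4 (type 1 transform; values per the HEVC spec)            */
+    /*        0    1    2    3                                                 */
+    /*   0 { 29,  55,  74,  84},                                               */
+    /*   1 { 74,  74,   0, -74},                                               */
+    /*   2 { 84, -29, -74,  55},                                               */
+    /*   3 { 55, -84,  74, -29}                                                */
+    /* The inverse applied below uses the transpose of this matrix            */
+    /***************************************************************************/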
+ for(i = 0; i < TRANS_SIZE_4; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
+ }
+ else
+ {
+ // Intermediate Variables
+ c[0] = pi2_src[0] + pi2_src[2 * src_strd];
+ c[1] = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
+ c[2] = pi2_src[0] - pi2_src[3 * src_strd];
+ c[3] = 74 * pi2_src[src_strd];
+
+ pi2_dst[0] =
+ CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> i4_shift);
+ pi2_dst[1] =
+ CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> i4_shift);
+ pi2_dst[2] =
+ CLIP_S16((74 * (pi2_src[0] - pi2_src[2 * src_strd] + pi2_src[3 * src_strd]) + add) >> i4_shift);
+ pi2_dst[3] =
+ CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> i4_shift);
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Single stage Inverse transform for 4x4 input
+ * block
+ *
+ * @par Description:
+ *  Performs single stage 4x4 inverse transform by utilizing the symmetry of
+ *  the transformation matrix and reducing the number of multiplications
+ *  wherever possible, while keeping the number of operations (addition,
+ *  multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[out] pi2_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] i4_shift
+ * Output shift
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#if NON_OPTIMIZED
+void ihevc_itrans_4x4(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 j;
+ WORD32 e[2], o[2];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
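+    /* Even/odd (butterfly) split used below, written out with the matrix    */
+    /* values for clarity (g_ai2_ihevc_trans_4 rows: {64,64,64,64},          */
+    /* {83,36,-36,-83}, {64,-64,-64,64}, {36,-83,83,-36}):                   */
+    /*   e[0] = 64*s0 + 64*s2;   e[1] = 64*s0 - 64*s2;                       */
+    /*   o[0] = 83*s1 + 36*s3;   o[1] = 36*s1 - 83*s3;                       */
+    /*   d0 = e0 + o0;  d1 = e1 + o1;  d2 = e1 - o1;  d3 = e0 - o0;          */
+    /* i.e. 8 multiplies instead of the 16 of a direct matrix multiply       */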
+ for(j = 0; j < TRANS_SIZE_4; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
+ }
+ else
+ {
+
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
+
+ pi2_dst[0] =
+ CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
+ pi2_dst[1] =
+ CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
+ pi2_dst[2] =
+ CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
+ pi2_dst[3] =
+ CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
+
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+#else
+void ihevc_itrans_4x4(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 j;
+ WORD32 e[2], o[2];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
+ /***************************************************************************/
+ /* Transform Matrix 4x4 */
+ /* 0 1 2 3 */
+ /* 0 { 64, 64, 64, 64}, */
+ /* 1 { 83, 36,-36,-83}, */
+ /* 2 { 64,-64,-64, 64}, */
+ /* 3 { 36,-83, 83,-36} */
+ /***************************************************************************/
+
+ for(j = 0; j < TRANS_SIZE_4; j++)
+ {
+ WORD32 temp;
+
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
+ }
+ else
+ {
+ /* Common operation in o[0] and o[1] */
+ temp = (pi2_src[src_strd] + pi2_src[3 * src_strd]) * 36;
+
+ o[0] = temp + 47 * pi2_src[src_strd];
+ o[1] = temp - 119 * pi2_src[3 * src_strd];
+ e[0] = (pi2_src[0] + pi2_src[2 * src_strd]) << 6;
+ e[1] = (pi2_src[0] - pi2_src[2 * src_strd]) << 6;
+
+ pi2_dst[0] =
+ CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
+ pi2_dst[1] =
+ CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
+ pi2_dst[2] =
+ CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
+ pi2_dst[3] =
+ CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+#endif
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Single stage Inverse transform for 8x8 input
+ * block
+ *
+ * @par Description:
+ *  Performs single stage 8x8 inverse transform by utilizing the symmetry of
+ *  the transformation matrix and reducing the number of multiplications
+ *  wherever possible, while keeping the number of operations (addition,
+ *  multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[out] pi2_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] i4_shift
+ * Output shift
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#if NON_OPTIMIZED
+void ihevc_itrans_8x8(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 j, k;
+ WORD32 e[4], o[4];
+ WORD32 ee[2], eo[2];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
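+    /* The 8-point inverse follows the same even/odd hierarchy as the 4-point */
+    /* case: odd rows (1,3,5,7) feed o[0..3], the even rows split again into  */
+    /* ee[] (rows 0,4) and eo[] (rows 2,6), and the outputs come from         */
+    /* butterflies e[k] + o[k] and e[3-k] - o[3-k], cutting the 64 multiplies */
+    /* of a direct matrix product down to 24                                  */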
+ for(j = 0; j < TRANS_SIZE_8; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_8[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_8[7][k]
+ * pi2_src[7 * src_strd];
+ }
+
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ pi2_dst[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+ pi2_dst[k + 4] =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
+ }
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+#else
+void ihevc_itrans_8x8(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ /* Transform Matrix 8x8 */
+ /* 0 1 2 3 4 5 6 7 */
+ /* 0 - 64 64 64 64 64 64 64 64 */
+ /* 1 - 89 75 50 18 -18 -50 -75 -89 */
+ /* 2 - 83 36 -36 -83 -83 -36 36 83 */
+ /* 3 - 75 -18 -89 -50 50 89 18 -75 */
+ /* 4 - 64 -64 -64 64 64 -64 -64 64 */
+ /* 5 - 50 -89 18 75 -75 -18 89 -50 */
+ /* 6 - 36 -83 83 -36 -36 83 -83 36 */
+ /* 7 - 18 -50 75 -89 89 -75 50 -18 */
+
+    /* 0th and 4th rows need no multiplications */
+    /* 2nd and 6th rows have only two coefficient multiplies each */
+    /* 1st, 3rd, 5th and 7th rows have odd-part (o[]) mirror symmetry */
+ WORD32 j, k;
+ WORD32 temp1, temp2;
+ WORD32 e[4], o[4];
+ WORD32 ee[2], eo[2];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
+ for(j = 0; j < TRANS_SIZE_8; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
+ }
+ else
+ {
+
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ /*
+ o[0] = 89 *pi2_src[8] + 75 *pi2_src[3*8] + 50 *pi2_src[5*8] + 18 *pi2_src[7*8];
+ o[1] = 75 *pi2_src[8] + -18 *pi2_src[3*8] + -89 *pi2_src[5*8] + -50 *pi2_src[7*8];
+ o[2] = 50 *pi2_src[8] + -89 *pi2_src[3*8] + 18 *pi2_src[5*8] + 75 *pi2_src[7*8];
+ o[3] = 18 *pi2_src[8] + -50 *pi2_src[3*8] + 75 *pi2_src[5*8] + -89 *pi2_src[7*8];
+ */
+
+ /* Optimization: 4 mul + 2 add ---> 3 mul + 3 add */
+ /*
+ temp1 = (pi2_src[8 ] + pi2_src[3*8]) * 75;
+ temp2 = (pi2_src[5*8] + pi2_src[7*8]) * 50;
+
+ o[0] = temp1 + 14 * pi2_src[8 ] + temp2 - 32 * pi2_src[7*8];
+ o[1] = temp1 - 93 * pi2_src[3*8] - temp2 - 39 * pi2_src[5*8];
+ */
+
+ temp1 = (pi2_src[src_strd] + pi2_src[3 * src_strd]) * 75;
+ temp2 = (pi2_src[5 * src_strd] + pi2_src[7 * src_strd]) * 50;
+
+ o[0] = temp1 + 14 * pi2_src[src_strd] + temp2
+ - (pi2_src[7 * src_strd] << 5);
+ o[1] = temp1 - 93 * pi2_src[3 * src_strd] - temp2
+ - 39 * pi2_src[5 * src_strd];
+
+ /* Optimization: 4 mul + 2 add ---> 3 mul + 3 add */
+ /*
+ temp1 = (pi2_src[8 ] - pi2_src[3*8]) * 50;
+ temp2 = (pi2_src[5*8] + pi2_src[7*8]) * 75;
+
+ o[2] = temp1 - 39 * pi2_src[3*8] + temp2 - 57 * pi2_src[5*8];
+ o[3] = temp1 - 32 * pi2_src[8 ] + temp2 - 164 * pi2_src[7*8];
+ */
+
+ temp1 = (pi2_src[src_strd] - pi2_src[3 * src_strd]) * 50;
+ temp2 = (pi2_src[5 * src_strd] + pi2_src[7 * src_strd]) * 75;
+
+ o[2] = temp1 - 39 * pi2_src[3 * src_strd] + temp2
+ - 57 * pi2_src[5 * src_strd];
+ o[3] = temp1 - (pi2_src[src_strd] << 5) + temp2
+ - 164 * pi2_src[7 * src_strd];
+
+ /*
+ eo[0] = 83 *pi2_src[ 2*8 ] + 36 *pi2_src[ 6*8 ];
+ eo[1] = 36 *pi2_src[ 2*8 ] + -83 *pi2_src[ 6*8 ];
+ ee[0] = 64 *pi2_src[ 0 ] + 64 *pi2_src[ 4*8 ];
+ ee[1] = 64 *pi2_src[ 0 ] + -64 *pi2_src[ 4*8 ];
+ */
+
+ /* Optimization: 4 mul + 2 add ---> 3 mul + 3 add */
+ temp1 = (pi2_src[2 * src_strd] + pi2_src[6 * src_strd]) * 36;
+ eo[0] = temp1 + 47 * pi2_src[2 * src_strd];
+ eo[1] = temp1 - 119 * pi2_src[6 * src_strd];
+
+ /* Optimization: 4 mul + 2 add ---> 2 i4_shift + 2 add */
+ ee[0] = (pi2_src[0] + pi2_src[4 * src_strd]) << 6;
+ ee[1] = (pi2_src[0] - pi2_src[4 * src_strd]) << 6;
+
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+
+ for(k = 0; k < 4; k++)
+ {
+ pi2_dst[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+ pi2_dst[k + 4] =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
+ }
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+
+}
+#endif
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Single stage Inverse transform for 16x16 input
+ * block
+ *
+ * @par Description:
+ *  Performs single stage 16x16 inverse transform by utilizing the symmetry
+ *  of the transformation matrix and reducing the number of multiplications
+ *  wherever possible, while keeping the number of operations (addition,
+ *  multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[out] pi2_dst
+ * Output 16x16 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] i4_shift
+ * Output shift
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#if NON_OPTIMIZED
+void ihevc_itrans_16x16(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 j, k;
+ WORD32 e[8], o[8];
+ WORD32 ee[4], eo[4];
+ WORD32 eee[2], eeo[2];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
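+    /* zero_cols is a bitmask over the 16 input columns: bit i set means     */
+    /* column i of pi2_src is all zero, so the loop below only zeroes the    */
+    /* corresponding output instead of transforming it (e.g. zero_cols ==    */
+    /* 0xFFFE would leave just column 0 to be computed)                      */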
+ for(j = 0; j < TRANS_SIZE_16; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_src[7 * src_strd]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_src[9 * src_strd]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_src[11 * src_strd]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_src[13 * src_strd]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_src[15 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_src[6 * src_strd]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_src[10 * src_strd]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_src[14 * src_strd];
+ }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][0] * pi2_src[12 * src_strd];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
+                                + g_ai2_ihevc_trans_16[8][0] * pi2_src[8 * src_strd];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
+                                + g_ai2_ihevc_trans_16[12][1] * pi2_src[12 * src_strd];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
+                                + g_ai2_ihevc_trans_16[8][1] * pi2_src[8 * src_strd];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_dst[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+ pi2_dst[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
+ }
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+#else
+void ihevc_itrans_16x16(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 j, k;
+ WORD32 e[8], o[8];
+ WORD32 ee[4], eo[4];
+ WORD32 eee[2], eeo[2];
+ WORD32 add;
+ WORD32 temp1, temp2;
+
+ add = 1 << (i4_shift - 1);
+ /***************************************************************************/
+ /* Transform Matrix 16x16 */
+ /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+ /* 0 { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, */
+ /* 1 { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90}, */
+ /* 2 { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89}, */
+ /* 3 { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87}, */
+ /* 4 { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83}, */
+ /* 5 { 80, 9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80}, */
+ /* 6 { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75}, */
+ /* 7 { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70}, */
+ /* 8 { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64}, */
+ /* 9 { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57}, */
+ /* 10 { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50}, */
+ /* 11 { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43}, */
+ /* 12 { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36}, */
+ /* 13 { 25,-70, 90,-80, 43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25}, */
+ /* 14 { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18}, */
+ /* 15 { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9} */
+ /***************************************************************************/
+
+ for(j = 0; j < TRANS_SIZE_16; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ {
+ /*
+ o[k] = g_ai2_ihevc_trans_16[ 1][k]*pi2_src[ src_strd ] + g_ai2_ihevc_trans_16[ 3][k]*pi2_src[ 3*src_strd ] + g_ai2_ihevc_trans_16[ 5][k]*pi2_src[ 5*src_strd ] + g_ai2_ihevc_trans_16[ 7][k]*pi2_src[ 7*src_strd ] +
+ g_ai2_ihevc_trans_16[ 9][k]*pi2_src[ 9*src_strd ] + g_ai2_ihevc_trans_16[11][k]*pi2_src[11*src_strd ] + g_ai2_ihevc_trans_16[13][k]*pi2_src[13*src_strd ] + g_ai2_ihevc_trans_16[15][k]*pi2_src[15*src_strd ];
+ */
+
+ o[0] = 90 * pi2_src[src_strd] + 87 * pi2_src[3 * src_strd]
+ + 80 * pi2_src[5 * src_strd]
+ + 70 * pi2_src[7 * src_strd]
+ + 57 * pi2_src[9 * src_strd]
+ + 43 * pi2_src[11 * src_strd]
+ + 25 * pi2_src[13 * src_strd]
+ + 9 * pi2_src[15 * src_strd];
+
+ o[1] = 87 * pi2_src[src_strd] + 57 * pi2_src[3 * src_strd]
+ + 9 * pi2_src[5 * src_strd]
+ + -43 * pi2_src[7 * src_strd]
+ + -80 * pi2_src[9 * src_strd]
+ + -90 * pi2_src[11 * src_strd]
+ + -70 * pi2_src[13 * src_strd]
+ + -25 * pi2_src[15 * src_strd];
+
+ o[2] = 80 * pi2_src[src_strd] + 9 * pi2_src[3 * src_strd]
+ + -70 * pi2_src[5 * src_strd]
+ + -87 * pi2_src[7 * src_strd]
+ + -25 * pi2_src[9 * src_strd]
+ + 57 * pi2_src[11 * src_strd]
+ + 90 * pi2_src[13 * src_strd]
+ + 43 * pi2_src[15 * src_strd];
+
+ o[3] = 70 * pi2_src[src_strd] + -43 * pi2_src[3 * src_strd]
+ + -87 * pi2_src[5 * src_strd]
+ + 9 * pi2_src[7 * src_strd]
+ + 90 * pi2_src[9 * src_strd]
+ + 25 * pi2_src[11 * src_strd]
+ + -80 * pi2_src[13 * src_strd]
+ + -57 * pi2_src[15 * src_strd];
+
+ o[4] = 57 * pi2_src[src_strd] + -80 * pi2_src[3 * src_strd]
+ + -25 * pi2_src[5 * src_strd]
+ + 90 * pi2_src[7 * src_strd]
+ + -9 * pi2_src[9 * src_strd]
+ + -87 * pi2_src[11 * src_strd]
+ + 43 * pi2_src[13 * src_strd]
+ + 70 * pi2_src[15 * src_strd];
+
+ o[5] = 43 * pi2_src[src_strd] + -90 * pi2_src[3 * src_strd]
+ + 57 * pi2_src[5 * src_strd]
+ + 25 * pi2_src[7 * src_strd]
+ + -87 * pi2_src[9 * src_strd]
+ + 70 * pi2_src[11 * src_strd]
+ + 9 * pi2_src[13 * src_strd]
+ + -80 * pi2_src[15 * src_strd];
+
+ o[6] = 25 * pi2_src[src_strd] + -70 * pi2_src[3 * src_strd]
+ + 90 * pi2_src[5 * src_strd]
+ + -80 * pi2_src[7 * src_strd]
+ + 43 * pi2_src[9 * src_strd]
+ + 9 * pi2_src[11 * src_strd]
+ + -57 * pi2_src[13 * src_strd]
+ + 87 * pi2_src[15 * src_strd];
+
+ o[7] = 9 * pi2_src[src_strd] + -25 * pi2_src[3 * src_strd]
+ + 43 * pi2_src[5 * src_strd]
+ + -57 * pi2_src[7 * src_strd]
+ + 70 * pi2_src[9 * src_strd]
+ + -80 * pi2_src[11 * src_strd]
+ + 87 * pi2_src[13 * src_strd]
+ + -90 * pi2_src[15 * src_strd];
+ }
+ {
+ temp1 = (pi2_src[2 * src_strd] + pi2_src[6 * src_strd]) * 75;
+ temp2 = (pi2_src[10 * src_strd] + pi2_src[14 * src_strd]) * 50;
+ eo[0] = temp1 + 14 * pi2_src[2 * src_strd] + temp2
+ - (pi2_src[14 * src_strd] << 5);
+ eo[1] = temp1 - 93 * pi2_src[6 * src_strd] - temp2
+ - 39 * pi2_src[10 * src_strd];
+
+ temp1 = (pi2_src[2 * src_strd] - pi2_src[6 * src_strd]) * 50;
+ temp2 = (pi2_src[10 * src_strd] + pi2_src[14 * src_strd]) * 75;
+ eo[2] = temp1 - 39 * pi2_src[6 * src_strd] + temp2
+ - 57 * pi2_src[10 * src_strd];
+ eo[3] = temp1 - (pi2_src[2 * src_strd] << 5) + temp2
+ - 164 * pi2_src[14 * src_strd];
+ }
+
+ temp1 = (pi2_src[4 * src_strd] + pi2_src[12 * src_strd]) * 36;
+ eeo[0] = temp1 + 47 * pi2_src[4 * src_strd];
+ eeo[1] = temp1 - 119 * pi2_src[12 * src_strd];
+
+ eee[0] = (pi2_src[0] + pi2_src[8 * src_strd]) << 6;
+ eee[1] = (pi2_src[0] - pi2_src[8 * src_strd]) << 6;
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_dst[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+ pi2_dst[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
+ }
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+#endif
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Single stage Inverse transform for 32x32 input
+ * block
+ *
+ * @par Description:
+ *  Performs single stage 32x32 inverse transform by utilizing the symmetry
+ *  of the transformation matrix and reducing the number of multiplications
+ *  wherever possible, while keeping the number of operations (addition,
+ *  multiplication and shift) the same
+ *
+ * @param[in] pi2_src
+ * Input 32x32 coefficients
+ *
+ * @param[out] pi2_dst
+ * Output 32x32 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] i4_shift
+ * Output shift
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_32x32(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 i4_shift,
+ WORD32 zero_cols)
+{
+ WORD32 j, k;
+ WORD32 e[16], o[16];
+ WORD32 ee[8], eo[8];
+ WORD32 eee[4], eeo[4];
+ WORD32 eeee[2], eeeo[2];
+ WORD32 add;
+
+ add = 1 << (i4_shift - 1);
+
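+    /* add = 1 << (i4_shift - 1) is the rounding offset: each output below   */
+    /* is computed as (sum + add) >> i4_shift, i.e. a round-to-nearest right */
+    /* shift, then clipped to 16 bits by CLIP_S16                            */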
+ for(j = 0; j < TRANS_SIZE_32; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_dst, 0, TRANS_SIZE_32 * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_src[7 * src_strd]
+ + g_ai2_ihevc_trans_32[9][k]
+ * pi2_src[9 * src_strd]
+ + g_ai2_ihevc_trans_32[11][k]
+ * pi2_src[11 * src_strd]
+ + g_ai2_ihevc_trans_32[13][k]
+ * pi2_src[13 * src_strd]
+ + g_ai2_ihevc_trans_32[15][k]
+ * pi2_src[15 * src_strd]
+ + g_ai2_ihevc_trans_32[17][k]
+ * pi2_src[17 * src_strd]
+ + g_ai2_ihevc_trans_32[19][k]
+ * pi2_src[19 * src_strd]
+ + g_ai2_ihevc_trans_32[21][k]
+ * pi2_src[21 * src_strd]
+ + g_ai2_ihevc_trans_32[23][k]
+ * pi2_src[23 * src_strd]
+ + g_ai2_ihevc_trans_32[25][k]
+ * pi2_src[25 * src_strd]
+ + g_ai2_ihevc_trans_32[27][k]
+ * pi2_src[27 * src_strd]
+ + g_ai2_ihevc_trans_32[29][k]
+ * pi2_src[29 * src_strd]
+ + g_ai2_ihevc_trans_32[31][k]
+ * pi2_src[31 * src_strd];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_src[6 * src_strd]
+ + g_ai2_ihevc_trans_32[10][k]
+ * pi2_src[10 * src_strd]
+ + g_ai2_ihevc_trans_32[14][k]
+ * pi2_src[14 * src_strd]
+ + g_ai2_ihevc_trans_32[18][k]
+ * pi2_src[18 * src_strd]
+ + g_ai2_ihevc_trans_32[22][k]
+ * pi2_src[22 * src_strd]
+ + g_ai2_ihevc_trans_32[26][k]
+ * pi2_src[26 * src_strd]
+ + g_ai2_ihevc_trans_32[30][k]
+ * pi2_src[30 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
+ + g_ai2_ihevc_trans_32[12][k]
+ * pi2_src[12 * src_strd]
+ + g_ai2_ihevc_trans_32[20][k]
+ * pi2_src[20 * src_strd]
+ + g_ai2_ihevc_trans_32[28][k]
+ * pi2_src[28 * src_strd];
+ }
+ eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
+ + g_ai2_ihevc_trans_32[24][0]
+ * pi2_src[24 * src_strd];
+ eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
+ + g_ai2_ihevc_trans_32[24][1]
+ * pi2_src[24 * src_strd];
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_32[16][0]
+ * pi2_src[16 * src_strd];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_32[16][1]
+ * pi2_src[16 * src_strd];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ pi2_dst[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
+ pi2_dst[k + 16] =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> i4_shift));
+ }
+ }
+ pi2_src++;
+ pi2_dst += dst_strd;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
diff --git a/common/ihevc_itrans.h b/common/ihevc_itrans.h
new file mode 100644
index 0000000..38a38a5
--- /dev/null
+++ b/common/ihevc_itrans.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_itrans.h
+*
+* @brief
+* Functions declarations for inverse transform
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_ITRANS_H_
+#define _IHEVC_ITRANS_H_
+
+typedef void ihevc_itrans_4x4_ttype1_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_cols);
+typedef void ihevc_itrans_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_cols);
+typedef void ihevc_itrans_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_cols);
+typedef void ihevc_itrans_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_cols);
+typedef void ihevc_itrans_32x32_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_cols);
+
+/* C function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32;
+
+/* A9 Q function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_a9q;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_a9q;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_a9q;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_a9q;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_a9q;
+
+/* NEON intrinsics function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_neonintr;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_neonintr;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_neonintr;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_neonintr;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_ssse3;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_ssse3;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_ssse3;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_ssse3;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_ssse3;
+
+/* SSE4.2 function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_sse42;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_sse42;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_sse42;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_sse42;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_sse42;
+
+/* armv8 function declarations */
+ihevc_itrans_4x4_ttype1_ft ihevc_itrans_4x4_ttype1_av8;
+ihevc_itrans_4x4_ft ihevc_itrans_4x4_av8;
+ihevc_itrans_8x8_ft ihevc_itrans_8x8_av8;
+ihevc_itrans_16x16_ft ihevc_itrans_16x16_av8;
+ihevc_itrans_32x32_ft ihevc_itrans_32x32_av8;
+#endif /*_IHEVC_ITRANS_H_*/
diff --git a/common/ihevc_itrans_recon.c b/common/ihevc_itrans_recon.c
new file mode 100644
index 0000000..0af96e8
--- /dev/null
+++ b/common/ihevc_itrans_recon.c
@@ -0,0 +1,333 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon.c
+ *
+ * @brief
+ * Contains function definitions for inverse transform and reconstruction
+ *
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_4x4_ttype1()
+ * - ihevc_itrans_recon_4x4()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc_itrans.c and modified to */
+/* include reconstruction */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Inverse transform type 1 (DST) and reconstruction
+ * for 4x4 input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ *  output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this variant)
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4_ttype1(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 i, c[4];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ UNUSED(zero_rows);
+ trans_size = TRANS_SIZE_4;
+
+ pi2_tmp_orig = pi2_tmp;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
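+    /* The recon functions run the 2-D inverse transform as two 1-D passes:  */
+    /* stage 1 transforms columns of pi2_src into pi2_tmp with               */
+    /* IT_SHIFT_STAGE_1, stage 2 transforms rows of pi2_tmp with             */
+    /* IT_SHIFT_STAGE_2 and adds the prediction. For 8-bit video the HEVC    */
+    /* spec puts these shifts at 7 and 12; the two macros are assumed to     */
+    /* carry those values                                                    */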
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ // Intermediate Variables
+ c[0] = pi2_src[0] + pi2_src[2 * src_strd];
+ c[1] = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
+ c[2] = pi2_src[0] - pi2_src[3 * src_strd];
+ c[3] = 74 * pi2_src[src_strd];
+
+ pi2_tmp[0] =
+ CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+ pi2_tmp[1] =
+ CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+ pi2_tmp[2] =
+ CLIP_S16((74 * (pi2_src[0] - pi2_src[2 * src_strd] + pi2_src[3 * src_strd]) + add) >> shift);
+ pi2_tmp[3] =
+ CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ for(i = 0; i < trans_size; i++)
+ {
+ WORD32 itrans_out;
+ // Intermediate Variables
+ c[0] = pi2_tmp[0] + pi2_tmp[2 * trans_size];
+ c[1] = pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size];
+ c[2] = pi2_tmp[0] - pi2_tmp[3 * trans_size];
+ c[3] = 74 * pi2_tmp[trans_size];
+
+ itrans_out =
+ CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> shift);
+ pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+ itrans_out =
+ CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> shift);
+ pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+ itrans_out =
+ CLIP_S16((74 * (pi2_tmp[0] - pi2_tmp[2 * trans_size] + pi2_tmp[3 * trans_size]) + add) >> shift);
+ pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+ itrans_out =
+ CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> shift);
+ pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Inverse transform and reconstruction for 4x4
+ * input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ *  output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src (unused in this variant)
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_4x4(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+
+{
+ WORD32 j;
+ WORD32 e[2], o[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ UNUSED(zero_rows);
+ trans_size = TRANS_SIZE_4;
+
+ pi2_tmp_orig = pi2_tmp;
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
+
+ pi2_tmp[0] =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pi2_tmp[1] =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pi2_tmp[2] =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pi2_tmp[3] =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
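+    /* Second pass: inverse transform along rows, then reconstruct; each     */
+    /* sample is formed as CLIP_U8(itrans_out + pred), i.e. the residual is  */
+    /* added to the prediction and clamped to the 8-bit range [0, 255]       */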
+ for(j = 0; j < trans_size; j++)
+ {
+ WORD32 itrans_out;
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][0] * pi2_tmp[3 * trans_size];
+ o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_4[3][1] * pi2_tmp[3 * trans_size];
+ e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][0] * pi2_tmp[2 * trans_size];
+ e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_4[2][1] * pi2_tmp[2 * trans_size];
+
+ itrans_out =
+ CLIP_S16(((e[0] + o[0] + add) >> shift));
+ pu1_dst[0] = CLIP_U8((itrans_out + pu1_pred[0]));
+ itrans_out =
+ CLIP_S16(((e[1] + o[1] + add) >> shift));
+ pu1_dst[1] = CLIP_U8((itrans_out + pu1_pred[1]));
+ itrans_out =
+ CLIP_S16(((e[1] - o[1] + add) >> shift));
+ pu1_dst[2] = CLIP_U8((itrans_out + pu1_pred[2]));
+ itrans_out =
+ CLIP_S16(((e[0] - o[0] + add) >> shift));
+ pu1_dst[3] = CLIP_U8((itrans_out + pu1_pred[3]));
+
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+
+ }
+}
+
diff --git a/common/ihevc_itrans_recon.h b/common/ihevc_itrans_recon.h
new file mode 100644
index 0000000..56da261
--- /dev/null
+++ b/common/ihevc_itrans_recon.h
@@ -0,0 +1,193 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_itrans_recon.h
+*
+* @brief
+* Functions declarations for inverse transform and reconstruction
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_ITRANS_RECON_H_
+#define _IHEVC_ITRANS_RECON_H_
+
+typedef void ihevc_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+typedef void ihevc_itrans_recon_32x32_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+typedef void ihevc_hbd_itrans_recon_32x32_ft(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows,
+ UWORD8 bit_depth);
+
+/* C function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32;
+
+ihevc_hbd_itrans_recon_4x4_ttype1_ft ihevc_hbd_itrans_recon_4x4_ttype1;
+ihevc_hbd_itrans_recon_4x4_ft ihevc_hbd_itrans_recon_4x4;
+ihevc_hbd_itrans_recon_8x8_ft ihevc_hbd_itrans_recon_8x8;
+ihevc_hbd_itrans_recon_16x16_ft ihevc_hbd_itrans_recon_16x16;
+ihevc_hbd_itrans_recon_32x32_ft ihevc_hbd_itrans_recon_32x32;
+
+/* A9 Q function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_a9q;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_a9q;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_a9q;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_a9q;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_a9q;
+
+/* A9 A function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_a9a;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_a9a;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_a9a;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_a9a;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_a9a;
+
+/* NEONINTR function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_neonintr;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_neonintr;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_neonintr;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_neonintr;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_ssse3;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_ssse3;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_ssse3;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_ssse3;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_ssse3;
+
+/* SSE42 function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_sse42;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_sse42;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_sse42;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_sse42;
+
+ihevc_hbd_itrans_recon_4x4_ttype1_ft ihevc_hbd_itrans_recon_4x4_ttype1_sse42;
+ihevc_hbd_itrans_recon_4x4_ft ihevc_hbd_itrans_recon_4x4_sse42;
+ihevc_hbd_itrans_recon_8x8_ft ihevc_hbd_itrans_recon_8x8_sse42;
+ihevc_hbd_itrans_recon_16x16_ft ihevc_hbd_itrans_recon_16x16_sse42;
+ihevc_hbd_itrans_recon_32x32_ft ihevc_hbd_itrans_recon_32x32_sse42;
+
+
+/* armv8 function declarations */
+ihevc_itrans_recon_4x4_ttype1_ft ihevc_itrans_recon_4x4_ttype1_av8;
+ihevc_itrans_recon_4x4_ft ihevc_itrans_recon_4x4_av8;
+ihevc_itrans_recon_8x8_ft ihevc_itrans_recon_8x8_av8;
+ihevc_itrans_recon_16x16_ft ihevc_itrans_recon_16x16_av8;
+ihevc_itrans_recon_32x32_ft ihevc_itrans_recon_32x32_av8;
+#endif /*_IHEVC_ITRANS_RECON_H_*/
diff --git a/common/ihevc_itrans_recon_16x16.c b/common/ihevc_itrans_recon_16x16.c
new file mode 100644
index 0000000..56e28a3
--- /dev/null
+++ b/common/ihevc_itrans_recon_16x16.c
@@ -0,0 +1,889 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_16x16.c
+ *
+ * @brief
+ *  Contains function definitions for 16x16 inverse transform and reconstruction
+ *
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_16x16()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Inverse transform and reconstruction for 16x16
+ * input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ *  output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 16x16 buffer for storing inverse transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_16x16(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 j, k;
+ WORD32 e[8], o[8];
+ WORD32 ee[4], eo[4];
+ WORD32 eee[2], eeo[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ WORD32 zero_rows_2nd_stage = zero_cols;
+ WORD32 row_limit_2nd_stage;
+
+ if((zero_cols & 0xFFF0) == 0xFFF0)
+ row_limit_2nd_stage = 4;
+ else if((zero_cols & 0xFF00) == 0xFF00)
+ row_limit_2nd_stage = 8;
+ else
+ row_limit_2nd_stage = TRANS_SIZE_16;
+
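+    /* zero_cols / zero_rows are bitmasks (bit i set => column/row i is all  */
+    /* zero). A mask of 0xFFF0 thus means only the first 4 columns/rows can  */
+    /* be non-zero and 0xFF00 only the first 8, which is what the limits     */
+    /* above and the three specializations below exploit to truncate the     */
+    /* per-output sums                                                       */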
+ trans_size = TRANS_SIZE_16;
+ pi2_tmp_orig = pi2_tmp;
+ if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
+ {
+ /* Inverse Transform 1st stage */
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_tmp[15 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_tmp[14 * trans_size];
+ }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                         + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                         + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+ }
+ else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
+ {
+ /* Inverse Transform 1st stage */
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_src[7 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_src[6 * src_strd];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_tmp[15 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_tmp[14 * trans_size];
+ }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                         + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                         + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+ }
+ else /* All rows of input are non-zero */
+ {
+ /* Inverse Transform 1st stage */
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_src[7 * src_strd]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_src[9 * src_strd]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_src[11 * src_strd]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_src[13 * src_strd]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_src[15 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_src[6 * src_strd]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_src[10 * src_strd]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_src[14 * src_strd];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
+ + g_ai2_ihevc_trans_16[12][0]
+ * pi2_src[12 * src_strd];
+                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
+                         + g_ai2_ihevc_trans_16[8][0] * pi2_src[8 * src_strd];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
+ + g_ai2_ihevc_trans_16[12][1]
+ * pi2_src[12 * src_strd];
+                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
+                         + g_ai2_ihevc_trans_16[8][1] * pi2_src[8 * src_strd];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 8] =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+ if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
+ }
+ eeo[0] = 0;
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = 0;
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
+ eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 8; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_16[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_16[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_16[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_16[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_16[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_16[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_16[15][k]
+ * pi2_tmp[15 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_16[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_16[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_16[14][k]
+ * pi2_tmp[14 * trans_size];
+ }
+                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
+                         + g_ai2_ihevc_trans_16[12][0] * pi2_tmp[12 * trans_size];
+ eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
+                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
+                         + g_ai2_ihevc_trans_16[12][1] * pi2_tmp[12 * trans_size];
+ eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ for(k = 0; k < 2; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 2] = eee[1 - k] - eeo[1 - k];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 4] = ee[3 - k] - eo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
+ pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_16x16****************************************/
+ /************************************************************************************************/
+ }
+
+}
+
diff --git a/common/ihevc_itrans_recon_32x32.c b/common/ihevc_itrans_recon_32x32.c
new file mode 100644
index 0000000..b8a71ab
--- /dev/null
+++ b/common/ihevc_itrans_recon_32x32.c
@@ -0,0 +1,1127 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_32x32.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform and reconstruction of a 32x32 block
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_32x32()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for a 32x32
+ *  input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ *  output to 8-bit range
+ *
+ * @param[in] pi2_src
+ * Input 32x32 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 32x32 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ * Output 32x32 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
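+/*
+ * Note on the structure used below: the inverse transform is evaluated with
+ * the usual even/odd ("partial butterfly") decomposition. For a 1-D N-point
+ * column the outputs are folded, before clipping, as
+ *
+ *     dst[k]         = (e[k] + o[k] + add) >> shift
+ *     dst[N - 1 - k] = (e[k] - o[k] + add) >> shift,    k = 0 .. N/2 - 1
+ *
+ * where o[] collects the odd-index coefficient rows and e[] is built
+ * recursively from ee/eo, eee/eeo and eeee/eeeo in the same fashion, so a
+ * 32-point column takes 16 + 8 + 4 + 2 + 2 groups of products instead of a
+ * full 32 x 32 multiply.
+ */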
+void ihevc_itrans_recon_32x32(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 j, k;
+ WORD32 e[16], o[16];
+ WORD32 ee[8], eo[8];
+ WORD32 eee[4], eeo[4];
+ WORD32 eeee[2], eeeo[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ WORD32 zero_rows_2nd_stage = zero_cols;
+ WORD32 row_limit_2nd_stage;
+
+ trans_size = TRANS_SIZE_32;
+ pi2_tmp_orig = pi2_tmp;
+
+ if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0)
+ row_limit_2nd_stage = 4;
+ else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00)
+ row_limit_2nd_stage = 8;
+ else
+ row_limit_2nd_stage = TRANS_SIZE_32;
+
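+    /* Each set bit i of zero_cols marks input column i as entirely zero, and
+     * stage-1 output row i then carries no data. When all columns beyond the
+     * first 4 (or 8) are zero, stage 1 only needs to produce that many rows
+     * of pi2_tmp; the reduced stage-2 branches never read the rest. */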
+ if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_32x32****************************************/
+ /************************************************************************************************/
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_src[3 * src_strd];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd];
+ }
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 16] =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
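+        /* add = 1 << (shift - 1) implements round-to-nearest ahead of the
+         * arithmetic right shift applied to the stage-2 results. */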
+ if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
+ }
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
+ }
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_32[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_32[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_32[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_32[15][k]
+ * pi2_tmp[15 * trans_size]
+ + g_ai2_ihevc_trans_32[17][k]
+ * pi2_tmp[17 * trans_size]
+ + g_ai2_ihevc_trans_32[19][k]
+ * pi2_tmp[19 * trans_size]
+ + g_ai2_ihevc_trans_32[21][k]
+ * pi2_tmp[21 * trans_size]
+ + g_ai2_ihevc_trans_32[23][k]
+ * pi2_tmp[23 * trans_size]
+ + g_ai2_ihevc_trans_32[25][k]
+ * pi2_tmp[25 * trans_size]
+ + g_ai2_ihevc_trans_32[27][k]
+ * pi2_tmp[27 * trans_size]
+ + g_ai2_ihevc_trans_32[29][k]
+ * pi2_tmp[29 * trans_size]
+ + g_ai2_ihevc_trans_32[31][k]
+ * pi2_tmp[31 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_32[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_32[14][k]
+ * pi2_tmp[14 * trans_size]
+ + g_ai2_ihevc_trans_32[18][k]
+ * pi2_tmp[18 * trans_size]
+ + g_ai2_ihevc_trans_32[22][k]
+ * pi2_tmp[22 * trans_size]
+ + g_ai2_ihevc_trans_32[26][k]
+ * pi2_tmp[26 * trans_size]
+ + g_ai2_ihevc_trans_32[30][k]
+ * pi2_tmp[30 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
+ + g_ai2_ihevc_trans_32[12][k]
+ * pi2_tmp[12 * trans_size]
+ + g_ai2_ihevc_trans_32[20][k]
+ * pi2_tmp[20 * trans_size]
+ + g_ai2_ihevc_trans_32[28][k]
+ * pi2_tmp[28 * trans_size];
+ }
+                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
+                          + g_ai2_ihevc_trans_32[24][0] * pi2_tmp[24 * trans_size];
+                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
+                          + g_ai2_ihevc_trans_32[24][1] * pi2_tmp[24 * trans_size];
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
+                          + g_ai2_ihevc_trans_32[16][0] * pi2_tmp[16 * trans_size];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
+                          + g_ai2_ihevc_trans_32[16][1] * pi2_tmp[16 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_32x32****************************************/
+ /************************************************************************************************/
+ }
+ else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_32x32****************************************/
+ /************************************************************************************************/
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_src[7 * src_strd];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_src[6 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd];
+ }
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 16] =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
+ }
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
+ }
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_32[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_32[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_32[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_32[15][k]
+ * pi2_tmp[15 * trans_size]
+ + g_ai2_ihevc_trans_32[17][k]
+ * pi2_tmp[17 * trans_size]
+ + g_ai2_ihevc_trans_32[19][k]
+ * pi2_tmp[19 * trans_size]
+ + g_ai2_ihevc_trans_32[21][k]
+ * pi2_tmp[21 * trans_size]
+ + g_ai2_ihevc_trans_32[23][k]
+ * pi2_tmp[23 * trans_size]
+ + g_ai2_ihevc_trans_32[25][k]
+ * pi2_tmp[25 * trans_size]
+ + g_ai2_ihevc_trans_32[27][k]
+ * pi2_tmp[27 * trans_size]
+ + g_ai2_ihevc_trans_32[29][k]
+ * pi2_tmp[29 * trans_size]
+ + g_ai2_ihevc_trans_32[31][k]
+ * pi2_tmp[31 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_32[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_32[14][k]
+ * pi2_tmp[14 * trans_size]
+ + g_ai2_ihevc_trans_32[18][k]
+ * pi2_tmp[18 * trans_size]
+ + g_ai2_ihevc_trans_32[22][k]
+ * pi2_tmp[22 * trans_size]
+ + g_ai2_ihevc_trans_32[26][k]
+ * pi2_tmp[26 * trans_size]
+ + g_ai2_ihevc_trans_32[30][k]
+ * pi2_tmp[30 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
+ + g_ai2_ihevc_trans_32[12][k]
+ * pi2_tmp[12 * trans_size]
+ + g_ai2_ihevc_trans_32[20][k]
+ * pi2_tmp[20 * trans_size]
+ + g_ai2_ihevc_trans_32[28][k]
+ * pi2_tmp[28 * trans_size];
+ }
+                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
+                          + g_ai2_ihevc_trans_32[24][0] * pi2_tmp[24 * trans_size];
+                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
+                          + g_ai2_ihevc_trans_32[24][1] * pi2_tmp[24 * trans_size];
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
+                          + g_ai2_ihevc_trans_32[16][0] * pi2_tmp[16 * trans_size];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
+                          + g_ai2_ihevc_trans_32[16][1] * pi2_tmp[16 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_32x32****************************************/
+ /************************************************************************************************/
+ }
+ else /* All rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_32x32****************************************/
+ /************************************************************************************************/
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_src[7 * src_strd]
+ + g_ai2_ihevc_trans_32[9][k]
+ * pi2_src[9 * src_strd]
+ + g_ai2_ihevc_trans_32[11][k]
+ * pi2_src[11 * src_strd]
+ + g_ai2_ihevc_trans_32[13][k]
+ * pi2_src[13 * src_strd]
+ + g_ai2_ihevc_trans_32[15][k]
+ * pi2_src[15 * src_strd]
+ + g_ai2_ihevc_trans_32[17][k]
+ * pi2_src[17 * src_strd]
+ + g_ai2_ihevc_trans_32[19][k]
+ * pi2_src[19 * src_strd]
+ + g_ai2_ihevc_trans_32[21][k]
+ * pi2_src[21 * src_strd]
+ + g_ai2_ihevc_trans_32[23][k]
+ * pi2_src[23 * src_strd]
+ + g_ai2_ihevc_trans_32[25][k]
+ * pi2_src[25 * src_strd]
+ + g_ai2_ihevc_trans_32[27][k]
+ * pi2_src[27 * src_strd]
+ + g_ai2_ihevc_trans_32[29][k]
+ * pi2_src[29 * src_strd]
+ + g_ai2_ihevc_trans_32[31][k]
+ * pi2_src[31 * src_strd];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_src[6 * src_strd]
+ + g_ai2_ihevc_trans_32[10][k]
+ * pi2_src[10 * src_strd]
+ + g_ai2_ihevc_trans_32[14][k]
+ * pi2_src[14 * src_strd]
+ + g_ai2_ihevc_trans_32[18][k]
+ * pi2_src[18 * src_strd]
+ + g_ai2_ihevc_trans_32[22][k]
+ * pi2_src[22 * src_strd]
+ + g_ai2_ihevc_trans_32[26][k]
+ * pi2_src[26 * src_strd]
+ + g_ai2_ihevc_trans_32[30][k]
+ * pi2_src[30 * src_strd];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
+ + g_ai2_ihevc_trans_32[12][k]
+ * pi2_src[12 * src_strd]
+ + g_ai2_ihevc_trans_32[20][k]
+ * pi2_src[20 * src_strd]
+ + g_ai2_ihevc_trans_32[28][k]
+ * pi2_src[28 * src_strd];
+ }
+ eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
+ + g_ai2_ihevc_trans_32[24][0]
+ * pi2_src[24 * src_strd];
+ eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
+ + g_ai2_ihevc_trans_32[24][1]
+ * pi2_src[24 * src_strd];
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_32[16][0]
+ * pi2_src[16 * src_strd];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_32[16][1]
+ * pi2_src[16 * src_strd];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 16] =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
+ }
+                eeo[0] = 0;
+                eeo[1] = 0;
+                eeo[2] = 0;
+                eeo[3] = 0;
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_tmp[6 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
+ }
+ eeeo[0] = 0;
+ eeeo[1] = 0;
+ eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
+ eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 16; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_32[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_32[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_32[7][k]
+ * pi2_tmp[7 * trans_size]
+ + g_ai2_ihevc_trans_32[9][k]
+ * pi2_tmp[9 * trans_size]
+ + g_ai2_ihevc_trans_32[11][k]
+ * pi2_tmp[11 * trans_size]
+ + g_ai2_ihevc_trans_32[13][k]
+ * pi2_tmp[13 * trans_size]
+ + g_ai2_ihevc_trans_32[15][k]
+ * pi2_tmp[15 * trans_size]
+ + g_ai2_ihevc_trans_32[17][k]
+ * pi2_tmp[17 * trans_size]
+ + g_ai2_ihevc_trans_32[19][k]
+ * pi2_tmp[19 * trans_size]
+ + g_ai2_ihevc_trans_32[21][k]
+ * pi2_tmp[21 * trans_size]
+ + g_ai2_ihevc_trans_32[23][k]
+ * pi2_tmp[23 * trans_size]
+ + g_ai2_ihevc_trans_32[25][k]
+ * pi2_tmp[25 * trans_size]
+ + g_ai2_ihevc_trans_32[27][k]
+ * pi2_tmp[27 * trans_size]
+ + g_ai2_ihevc_trans_32[29][k]
+ * pi2_tmp[29 * trans_size]
+ + g_ai2_ihevc_trans_32[31][k]
+ * pi2_tmp[31 * trans_size];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_32[6][k]
+ * pi2_tmp[6 * trans_size]
+ + g_ai2_ihevc_trans_32[10][k]
+ * pi2_tmp[10 * trans_size]
+ + g_ai2_ihevc_trans_32[14][k]
+ * pi2_tmp[14 * trans_size]
+ + g_ai2_ihevc_trans_32[18][k]
+ * pi2_tmp[18 * trans_size]
+ + g_ai2_ihevc_trans_32[22][k]
+ * pi2_tmp[22 * trans_size]
+ + g_ai2_ihevc_trans_32[26][k]
+ * pi2_tmp[26 * trans_size]
+ + g_ai2_ihevc_trans_32[30][k]
+ * pi2_tmp[30 * trans_size];
+ }
+ for(k = 0; k < 4; k++)
+ {
+ eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
+ + g_ai2_ihevc_trans_32[12][k]
+ * pi2_tmp[12 * trans_size]
+ + g_ai2_ihevc_trans_32[20][k]
+ * pi2_tmp[20 * trans_size]
+ + g_ai2_ihevc_trans_32[28][k]
+ * pi2_tmp[28 * trans_size];
+ }
+                eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
+                          + g_ai2_ihevc_trans_32[24][0] * pi2_tmp[24 * trans_size];
+                eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
+                          + g_ai2_ihevc_trans_32[24][1] * pi2_tmp[24 * trans_size];
+                eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
+                          + g_ai2_ihevc_trans_32[16][0] * pi2_tmp[16 * trans_size];
+                eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
+                          + g_ai2_ihevc_trans_32[16][1] * pi2_tmp[16 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ eee[0] = eeee[0] + eeeo[0];
+ eee[3] = eeee[0] - eeeo[0];
+ eee[1] = eeee[1] + eeeo[1];
+ eee[2] = eeee[1] - eeeo[1];
+ for(k = 0; k < 4; k++)
+ {
+ ee[k] = eee[k] + eeo[k];
+ ee[k + 4] = eee[3 - k] - eeo[3 - k];
+ }
+ for(k = 0; k < 8; k++)
+ {
+ e[k] = ee[k] + eo[k];
+ e[k + 8] = ee[7 - k] - eo[7 - k];
+ }
+ for(k = 0; k < 16; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
+ pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_32x32****************************************/
+ /************************************************************************************************/
+ }
+}
+
diff --git a/common/ihevc_itrans_recon_8x8.c b/common/ihevc_itrans_recon_8x8.c
new file mode 100644
index 0000000..5e2de86
--- /dev/null
+++ b/common/ihevc_itrans_recon_8x8.c
@@ -0,0 +1,414 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_8x8.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform and reconstruction of an 8x8 block
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_8x8()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
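+/*
+ * As the macro names suggest, CLIP_S16 saturates the 1st-stage intermediate
+ * values to the signed 16-bit range so that they fit the WORD16 pi2_tmp
+ * buffer, while CLIP_U8 clamps the sum of residual and prediction to the
+ * 8-bit pixel range for the reconstructed output.
+ */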
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for an 8x8
+ *  input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips the
+ *  output to 8-bit range
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse transform
+ *  1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
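+/*
+ * Illustrative call (hypothetical buffers, DC-only block): when only the DC
+ * coefficient is non-zero, every column and row except the first is zero,
+ * so zero_cols = zero_rows = 0xFE and both reduced paths below are taken:
+ *
+ *     WORD16 src[8 * 8] = { 256 };    // DC coefficient only
+ *     WORD16 tmp[8 * 8];
+ *     UWORD8 pred[8 * 8], dst[8 * 8]; // contiguous blocks, stride 8
+ *     ihevc_itrans_recon_8x8(src, tmp, pred, dst, 8, 8, 8, 0xFE, 0xFE);
+ */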
+void ihevc_itrans_recon_8x8(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ WORD32 j, k;
+ WORD32 e[4], o[4];
+ WORD32 ee[2], eo[2];
+ WORD32 add;
+ WORD32 shift;
+ WORD16 *pi2_tmp_orig;
+ WORD32 trans_size;
+ WORD32 zero_rows_2nd_stage = zero_cols;
+ WORD32 row_limit_2nd_stage;
+
+ trans_size = TRANS_SIZE_8;
+
+ pi2_tmp_orig = pi2_tmp;
+
+ if((zero_cols & 0xF0) == 0xF0)
+ row_limit_2nd_stage = 4;
+ else
+ row_limit_2nd_stage = TRANS_SIZE_8;
+
+
+ if((zero_rows & 0xF0) == 0xF0) /* First 4 rows of input are non-zero */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_8x8******************************************/
+ /************************************************************************************************/
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_src[3 * src_strd];
+ }
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 4] =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_8[3][k] * pi2_tmp[3 * trans_size];
+ }
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* All rows of output of 1st stage are non-zero */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_8[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_8[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
+
+                /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_8x8******************************************/
+ /************************************************************************************************/
+ }
+ else /* Last 4 rows of input are not all zero; use all 8 input rows */
+ {
+ /************************************************************************************************/
+ /**********************************START - IT_RECON_8x8******************************************/
+ /************************************************************************************************/
+
+ /* Inverse Transform 1st stage */
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+
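+ /* Same two-stage flow as the branch above, except that all 8 input
+ * rows may be non-zero, so the full 8-point odd/even sums
+ * (coefficients 1,3,5,7 and 0,2,4,6) are formed. */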
+ for(j = 0; j < row_limit_2nd_stage; j++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
+ }
+ else
+ {
+ /* Utilizing symmetry properties to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_src[3 * src_strd]
+ + g_ai2_ihevc_trans_8[5][k]
+ * pi2_src[5 * src_strd]
+ + g_ai2_ihevc_trans_8[7][k]
+ * pi2_src[7 * src_strd];
+ }
+
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
+ + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
+ + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
+ + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
+
+ /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ pi2_tmp[k] =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pi2_tmp[k + 4] =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ }
+ }
+ pi2_src++;
+ pi2_tmp += trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ if((zero_rows_2nd_stage & 0xF0) == 0xF0) /* Last 4 rows of the 1st stage output are zero; only the first 4 contribute */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_8[3][k] * pi2_tmp[3 * trans_size];
+ }
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0];
+
+ /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ else /* Some of the last 4 rows of the 1st stage output are non-zero; process all 8 rows */
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ /* Utilizing symmetry properties to minimize the number of multiplications */
+ for(k = 0; k < 4; k++)
+ {
+ o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_tmp[trans_size]
+ + g_ai2_ihevc_trans_8[3][k]
+ * pi2_tmp[3 * trans_size]
+ + g_ai2_ihevc_trans_8[5][k]
+ * pi2_tmp[5 * trans_size]
+ + g_ai2_ihevc_trans_8[7][k]
+ * pi2_tmp[7 * trans_size];
+ }
+
+ eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_8[6][0] * pi2_tmp[6 * trans_size];
+ eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_tmp[2 * trans_size]
+ + g_ai2_ihevc_trans_8[6][1] * pi2_tmp[6 * trans_size];
+ ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_8[4][0] * pi2_tmp[4 * trans_size];
+ ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_tmp[0]
+ + g_ai2_ihevc_trans_8[4][1] * pi2_tmp[4 * trans_size];
+
+ /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+ e[0] = ee[0] + eo[0];
+ e[3] = ee[0] - eo[0];
+ e[1] = ee[1] + eo[1];
+ e[2] = ee[1] - eo[1];
+ for(k = 0; k < 4; k++)
+ {
+ WORD32 itrans_out;
+ itrans_out =
+ CLIP_S16(((e[k] + o[k] + add) >> shift));
+ pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
+ itrans_out =
+ CLIP_S16(((e[3 - k] - o[3 - k] + add) >> shift));
+ pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
+ }
+ pi2_tmp++;
+ pu1_pred += pred_strd;
+ pu1_dst += dst_strd;
+ }
+ }
+ /************************************************************************************************/
+ /************************************END - IT_RECON_8x8******************************************/
+ /************************************************************************************************/
+ }
+}
+
diff --git a/common/ihevc_macros.h b/common/ihevc_macros.h
new file mode 100644
index 0000000..3852c85
--- /dev/null
+++ b/common/ihevc_macros.h
@@ -0,0 +1,89 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_macros.h
+*
+* @brief
+* Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_MACROS_H_
+#define _IHEVC_MACROS_H_
+
+#define RETURN_IF(cond, retval) do { if(cond) { return (retval); } } while(0)
+#define UNUSED(x) ((void)(x))
+
+#define CLIP3(x, min, max) (((x) > (max)) ? (max) : (((x) < (min)) ? (min) : (x)))
+
+#define MAX(x,y) ((((WORD32)(x)) > ((WORD32)(y))) ? ((WORD32)(x)) : ((WORD32)(y)))
+#define MIN(x,y) ((((WORD32)(x)) < ((WORD32)(y))) ? ((WORD32)(x)) : ((WORD32)(y)))
+#define SIGN(x) ((x) >= 0 ? ((x)>0 ? 1: 0) : -1)
+#define ABS(x) ((((WORD32)(x)) > 0) ? (x) : -(x))
+
+#define ALIGN128(x) ((((x) + 127) >> 7) << 7)
+#define ALIGN64(x) ((((x) + 63) >> 6) << 6)
+#define ALIGN32(x) ((((x) + 31) >> 5) << 5)
+#define ALIGN16(x) ((((x) + 15) >> 4) << 4)
+#define ALIGN8(x) ((((x) + 7) >> 3) << 3)
+
+#define ALIGN_POW2(ptr,align) ((((WORD32)(ptr)) + (align) - 1) & (~((align) - 1)))
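+
+/* Usage sketch (illustrative, not part of the codec):
+ * WORD32 pel = CLIP3(val, 0, 255); clamps val to the 8-bit pixel range
+ * WORD32 sz = ALIGN8(13); rounds 13 up to the next multiple of 8, i.e. 16
+ */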
+
+/** Sets x bits to '1' starting from MSB */
+#define MSB_ONES(x) ((UWORD32)0xFFFFFFFF << (32 - (x)))
+
+/** Generates a pattern of x number of '01' in binary starting from MSB */
+#define DUP_MSB_01(x) ((UWORD32)0x55555555 << (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '10' in binary starting from MSB */
+#define DUP_MSB_10(x) ((UWORD32)0xAAAAAAAA << (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '11' in binary starting from MSB */
+#define DUP_MSB_11(x) ((UWORD32)0xFFFFFFFF << (32 - ((x) * 2)))
+
+/** Sets x bits to '1' starting from LSB */
+#define LSB_ONES(x) ((UWORD32)0xFFFFFFFF >> (32 - (x)))
+
+/** Generates a pattern of x number of '01' in binary starting from LSB */
+#define DUP_LSB_01(x) ((UWORD32)0x55555555 >> (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '10' in binary starting from LSB */
+#define DUP_LSB_10(x) ((UWORD32)0xAAAAAAAA >> (32 - ((x) * 2)))
+
+/** Generates a pattern of x number of '11' in binary starting from LSB */
+#define DUP_LSB_11(x) ((UWORD32)0xFFFFFFFF >> (32 - ((x) * 2)))
+
+/** Sets the bit in given position to 1 */
+#define BITSET(x, pos) ((x) | (1 << (pos)))
+
+/** Swap two variables */
+#define SWAP(X,Y) \
+{ \
+ (X) = (X) ^ (Y); \
+ (Y) = (X) ^ (Y); \
+ (X) = (X) ^ (Y); \
+}
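+
+/* Note: the XOR swap above zeroes both operands when X and Y refer to the
+ * same variable (X ^ X == 0), so callers must pass distinct lvalues */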
+#endif /*_IHEVC_MACROS_H_*/
diff --git a/common/ihevc_mem_fns.c b/common/ihevc_mem_fns.c
new file mode 100644
index 0000000..4a2227d
--- /dev/null
+++ b/common/ihevc_mem_fns.c
@@ -0,0 +1,166 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_mem_fns.c
+ *
+ * @brief
+ * Functions used for memory operations
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_mem_fns.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memcpy of 8, 16 or 32 bytes
+ *
+ * @par Description:
+ * Copies num_bytes bytes of 8-bit data from the source to the destination; num_bytes is typically 8, 16 or 32
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[in] num_bytes
+ * number of bytes to copy
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_memcpy(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+ memcpy(pu1_dst, pu1_src, num_bytes);
+}
+
+
+void ihevc_memcpy_mul_8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+ memcpy(pu1_dst, pu1_src, num_bytes);
+}
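+
+/* These trivial wrappers give the plain C path the same signature as the
+ * architecture-specific variants (a9q/ssse3/av8, declared in
+ * ihevc_mem_fns.h), so the decoder can bind whichever implementation is
+ * appropriate through the ihevc_*_ft function pointer types at init time */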
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 8, 16 or 32 bytes
+ *
+ * @par Description:
+ * Sets num_bytes bytes of 8-bit data to the given value; num_bytes is typically 8, 16 or 32
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ * number of bytes to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_memset(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+ memset(pu1_dst, value, num_bytes);
+}
+
+
+void ihevc_memset_mul_8(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+ memset(pu1_dst, value, num_bytes);
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 16-bit data, 8, 16 or 32 words
+ *
+ * @par Description:
+ * Sets num_words 16-bit words to the given value
+ *
+ * @param[in] pu2_dst
+ * UWORD16 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ * number of words to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_memset_16bit(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+ UWORD32 i;
+ for(i = 0; i < num_words; i++)
+ {
+ *pu2_dst++ = value;
+ }
+}
+
+
+
+void ihevc_memset_16bit_mul_8(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+ UWORD32 i;
+ for(i = 0; i < num_words; i++)
+ {
+ *pu2_dst++ = value;
+ }
+}
+
diff --git a/common/ihevc_mem_fns.h b/common/ihevc_mem_fns.h
new file mode 100644
index 0000000..1b37e99
--- /dev/null
+++ b/common/ihevc_mem_fns.h
@@ -0,0 +1,132 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_mem_fns.h
+*
+* @brief
+* Function declarations used for memory functions
+*
+* @author
+* Naveen SR
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _MEM_FNS_H_
+#define _MEM_FNS_H_
+
+typedef void ihevc_memcpy_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes);
+
+typedef void ihevc_memcpy_mul_8_ft(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes);
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 8, 16 or 32 bytes
+ *
+ * @par Description:
+ * Sets num_bytes bytes of 8-bit data to the given value; num_bytes is typically 8, 16 or 32
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ * number of bytes to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+typedef void ihevc_memset_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes);
+
+typedef void ihevc_memset_mul_8_ft(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes);
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of 16-bit data, 8, 16 or 32 words
+ *
+ * @par Description:
+ * Sets num_words 16-bit words to the given value
+ *
+ * @param[in] pu2_dst
+ * UWORD16 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ * number of words to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+typedef void ihevc_memset_16bit_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words);
+
+typedef void ihevc_memset_16bit_mul_8_ft(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words);
+
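+/* The *_ft typedefs above let an implementation be bound at run time
+ * through a function pointer. A minimal sketch (pu2_buf is a
+ * caller-provided buffer, not part of this header):
+ * ihevc_memset_16bit_ft *pf_memset_16bit = &ihevc_memset_16bit;
+ * pf_memset_16bit(pu2_buf, 0, 64);
+ */
+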
+/* C function declarations */
+ihevc_memcpy_ft ihevc_memcpy;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8;
+ihevc_memset_ft ihevc_memset;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8;
+ihevc_memset_16bit_ft ihevc_memset_16bit;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8;
+
+/* A9 Q function declarations */
+ihevc_memcpy_ft ihevc_memcpy_a9q;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_a9q;
+ihevc_memset_ft ihevc_memset_a9q;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_a9q;
+ihevc_memset_16bit_ft ihevc_memset_16bit_a9q;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_a9q;
+
+/* A9 A function declarations */
+ihevc_memcpy_ft ihevc_memcpy_a9a;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_a9a;
+ihevc_memset_ft ihevc_memset_a9a;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_a9a;
+ihevc_memset_16bit_ft ihevc_memset_16bit_a9a;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_a9a;
+
+/* SSSE3 function declarations */
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_ssse3;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_ssse3;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_ssse3;
+
+/* armv8 function declarations */
+ihevc_memcpy_ft ihevc_memcpy_av8;
+ihevc_memcpy_mul_8_ft ihevc_memcpy_mul_8_av8;
+ihevc_memset_ft ihevc_memset_av8;
+ihevc_memset_mul_8_ft ihevc_memset_mul_8_av8;
+ihevc_memset_16bit_ft ihevc_memset_16bit_av8;
+ihevc_memset_16bit_mul_8_ft ihevc_memset_16bit_mul_8_av8;
+#endif //_MEM_FNS_H_
diff --git a/common/ihevc_padding.c b/common/ihevc_padding.c
new file mode 100644
index 0000000..dce8464
--- /dev/null
+++ b/common/ihevc_padding.c
@@ -0,0 +1,577 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_padding.c
+*
+* @brief
+* Contains function definitions for Padding
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_pad_horz_luma()
+* - ihevc_pad_horz_chroma()
+* - ihevc_pad_vert()
+* - ihevc_pad_left_luma()
+* - ihevc_pad_left_chroma()
+* - ihevc_pad_right_luma()
+* - ihevc_pad_right_chroma()
+* - ihevc_pad_top()
+* - ihevc_pad_bottom()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_mem_fns.h"
+/**
+*******************************************************************************
+*
+* @brief
+* Vertical padding of a 2d array
+*
+* @par Description:
+* The top and bottom rows of the array are replicated pad_size times
+* above and below it respectively
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_vert(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 1; row <= pad_size; row++)
+ {
+ memcpy(pu1_src - row * src_strd, pu1_src, wd);
+ memcpy(pu1_src + (ht + row - 1) * src_strd,
+ pu1_src + (ht - 1) * src_strd, wd);
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Horizontal padding of a 2d chroma (interleaved Cb/Cr) array
+*
+* @par Description:
+* The leftmost and rightmost UV pairs of each row are replicated
+* pad_size / 2 times to the left and right respectively
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_horz_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ UWORD16 *pu2_src = (UWORD16 *)pu1_src;
+
+ src_strd >>= 1;
+ wd >>= 1;
+ pad_size >>= 1;
+
+ for(row = 0; row < ht; row++)
+ {
+ UWORD16 u2_uv_val;
+
+ u2_uv_val = pu2_src[0];
+ ihevc_memset_16bit(&pu2_src[-pad_size], u2_uv_val, pad_size);
+
+ u2_uv_val = pu2_src[wd - 1];
+ ihevc_memset_16bit(&pu2_src[wd], u2_uv_val, pad_size);
+
+ pu2_src += src_strd;
+ }
+}
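+
+/* Note: chroma is stored interleaved (Cb/Cr), so padding above works on
+ * UWORD16 UV pairs; the stride, width and pad size are halved to count
+ * pairs rather than bytes */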
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Horizontal padding of a 2d luma array
+*
+* @par Description:
+* The leftmost and rightmost pixels of each row are replicated pad_size
+* times to the left and right respectively
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_horz_luma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 0; row < ht; row++)
+ {
+ memset(pu1_src - pad_size, *pu1_src, pad_size);
+ memset(pu1_src + wd, *(pu1_src + wd - 1), pad_size);
+
+ pu1_src += src_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding at the top of a 2d array
+*
+* @par Description:
+* The top row of a 2d array is replicated pad_size times at the top
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_top(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 1; row <= pad_size; row++)
+ {
+ memcpy(pu1_src - row * src_strd, pu1_src, wd);
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding at the bottom of a 2d array
+*
+* @par Description:
+* The bottom row of a 2d array is replicated pad_size times at the bottom
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] wd
+* integer width of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_bottom(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 1; row <= pad_size; row++)
+ {
+ memcpy(pu1_src + (row - 1) * src_strd,
+ pu1_src - 1 * src_strd, wd);
+ }
+}
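+
+/* Note: ihevc_pad_bottom expects pu1_src to point one row below the last
+ * valid row; the row at pu1_src - src_strd is the one replicated downwards */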
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated pad_size times at the left
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_luma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 0; row < ht; row++)
+ {
+ memset(pu1_src - pad_size, *pu1_src, pad_size);
+
+ pu1_src += src_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated pad_size times at the left
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD16 *pu2_src = (UWORD16 *)pu1_src;
+
+ src_strd >>= 1;
+ pad_size >>= 1;
+
+ for(row = 0; row < ht; row++)
+ {
+ UWORD16 u2_uv_val;
+
+ u2_uv_val = pu2_src[0];
+ for(col = -pad_size; col < 0; col++)
+ pu2_src[col] = u2_uv_val;
+
+ pu2_src += src_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated pad_size times at the right
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_luma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+
+ for(row = 0; row < ht; row++)
+ {
+ memset(pu1_src, *(pu1_src - 1), pad_size);
+
+ pu1_src += src_strd;
+ }
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated pad_size times at the right
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD16 *pu2_src = (UWORD16 *)pu1_src;
+
+ src_strd >>= 1;
+ pad_size >>= 1;
+
+ for(row = 0; row < ht; row++)
+ {
+ UWORD16 u2_uv_val;
+
+ u2_uv_val = pu2_src[-1];
+ for(col = 0; col < pad_size; col++)
+ pu2_src[col] = u2_uv_val;
+
+ pu2_src += src_strd;
+ }
+}
+
diff --git a/common/ihevc_padding.h b/common/ihevc_padding.h
new file mode 100644
index 0000000..349ac12
--- /dev/null
+++ b/common/ihevc_padding.h
@@ -0,0 +1,209 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_padding.h
+*
+* @brief
+* Declarations for the functions defined in ihevc_padding.c
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PADDING_H_
+#define _IHEVC_PADDING_H_
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+
+typedef void ihevc_pad_horz_luma_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_horz_luma_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_horz_chroma_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_horz_chroma_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_vert_ft(
+ UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_vert_ft(
+ UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_top_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_top_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_bottom_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_bottom_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 wd,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_left_luma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_left_luma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_left_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_left_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_right_luma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_right_luma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_pad_right_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+typedef void ihevc_hbd_pad_right_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size);
+
+/* C function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma;
+ihevc_pad_vert_ft ihevc_pad_vert;
+ihevc_pad_top_ft ihevc_pad_top;
+ihevc_pad_bottom_ft ihevc_pad_bottom;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma;
+
+ihevc_hbd_pad_horz_luma_ft ihevc_hbd_pad_horz_luma;
+ihevc_hbd_pad_horz_chroma_ft ihevc_hbd_pad_horz_chroma;
+ihevc_hbd_pad_vert_ft ihevc_hbd_pad_vert;
+ihevc_hbd_pad_top_ft ihevc_hbd_pad_top;
+ihevc_hbd_pad_bottom_ft ihevc_hbd_pad_bottom;
+ihevc_hbd_pad_left_luma_ft ihevc_hbd_pad_left_luma;
+ihevc_hbd_pad_left_chroma_ft ihevc_hbd_pad_left_chroma;
+ihevc_hbd_pad_right_luma_ft ihevc_hbd_pad_right_luma;
+ihevc_hbd_pad_right_chroma_ft ihevc_hbd_pad_right_chroma;
+
+/* A9 Q function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_a9q;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_a9q;
+ihevc_pad_vert_ft ihevc_pad_vert_a9q;
+ihevc_pad_top_ft ihevc_pad_top_a9q;
+ihevc_pad_bottom_ft ihevc_pad_bottom_a9q;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_a9q;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_a9q;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_a9q;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_a9q;
+
+/* A9 A function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_a9a;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_a9a;
+ihevc_pad_vert_ft ihevc_pad_vert_a9a;
+ihevc_pad_top_ft ihevc_pad_top_a9a;
+ihevc_pad_bottom_ft ihevc_pad_bottom_a9a;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_a9a;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_a9a;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_a9a;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_a9a;
+
+/* NEONINTR function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_neonintr;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_neonintr;
+ihevc_pad_vert_ft ihevc_pad_vert_neonintr;
+ihevc_pad_top_ft ihevc_pad_top_neonintr;
+ihevc_pad_bottom_ft ihevc_pad_bottom_neonintr;
+/* SSSE3 function declarations */
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_ssse3;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_ssse3;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_ssse3;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_ssse3;
+
+/* armv8 function declarations */
+ihevc_pad_horz_luma_ft ihevc_pad_horz_luma_av8;
+ihevc_pad_horz_chroma_ft ihevc_pad_horz_chroma_av8;
+ihevc_pad_vert_ft ihevc_pad_vert_av8;
+ihevc_pad_top_ft ihevc_pad_top_av8;
+ihevc_pad_bottom_ft ihevc_pad_bottom_av8;
+ihevc_pad_left_luma_ft ihevc_pad_left_luma_av8;
+ihevc_pad_left_chroma_ft ihevc_pad_left_chroma_av8;
+ihevc_pad_right_luma_ft ihevc_pad_right_luma_av8;
+ihevc_pad_right_chroma_ft ihevc_pad_right_chroma_av8;
+
+#endif /*_IHEVC_PADDING_H_*/
diff --git a/common/ihevc_quant_tables.c b/common/ihevc_quant_tables.c
new file mode 100644
index 0000000..10ccc0b
--- /dev/null
+++ b/common/ihevc_quant_tables.c
@@ -0,0 +1,471 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_quant_tables.c
+*
+* @brief
+* Contains tables used in forward and inverse quantization
+*
+* @author
+* 100189
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_quant_tables.h"
+#include "ihevc_defs.h"
+
+
+
+/** Default flat Scaling matrix for 32x32 transform
+ * Since the values are all the same, the 32x32 matrix is used for all transform sizes
+ */
+const WORD16 gi2_flat_scale_mat_32x32[] =
+{
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+
+};
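+
+/* A flat matrix of 16s is the neutral (unity) scaling factor; it is the
+ * matrix in effect when scaling lists are disabled in the bitstream */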
+
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 8x8 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_scale_mat_8x8[] =
+{
+ 16, 16, 16, 16, 17, 18, 21, 24,
+ 16, 16, 16, 16, 17, 19, 22, 25,
+ 16, 16, 17, 18, 20, 22, 25, 29,
+ 16, 16, 18, 21, 24, 27, 31, 36,
+ 17, 17, 20, 24, 30, 35, 41, 47,
+ 18, 19, 22, 27, 35, 44, 54, 65,
+ 21, 22, 25, 31, 41, 54, 70, 88,
+ 24, 25, 29, 36, 47, 65, 88, 115
+};
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 8x8 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_scale_mat_8x8[] =
+{
+ 16, 16, 16, 16, 17, 18, 20, 24,
+ 16, 16, 16, 17, 18, 20, 24, 25,
+ 16, 16, 17, 18, 20, 24, 25, 28,
+ 16, 17, 18, 20, 24, 25, 28, 33,
+ 17, 18, 20, 24, 25, 28, 33, 41,
+ 18, 20, 24, 25, 28, 33, 41, 54,
+ 20, 24, 25, 28, 33, 41, 54, 71,
+ 24, 25, 28, 33, 41, 54, 71, 91
+};
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 16x16 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_scale_mat_16x16[] =
+{
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 21, 21, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 21, 21, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 19, 19, 22, 22, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 19, 19, 22, 22, 25, 25,
+ 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 22, 22, 25, 25, 29, 29,
+ 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 22, 22, 25, 25, 29, 29,
+ 16, 16, 16, 16, 18, 18, 21, 21, 24, 24, 27, 27, 31, 31, 36, 36,
+ 16, 16, 16, 16, 18, 18, 21, 21, 24, 24, 27, 27, 31, 31, 36, 36,
+ 17, 17, 17, 17, 20, 20, 24, 24, 30, 30, 35, 35, 41, 41, 47, 47,
+ 17, 17, 17, 17, 20, 20, 24, 24, 30, 30, 35, 35, 41, 41, 47, 47,
+ 18, 18, 19, 19, 22, 22, 27, 27, 35, 35, 44, 44, 54, 54, 65, 65,
+ 18, 18, 19, 19, 22, 22, 27, 27, 35, 35, 44, 44, 54, 54, 65, 65,
+ 21, 21, 22, 22, 25, 25, 31, 31, 41, 41, 54, 54, 70, 70, 88, 88,
+ 21, 21, 22, 22, 25, 25, 31, 31, 41, 41, 54, 54, 70, 70, 88, 88,
+ 24, 24, 25, 25, 29, 29, 36, 36, 47, 47, 65, 65, 88, 88, 115, 115,
+ 24, 24, 25, 25, 29, 29, 36, 36, 47, 47, 65, 65, 88, 88, 115, 115
+};
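+
+/* As the repeated entries show, the default 16x16 (and 32x32) matrices are
+ * the default 8x8 matrix upsampled by replication, which is how the
+ * standard derives them */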
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 16x16 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_scale_mat_16x16[] =
+{
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24,
+ 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25,
+ 16, 16, 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25,
+ 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28,
+ 16, 16, 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28,
+ 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33,
+ 16, 16, 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33,
+ 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41,
+ 17, 17, 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41,
+ 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54,
+ 18, 18, 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54,
+ 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71,
+ 20, 20, 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71,
+ 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71, 91, 91,
+ 24, 24, 25, 25, 28, 28, 33, 33, 41, 41, 54, 54, 71, 71, 91, 91
+};
+
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 32x32 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_scale_mat_32x32[] =
+{
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 19, 19, 19, 19, 22, 22, 22, 22, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29, 29, 29, 29,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29, 29, 29, 29,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29, 29, 29, 29,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 25, 25, 25, 25, 29, 29, 29, 29,
+ 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36, 36, 36, 36,
+ 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36, 36, 36, 36,
+ 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36, 36, 36, 36,
+ 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 21, 21, 21, 21, 24, 24, 24, 24, 27, 27, 27, 27, 31, 31, 31, 31, 36, 36, 36, 36,
+ 17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47, 47, 47, 47,
+ 17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47, 47, 47, 47,
+ 17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47, 47, 47, 47,
+ 17, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 35, 35, 35, 35, 41, 41, 41, 41, 47, 47, 47, 47,
+ 18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65, 65, 65, 65,
+ 18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65, 65, 65, 65,
+ 18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65, 65, 65, 65,
+ 18, 18, 18, 18, 19, 19, 19, 19, 22, 22, 22, 22, 27, 27, 27, 27, 35, 35, 35, 35, 44, 44, 44, 44, 54, 54, 54, 54, 65, 65, 65, 65,
+ 21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88, 88, 88, 88,
+ 21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88, 88, 88, 88,
+ 21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88, 88, 88, 88,
+ 21, 21, 21, 21, 22, 22, 22, 22, 25, 25, 25, 25, 31, 31, 31, 31, 41, 41, 41, 41, 54, 54, 54, 54, 70, 70, 70, 70, 88, 88, 88, 88,
+ 24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115,
+ 24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115,
+ 24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115,
+ 24, 24, 24, 24, 25, 25, 25, 25, 29, 29, 29, 29, 36, 36, 36, 36, 47, 47, 47, 47, 65, 65, 65, 65, 88, 88, 88, 88, 115, 115, 115, 115
+};
+
+/**
+*
+* @brief default scaling matrix as specified by standard
+* 32x32 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_scale_mat_32x32[] =
+{
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28,
+ 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+ 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+ 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+ 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33,
+ 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+ 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+ 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+ 17, 17, 17, 17, 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41,
+ 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+ 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+ 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+ 18, 18, 18, 18, 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54,
+ 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+ 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+ 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+ 20, 20, 20, 20, 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71,
+ 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91,
+ 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91,
+ 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91,
+ 24, 24, 24, 24, 25, 25, 25, 25, 28, 28, 28, 28, 33, 33, 33, 33, 41, 41, 41, 41, 54, 54, 54, 54, 71, 71, 71, 71, 91, 91, 91, 91
+};
+
+
+
+/** Default flat rescaling matrix for 32x32 transform
+ * used for quantization
+ * value[i] = ceil(((1 << 15) - 1) / gi2_flat_scale_mat_32x32[i]) = ceil(32767 / 16) = 2048
+ * Since the values are all the same, the 32x32 matrix is used for all transform sizes
+ */
+
+const WORD16 gi2_flat_rescale_mat_32x32[] =
+{
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048
+
+};
+
+
+/**
+* Default rescaling matrix for the 8x8 intra transform as defined by the standard,
+* used for quantization
+* value[i] = ceil(((1 << 15) - 1) / gi2_intra_default_scale_mat_8x8[i])
+*/
+
+const WORD16 gi2_intra_default_rescale_mat_8x8[] =
+{
+ 2048, 2048, 2048, 2048, 1928, 1821, 1561, 1366,
+ 2048, 2048, 2048, 2048, 1928, 1725, 1490, 1311,
+ 2048, 2048, 1928, 1821, 1639, 1490, 1311, 1130,
+ 2048, 2048, 1821, 1561, 1366, 1214, 1057, 911,
+ 1928, 1928, 1639, 1366, 1093, 937, 800, 698,
+ 1821, 1725, 1490, 1214, 937, 745, 607, 505,
+ 1561, 1490, 1311, 1057, 800, 607, 469, 373,
+ 1366, 1311, 1130, 911, 698, 505, 373, 285
+};
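These rescale entries follow from the standard's default 8x8 intra scale matrix through the ceil(((1 << 15) - 1) / scale) relation quoted above. A minimal sketch of that derivation; the first-row scale values below are the standard's defaults, reproduced only for illustration, and derive_rescale is a hypothetical helper, not part of this file:

#include <stdio.h>

/* Hypothetical helper: one rescale entry from one scale entry,
 * i.e. ceil(32767 / scale) in integer arithmetic. */
static int derive_rescale(int scale)
{
    return (32767 + scale - 1) / scale;
}

int main(void)
{
    /* First row of the default 8x8 intra scale matrix. */
    const int scale_row[8] = { 16, 16, 16, 16, 17, 18, 21, 24 };
    int i;

    for(i = 0; i < 8; i++)
        printf("%d ", derive_rescale(scale_row[i]));
    printf("\n"); /* prints: 2048 2048 2048 2048 1928 1821 1561 1366 */
    return 0;
}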
+
+/**
+*
+* @brief default rescaling matrix as specified by the standard
+* 8x8 inter matrix
+* value[i] = ceil(((1 << 15) - 1) / gi2_inter_default_scale_mat_8x8[i])
+*
+*/
+const WORD16 gi2_inter_default_rescale_mat_8x8[] =
+{
+ 2048, 2048, 2048, 2048, 1928, 1821, 1639, 1366,
+ 2048, 2048, 2048, 1928, 1821, 1639, 1366, 1311,
+ 2048, 2048, 1928, 1821, 1639, 1366, 1311, 1171,
+ 2048, 1928, 1821, 1639, 1366, 1311, 1171, 993,
+ 1928, 1821, 1639, 1366, 1311, 1171, 993, 800,
+ 1821, 1639, 1366, 1311, 1171, 993, 800, 607,
+ 1639, 1366, 1311, 1171, 993, 800, 607, 462,
+ 1366, 1311, 1171, 993, 800, 607, 462, 361
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by the standard
+* 16x16 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_rescale_mat_16x16[] =
+{
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1561, 1561, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1561, 1561, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1725, 1725, 1490, 1490, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1725, 1725, 1490, 1490, 1311, 1311,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1490, 1490, 1311, 1311, 1130, 1130,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1490, 1490, 1311, 1311, 1130, 1130,
+ 2048, 2048, 2048, 2048, 1821, 1821, 1561, 1561, 1366, 1366, 1214, 1214, 1057, 1057, 911, 911,
+ 2048, 2048, 2048, 2048, 1821, 1821, 1561, 1561, 1366, 1366, 1214, 1214, 1057, 1057, 911, 911,
+ 1928, 1928, 1928, 1928, 1639, 1639, 1366, 1366, 1093, 1093, 937, 937, 800, 800, 698, 698,
+ 1928, 1928, 1928, 1928, 1639, 1639, 1366, 1366, 1093, 1093, 937, 937, 800, 800, 698, 698,
+ 1821, 1821, 1725, 1725, 1490, 1490, 1214, 1214, 937, 937, 745, 745, 607, 607, 505, 505,
+ 1821, 1821, 1725, 1725, 1490, 1490, 1214, 1214, 937, 937, 745, 745, 607, 607, 505, 505,
+ 1561, 1561, 1490, 1490, 1311, 1311, 1057, 1057, 800, 800, 607, 607, 469, 469, 373, 373,
+ 1561, 1561, 1490, 1490, 1311, 1311, 1057, 1057, 800, 800, 607, 607, 469, 469, 373, 373,
+ 1366, 1366, 1311, 1311, 1130, 1130, 911, 911, 698, 698, 505, 505, 373, 373, 285, 285,
+ 1366, 1366, 1311, 1311, 1130, 1130, 911, 911, 698, 698, 505, 505, 373, 373, 285, 285
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by the standard
+* 16x16 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_rescale_mat_16x16[] =
+{
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171,
+ 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993,
+ 2048, 2048, 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993,
+ 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800,
+ 1928, 1928, 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800,
+ 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800, 607, 607,
+ 1821, 1821, 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800, 607, 607,
+ 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800, 607, 607, 462, 462,
+ 1639, 1639, 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800, 607, 607, 462, 462,
+ 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800, 607, 607, 462, 462, 361, 361,
+ 1366, 1366, 1311, 1311, 1171, 1171, 993, 993, 800, 800, 607, 607, 462, 462, 361, 361
+};
+
+/**
+*
+* @brief default rescaling matrix as specified by the standard
+* 32x32 intra matrix
+*
+*/
+const WORD16 gi2_intra_default_rescale_mat_32x32[] =
+{
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057, 911, 911, 911, 911,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057, 911, 911, 911, 911,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057, 911, 911, 911, 911,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1821, 1821, 1821, 1821, 1561, 1561, 1561, 1561, 1366, 1366, 1366, 1366, 1214, 1214, 1214, 1214, 1057, 1057, 1057, 1057, 911, 911, 911, 911,
+ 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093, 937, 937, 937, 937, 800, 800, 800, 800, 698, 698, 698, 698,
+ 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093, 937, 937, 937, 937, 800, 800, 800, 800, 698, 698, 698, 698,
+ 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093, 937, 937, 937, 937, 800, 800, 800, 800, 698, 698, 698, 698,
+ 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1928, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1093, 1093, 1093, 1093, 937, 937, 937, 937, 800, 800, 800, 800, 698, 698, 698, 698,
+ 1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214, 937, 937, 937, 937, 745, 745, 745, 745, 607, 607, 607, 607, 505, 505, 505, 505,
+ 1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214, 937, 937, 937, 937, 745, 745, 745, 745, 607, 607, 607, 607, 505, 505, 505, 505,
+ 1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214, 937, 937, 937, 937, 745, 745, 745, 745, 607, 607, 607, 607, 505, 505, 505, 505,
+ 1821, 1821, 1821, 1821, 1725, 1725, 1725, 1725, 1490, 1490, 1490, 1490, 1214, 1214, 1214, 1214, 937, 937, 937, 937, 745, 745, 745, 745, 607, 607, 607, 607, 505, 505, 505, 505,
+ 1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057, 800, 800, 800, 800, 607, 607, 607, 607, 469, 469, 469, 469, 373, 373, 373, 373,
+ 1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057, 800, 800, 800, 800, 607, 607, 607, 607, 469, 469, 469, 469, 373, 373, 373, 373,
+ 1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057, 800, 800, 800, 800, 607, 607, 607, 607, 469, 469, 469, 469, 373, 373, 373, 373,
+ 1561, 1561, 1561, 1561, 1490, 1490, 1490, 1490, 1311, 1311, 1311, 1311, 1057, 1057, 1057, 1057, 800, 800, 800, 800, 607, 607, 607, 607, 469, 469, 469, 469, 373, 373, 373, 373,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130, 911, 911, 911, 911, 698, 698, 698, 698, 505, 505, 505, 505, 373, 373, 373, 373, 285, 285, 285, 285,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130, 911, 911, 911, 911, 698, 698, 698, 698, 505, 505, 505, 505, 373, 373, 373, 373, 285, 285, 285, 285,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130, 911, 911, 911, 911, 698, 698, 698, 698, 505, 505, 505, 505, 373, 373, 373, 373, 285, 285, 285, 285,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1130, 1130, 1130, 1130, 911, 911, 911, 911, 698, 698, 698, 698, 505, 505, 505, 505, 373, 373, 373, 373, 285, 285, 285, 285
+};
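Row for row, the 16x16 and 32x32 default rescale tables are the 8x8 table with each entry replicated into a 2x2 or 4x4 block, mirroring how the standard upsamples its default scaling lists (the separately signalled DC term coincides with the replicated value for these defaults). A minimal sketch of that expansion, assuming row-major tables; expand_default_table is a hypothetical helper:

/* Expand an 8x8 table to n x n (n = 16 or 32) by block replication. */
static void expand_default_table(const WORD16 *pi2_src_8x8,
                                 WORD16 *pi2_dst,
                                 WORD32 n)
{
    WORD32 row, col;
    WORD32 f = n / 8; /* replication factor: 2 for 16x16, 4 for 32x32 */

    for(row = 0; row < n; row++)
    {
        for(col = 0; col < n; col++)
        {
            pi2_dst[row * n + col] = pi2_src_8x8[(row / f) * 8 + (col / f)];
        }
    }
}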
+
+/**
+*
+* @brief default rescaling matrix as specified by the standard
+* 32x32 inter matrix
+*
+*/
+const WORD16 gi2_inter_default_rescale_mat_32x32[] =
+{
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+ 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993,
+ 2048, 2048, 2048, 2048, 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993,
+ 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800,
+ 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800,
+ 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800,
+ 1928, 1928, 1928, 1928, 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800,
+ 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607,
+ 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607,
+ 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607,
+ 1821, 1821, 1821, 1821, 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607,
+ 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462,
+ 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462,
+ 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462,
+ 1639, 1639, 1639, 1639, 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462, 361, 361, 361, 361,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462, 361, 361, 361, 361,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462, 361, 361, 361, 361,
+ 1366, 1366, 1366, 1366, 1311, 1311, 1311, 1311, 1171, 1171, 1171, 1171, 993, 993, 993, 993, 800, 800, 800, 800, 607, 607, 607, 607, 462, 462, 462, 462, 361, 361, 361, 361
+};
+
diff --git a/common/ihevc_quant_tables.h b/common/ihevc_quant_tables.h
new file mode 100644
index 0000000..76d1eea
--- /dev/null
+++ b/common/ihevc_quant_tables.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_quant_tables.h
+*
+* @brief
+* Tables for forward and inverse quantization
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_QUANT_TABLES_H_
+#define _IHEVC_QUANT_TABLES_H_
+
+extern const WORD16 gi2_flat_scale_mat_32x32[];
+
+extern const WORD16 gi2_intra_default_scale_mat_8x8[];
+
+extern const WORD16 gi2_inter_default_scale_mat_8x8[];
+
+extern const WORD16 gi2_intra_default_scale_mat_16x16[];
+
+extern const WORD16 gi2_inter_default_scale_mat_16x16[];
+
+extern const WORD16 gi2_intra_default_scale_mat_32x32[];
+
+extern const WORD16 gi2_inter_default_scale_mat_32x32[];
+
+
+extern const WORD16 gi2_flat_rescale_mat_32x32[];
+
+extern const WORD16 gi2_intra_default_rescale_mat_8x8[];
+
+extern const WORD16 gi2_inter_default_rescale_mat_8x8[];
+
+extern const WORD16 gi2_intra_default_rescale_mat_16x16[];
+
+extern const WORD16 gi2_inter_default_rescale_mat_16x16[];
+
+extern const WORD16 gi2_intra_default_rescale_mat_32x32[];
+
+extern const WORD16 gi2_inter_default_rescale_mat_32x32[];
+
+#endif
diff --git a/common/ihevc_recon.c b/common/ihevc_recon.c
new file mode 100644
index 0000000..9d7015e
--- /dev/null
+++ b/common/ihevc_recon.c
@@ -0,0 +1,461 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_recon.c
+ *
+ * @brief
+ * Function definitions for reconstruction
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevc_recon_4x4_ttype1()
+ * - ihevc_recon_4x4()
+ * - ihevc_recon_8x8()
+ * - ihevc_recon_16x16()
+ * - ihevc_recon_32x32()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+/* All the functions here are replicated from ihevc.c and modified to */
+/* include reconstruction */
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 4x4 input block
+ *
+ * @par Description:
+ * Performs reconstruction of a 4x4 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_4x4_ttype1(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_4;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst++;
+ pu1_pred++;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 4x4 input block
+ *
+ * @par Description:
+ * Performs reconstruction of a 4x4 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_4x4(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_4;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst++;
+ pu1_pred++;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 8x8 input block
+ *
+ * @par Description:
+ * Performs reconstruction of an 8x8 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_8x8(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_8;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst++;
+ pu1_pred++;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 16x16 input block
+ *
+ * @par Description:
+ * Performs reconstruction of a 16x16 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_16x16(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_16;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst++;
+ pu1_pred++;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs reconstruction for 32x32 input block
+ *
+ * @par Description:
+ * Performs reconstruction of a 32x32 input block by adding the prediction
+ * data to the input and clipping the result to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 32x32 coefficients
+ *
+ * @param[in] pu1_pred
+ * Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ * Output 32x32 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_recon_32x32(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols)
+{
+ WORD32 i, j;
+ WORD32 trans_size;
+
+ trans_size = TRANS_SIZE_32;
+
+ /* Reconstruction */
+
+ for(i = 0; i < trans_size; i++)
+ {
+ /* Checking for Zero Cols */
+ if((zero_cols & 1) == 1)
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+ }
+ }
+ else
+ {
+ for(j = 0; j < trans_size; j++)
+ {
+ pu1_dst[j * dst_strd] =
+ CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+ }
+ }
+ pi2_src++;
+ pu1_dst++;
+ pu1_pred++;
+ zero_cols = zero_cols >> 1;
+ }
+}
+
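The zero_cols argument taken by every function above is a bitmask with one bit per column of the transform block: a set bit marks a column whose inverse-transform output is all zero, so the loops copy the prediction for it instead of adding and clipping. A minimal sketch of how such a mask could be built; build_zero_cols is a hypothetical helper, not part of this file:

/* Hypothetical helper: set bit i of the mask when column i of the
 * residual block is entirely zero. */
static WORD32 build_zero_cols(WORD16 *pi2_src,
                              WORD32 src_strd,
                              WORD32 trans_size)
{
    WORD32 i, j;
    WORD32 zero_cols = 0;

    for(i = 0; i < trans_size; i++)
    {
        for(j = 0; j < trans_size; j++)
        {
            if(0 != pi2_src[j * src_strd + i])
                break;
        }
        if(j == trans_size)
            zero_cols |= 1 << i; /* column i carries no residual */
    }
    return zero_cols;
}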
diff --git a/common/ihevc_recon.h b/common/ihevc_recon.h
new file mode 100644
index 0000000..37711ec
--- /dev/null
+++ b/common/ihevc_recon.h
@@ -0,0 +1,124 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_recon.h
+*
+* @brief
+* Function declarations for reconstruction
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_RECON_H_
+#define _IHEVC_RECON_H_
+
+typedef void ihevc_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_recon_4x4_ttype1_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_recon_4x4_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_recon_8x8_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_recon_16x16_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+typedef void ihevc_recon_32x32_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+typedef void ihevc_hbd_recon_32x32_ft(WORD16 *pi2_src,
+ UWORD16 *pu2_pred,
+ UWORD16 *pu2_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ UWORD8 bit_depth);
+
+ihevc_recon_4x4_ttype1_ft ihevc_recon_4x4_ttype1;
+ihevc_hbd_recon_4x4_ttype1_ft ihevc_hbd_recon_4x4_ttype1;
+ihevc_recon_4x4_ft ihevc_recon_4x4;
+ihevc_hbd_recon_4x4_ft ihevc_hbd_recon_4x4;
+ihevc_recon_8x8_ft ihevc_recon_8x8;
+ihevc_hbd_recon_8x8_ft ihevc_hbd_recon_8x8;
+ihevc_recon_16x16_ft ihevc_recon_16x16;
+ihevc_hbd_recon_16x16_ft ihevc_hbd_recon_16x16;
+ihevc_recon_32x32_ft ihevc_recon_32x32;
+ihevc_hbd_recon_32x32_ft ihevc_hbd_recon_32x32;
+
+#endif /*_IHEVC_RECON_H_*/
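The _ft typedefs above give each routine (including the high bit depth variants) a named function-pointer type, presumably so that an initialization step can pick between these C versions and architecture-specific ones such as the NEON code under common/arm. A minimal sketch of that selection pattern; recon_fxns_t and init_recon_fxns are hypothetical:

/* Hypothetical table of selected reconstruction routines. */
typedef struct
{
    ihevc_recon_4x4_ft *pf_recon_4x4;
    ihevc_recon_32x32_ft *pf_recon_32x32;
} recon_fxns_t;

static void init_recon_fxns(recon_fxns_t *ps_fxns)
{
    /* Default to the C reference implementations; an ARM build
     * could substitute NEON variants here instead. */
    ps_fxns->pf_recon_4x4 = &ihevc_recon_4x4;
    ps_fxns->pf_recon_32x32 = &ihevc_recon_32x32;
}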
diff --git a/common/ihevc_sao.c b/common/ihevc_sao.c
new file mode 100644
index 0000000..3b41f0d
--- /dev/null
+++ b/common/ihevc_sao.c
@@ -0,0 +1,1374 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_sao.c
+*
+* @brief
+* Contains leaf level function definitions for sample adaptive offset process
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_sao_band_offset_luma()
+* - ihevc_sao_band_offset_chroma()
+* - ihevc_sao_edge_offset_class0()
+* - ihevc_sao_edge_offset_class0_chroma()
+* - ihevc_sao_edge_offset_class1()
+* - ihevc_sao_edge_offset_class1_chroma()
+* - ihevc_sao_edge_offset_class2()
+* - ihevc_sao_edge_offset_class2_chroma()
+* - ihevc_sao_edge_offset_class3()
+* - ihevc_sao_edge_offset_class3_chroma()
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_sao.h"
+
+#define NUM_BAND_TABLE 32
+
+const WORD32 gi4_ihevc_table_edge_idx[5] = { 1, 2, 0, 3, 4 };
+/**
+ * pu1_avail is an array of flags - one for each neighboring block, specifying whether that block is available
+ * pu1_avail[0] - left
+ * pu1_avail[1] - right
+ * pu1_avail[2] - top
+ * pu1_avail[3] - bottom
+ * pu1_avail[4] - top-left
+ * pu1_avail[5] - top-right
+ * pu1_avail[6] - bottom-left
+ * pu1_avail[7] - bottom-right
+ */
+
+
+void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 band_shift;
+ WORD32 band_table[NUM_BAND_TABLE];
+ WORD32 i;
+ WORD32 row, col;
+
+ /* Updating left and top and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 1];
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+ band_shift = BIT_DEPTH_LUMA - 5;
+ for(i = 0; i < NUM_BAND_TABLE; i++)
+ {
+ band_table[i] = 0;
+ }
+ for(i = 0; i < 4; i++)
+ {
+ band_table[(i + sao_band_pos) & 31] = i + 1;
+ }
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 band_idx;
+
+ band_idx = band_table[pu1_src[col] >> band_shift];
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[band_idx], 0, (1 << (band_shift + 5)) - 1);
+ }
+ pu1_src += src_strd;
+ }
+}
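With 8-bit luma, band_shift is 3, so pu1_src[col] >> band_shift classifies each pixel into one of 32 bands, and only the four bands starting at sao_band_pos get a non-zero offset index. A small worked check under those assumptions, with sao_band_pos taken as 12 purely for illustration:

/* Worked check of the band classification (8-bit, band_shift == 3,
 * sao_band_pos == 12 assumed). */
static void band_idx_worked_check(void)
{
    WORD32 band_100 = 100 >> 3; /* == 12: first band in the window, band_table gives offset index 1 */
    WORD32 band_200 = 200 >> 3; /* == 25: outside the window, band_table gives 0, pixel unchanged */
    (void)band_100;
    (void)band_200;
}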
+
+
+
+/* input 'wd' has to be for the interleaved block and not for each color component */
+void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos_u,
+ WORD32 sao_band_pos_v,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 band_shift;
+ WORD32 band_table_u[NUM_BAND_TABLE];
+ WORD32 band_table_v[NUM_BAND_TABLE];
+ WORD32 i;
+ WORD32 row, col;
+
+ /* Updating left and top and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+ pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+
+ band_shift = BIT_DEPTH_CHROMA - 5;
+ for(i = 0; i < NUM_BAND_TABLE; i++)
+ {
+ band_table_u[i] = 0;
+ band_table_v[i] = 0;
+ }
+ for(i = 0; i < 4; i++)
+ {
+ band_table_u[(i + sao_band_pos_u) & 31] = i + 1;
+ band_table_v[(i + sao_band_pos_v) & 31] = i + 1;
+ }
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 band_idx;
+ WORD8 *pi1_sao_offset;
+
+ pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+ band_idx = (0 == col % 2) ? band_table_u[pu1_src[col] >> band_shift] : band_table_v[pu1_src[col] >> band_shift];
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[band_idx], 0, (1 << (band_shift + 5)) - 1);
+ }
+ pu1_src += src_strd;
+ }
+}
+
+
+
+/* Horizontal filtering */
+void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE];
+ WORD8 u1_sign_left, u1_sign_right;
+ WORD32 bit_depth;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+ bit_depth = BIT_DEPTH_LUMA;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update top and top-left arrays */
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+ /* Update masks based on the availability flags */
+ if(0 == pu1_avail[0])
+ {
+ au1_mask[0] = 0;
+ }
+ if(0 == pu1_avail[1])
+ {
+ au1_mask[wd - 1] = 0;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ u1_sign_left = SIGN(pu1_src[0] - pu1_src_left[row]);
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+
+ u1_sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
+ edge_idx = 2 + u1_sign_left + u1_sign_right;
+ u1_sign_left = -u1_sign_right;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+ }
+
+ /* Update left array */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+
+}
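The raw category 2 + SIGN(c - left) + SIGN(c - right) ranges over 0..4, and gi4_ihevc_table_edge_idx remaps it so that 0 means "no edge, apply no offset": a local minimum (raw 0) maps to 1, a flat or monotone run (raw 2) maps to 0, and a local maximum (raw 4) stays 4. A small worked check with illustrative pixel values:

/* Worked check of the edge classification above. */
static void edge_idx_worked_check(void)
{
    /* Local minimum: both neighbours larger -> raw category 0,
     * remapped to 1, so pi1_sao_offset[1] would be applied. */
    WORD32 raw_valley = 2 + SIGN(90 - 100) + SIGN(90 - 100); /* 0 */

    /* Flat run: both differences zero -> raw category 2,
     * remapped to 0, so the pixel is left unchanged. */
    WORD32 raw_flat = 2 + SIGN(90 - 90) + SIGN(90 - 90); /* 2 */

    (void)raw_valley;
    (void)raw_flat;
}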
+
+
+
+
+/* input 'wd' has to be for the interleaved block and not for each color component */
+void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE];
+ WORD8 u1_sign_left_u, u1_sign_right_u;
+ WORD8 u1_sign_left_v, u1_sign_right_v;
+ WORD32 bit_depth;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+ bit_depth = BIT_DEPTH_CHROMA;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
+ au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+ /* Update masks based on the availability flags */
+ if(0 == pu1_avail[0])
+ {
+ au1_mask[0] = 0;
+ }
+ if(0 == pu1_avail[1])
+ {
+ au1_mask[(wd - 1) >> 1] = 0;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ u1_sign_left_u = SIGN(pu1_src[0] - pu1_src_left[2 * row]);
+ u1_sign_left_v = SIGN(pu1_src[1] - pu1_src_left[2 * row + 1]);
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+ WORD8 *pi1_sao_offset;
+
+ if(0 == col % 2)
+ {
+ pi1_sao_offset = pi1_sao_offset_u;
+ u1_sign_right_u = SIGN(pu1_src[col] - pu1_src[col + 2]);
+ edge_idx = 2 + u1_sign_left_u + u1_sign_right_u;
+ u1_sign_left_u = -u1_sign_right_u;
+ }
+ else
+ {
+ pi1_sao_offset = pi1_sao_offset_v;
+ u1_sign_right_v = SIGN(pu1_src[col] - pu1_src[col + 2]);
+ edge_idx = 2 + u1_sign_left_v + u1_sign_right_v;
+ u1_sign_left_v = -u1_sign_right_v;
+ }
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+ }
+
+ for(row = 0; row < 2 * ht; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+
+}
+
+
+
+/* Vertical filtering */
+void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
+ WORD8 au1_sign_up[MAX_CTB_SIZE];
+ WORD8 u1_sign_down;
+ WORD32 bit_depth;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ bit_depth = BIT_DEPTH_LUMA;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src += src_strd;
+ ht--;
+ for(col = 0; col < wd; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
+ }
+ }
+ else
+ {
+ for(col = 0; col < wd; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col]);
+ }
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+
+ u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+ edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+ au1_sign_up[col] = -u1_sign_down;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+ }
+
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = au1_src_top_tmp[col];
+ }
+
+}
+
+
+
+/* input 'wd' has to be for the interleaved block and not for each color component */
+void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
+ WORD8 au1_sign_up[MAX_CTB_SIZE];
+ WORD8 u1_sign_down;
+ WORD32 bit_depth;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ bit_depth = BIT_DEPTH_CHROMA;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[2 * row] = pu1_src[row * src_strd + wd - 2];
+ pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src += src_strd;
+ ht--;
+ for(col = 0; col < wd; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
+ }
+ }
+ else
+ {
+ for(col = 0; col < wd; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col]);
+ }
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+ WORD8 *pi1_sao_offset;
+
+ pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+
+ u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+ edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+ au1_sign_up[col] = -u1_sign_down;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+ }
+
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = au1_src_top_tmp[col];
+ }
+
+}
+
+
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
+ UWORD8 u1_src_top_left_tmp;
+ WORD8 au1_sign_up[MAX_CTB_SIZE + 1], au1_sign_up_tmp[MAX_CTB_SIZE + 1];
+ WORD8 u1_sign_down;
+ WORD8 *pu1_sign_up;
+ WORD8 *pu1_sign_up_tmp;
+ UWORD8 *pu1_src_left_cpy;
+
+ WORD32 bit_depth;
+ UWORD8 u1_pos_0_0_tmp;
+ UWORD8 u1_pos_wd_ht_tmp;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_sign_up = au1_sign_up;
+ pu1_sign_up_tmp = au1_sign_up_tmp;
+ pu1_src_left_cpy = pu1_src_left;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ u1_src_top_left_tmp = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+
+ /* If top-left is available, process separately */
+ if(0 != pu1_avail[4])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+ SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp = pu1_src[0];
+ }
+ }
+ else
+ {
+ u1_pos_0_0_tmp = pu1_src[0];
+ }
+
+ /* If bottom-right is available, process separately */
+ if(0 != pu1_avail[7])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+
+ /* If Left is not available */
+ if(0 == pu1_avail[0])
+ {
+ au1_mask[0] = 0;
+ }
+
+ /* If Top is not available */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src += src_strd;
+ ht--;
+ pu1_src_left_cpy += 1;
+ for(col = 1; col < wd; col++)
+ {
+ pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
+ }
+ }
+ else
+ {
+ for(col = 1; col < wd; col++)
+ {
+ pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col - 1]);
+ }
+ }
+
+ /* If Right is not available */
+ if(0 == pu1_avail[1])
+ {
+ au1_mask[wd - 1] = 0;
+ }
+
+ /* If Bottom is not available */
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ pu1_sign_up[0] = SIGN(pu1_src[0] - pu1_src_left_cpy[row - 1]);
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+
+ u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
+ edge_idx = 2 + pu1_sign_up[col] + u1_sign_down;
+ pu1_sign_up_tmp[col + 1] = -u1_sign_down;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ /* Swapping pu1_sign_up_tmp and pu1_sign_up */
+ {
+ WORD8 *pu1_swap_tmp = pu1_sign_up;
+ pu1_sign_up = pu1_sign_up_tmp;
+ pu1_sign_up_tmp = pu1_swap_tmp;
+ }
+
+ pu1_src += src_strd;
+ }
+
+ pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd] = u1_pos_0_0_tmp;
+ pu1_src[(pu1_avail[3] ? wd - 1 - src_strd : wd - 1)] = u1_pos_wd_ht_tmp;
+ }
+
+ if(0 == pu1_avail[2])
+ ht++;
+ if(0 == pu1_avail[3])
+ ht++;
+ *pu1_src_top_left = u1_src_top_left_tmp;
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = au1_src_top_tmp[col];
+ }
+
+}
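In class2 each pixel is compared with its top-left and bottom-right neighbours, so the sign computed against the pixel below-right of column col is exactly the next row's "up" sign for column col + 1; that one-column shift is why results are staged into pu1_sign_up_tmp[col + 1] and the two sign buffers are swapped after every row. In equation form, for row r and column c:

/* u1_sign_down at (r, c)     = SIGN(src[r][c] - src[r + 1][c + 1])
 * sign up for (r + 1, c + 1) = -u1_sign_down
 * edge_idx at (r + 1, c + 1) = 2 + SIGN(src[r + 1][c + 1] - src[r][c])
 *                                + SIGN(src[r + 1][c + 1] - src[r + 2][c + 2]) */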
+
+
+
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
+ UWORD8 au1_src_top_left_tmp[2];
+ WORD8 au1_sign_up[MAX_CTB_SIZE + 2], au1_sign_up_tmp[MAX_CTB_SIZE + 2];
+ WORD8 u1_sign_down;
+ WORD8 *pu1_sign_up;
+ WORD8 *pu1_sign_up_tmp;
+ UWORD8 *pu1_src_left_cpy;
+
+ WORD32 bit_depth;
+
+ UWORD8 u1_pos_0_0_tmp_u;
+ UWORD8 u1_pos_0_0_tmp_v;
+ UWORD8 u1_pos_wd_ht_tmp_u;
+ UWORD8 u1_pos_wd_ht_tmp_v;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+
+ bit_depth = BIT_DEPTH_CHROMA;
+ pu1_sign_up = au1_sign_up;
+ pu1_sign_up_tmp = au1_sign_up_tmp;
+ pu1_src_left_cpy = pu1_src_left;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ au1_src_top_left_tmp[0] = pu1_src_top[wd - 2];
+ au1_src_top_left_tmp[1] = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
+ au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+
+ /* If top-left is available, process separately */
+ if(0 != pu1_avail[4])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+ SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp_u = pu1_src[0];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
+ SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp_v = pu1_src[1];
+ }
+ }
+ else
+ {
+ u1_pos_0_0_tmp_u = pu1_src[0];
+ u1_pos_0_0_tmp_v = pu1_src[1];
+ }
+
+ /* If bottom-right is available, process separately */
+ if(0 != pu1_avail[7])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+
+ /* If Left is not available */
+ if(0 == pu1_avail[0])
+ {
+ au1_mask[0] = 0;
+ }
+
+ /* If Top is not available */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src += src_strd;
+ pu1_src_left_cpy += 2;
+ ht--;
+ for(col = 2; col < wd; col++)
+ {
+ pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - 2 - src_strd]);
+ }
+ }
+ else
+ {
+ for(col = 2; col < wd; col++)
+ {
+ pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col - 2]);
+ }
+ }
+
+ /* If Right is not available */
+ if(0 == pu1_avail[1])
+ {
+ au1_mask[(wd - 1) >> 1] = 0;
+ }
+
+ /* If Bottom is not available */
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ pu1_sign_up[0] = SIGN(pu1_src[0] - pu1_src_left_cpy[2 * (row - 1)]);
+ pu1_sign_up[1] = SIGN(pu1_src[1] - pu1_src_left_cpy[2 * (row - 1) + 1]);
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+ WORD8 *pi1_sao_offset;
+
+ pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+
+ u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + 2 + src_strd]);
+ edge_idx = 2 + pu1_sign_up[col] + u1_sign_down;
+ pu1_sign_up_tmp[col + 2] = -u1_sign_down;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ /* Swapping pu1_sign_up_tmp and pu1_sign_up */
+ {
+ WORD8 *pu1_swap_tmp = pu1_sign_up;
+ pu1_sign_up = pu1_sign_up_tmp;
+ pu1_sign_up_tmp = pu1_swap_tmp;
+ }
+
+ pu1_src += src_strd;
+ }
+
+ pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd] = u1_pos_0_0_tmp_u;
+ pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + 1] = u1_pos_0_0_tmp_v;
+ pu1_src[(pu1_avail[3] ? wd - 2 - src_strd : wd - 2)] = u1_pos_wd_ht_tmp_u;
+ pu1_src[(pu1_avail[3] ? wd - 1 - src_strd : wd - 1)] = u1_pos_wd_ht_tmp_v;
+ }
+
+ if(0 == pu1_avail[2])
+ ht++;
+ if(0 == pu1_avail[3])
+ ht++;
+ pu1_src_top_left[0] = au1_src_top_left_tmp[0];
+ pu1_src_top_left[1] = au1_src_top_left_tmp[1];
+ for(row = 0; row < 2 * ht; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = au1_src_top_tmp[col];
+ }
+
+}
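+
+/* Illustrative sketch (not part of the decoder): each edge offset class above
+ * classifies a pixel p against its two neighbours a and b along the class
+ * direction as edge_idx = 2 + SIGN(p - a) + SIGN(p - b), which lies in [0, 4].
+ * gi4_ihevc_table_edge_idx (presumably the same {1, 2, 0, 3, 4} mapping as the
+ * high bit depth table declared in ihevc_sao.h) then remaps the flat case 2 to
+ * 0 so that no offset is applied:
+ *
+ * p = 10, a = 12, b = 11 -> 2 - 1 - 1 = 0 -> remapped 1 (local minimum)
+ * p = 10, a = 10, b = 10 -> 2 + 0 + 0 = 2 -> remapped 0 (flat, no offset)
+ * p = 10, a = 8, b = 9 -> 2 + 1 + 1 = 4 -> remapped 4 (local maximum)
+ */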
+
+
+
+
+/* 45 degree filtering */
+void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE];
+ UWORD8 u1_src_top_left_tmp;
+ WORD8 au1_sign_up[MAX_CTB_SIZE];
+ UWORD8 *pu1_src_left_cpy;
+ WORD8 u1_sign_down;
+ WORD32 bit_depth;
+
+ UWORD8 u1_pos_0_ht_tmp;
+ UWORD8 u1_pos_wd_0_tmp;
+
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_left_cpy = pu1_src_left;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ u1_src_top_left_tmp = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+ }
+
+ /* If Left is not available */
+ if(0 == pu1_avail[0])
+ {
+ au1_mask[0] = 0;
+ }
+
+ /* If Top is not available */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src += src_strd;
+ ht--;
+ pu1_src_left_cpy += 1;
+ for(col = 0; col < wd - 1; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd]);
+ }
+ }
+ else
+ {
+ for(col = 0; col < wd - 1; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col + 1]);
+ }
+ }
+
+ /* If Right is not available */
+ if(0 == pu1_avail[1])
+ {
+ au1_mask[wd - 1] = 0;
+ }
+
+ /* If Bottom is not available */
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ au1_sign_up[wd - 1] = SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 + 1 - src_strd]);
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+
+ u1_sign_down = SIGN(pu1_src[col] - ((col == 0) ? pu1_src_left_cpy[row + 1] :
+ pu1_src[col - 1 + src_strd]));
+ edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+ if(col > 0)
+ au1_sign_up[col - 1] = -u1_sign_down;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+
+ pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 1] = u1_pos_wd_0_tmp;
+ pu1_src[(pu1_avail[3] ? (-src_strd) : 0)] = u1_pos_0_ht_tmp;
+ }
+
+ if(0 == pu1_avail[2])
+ ht++;
+ if(0 == pu1_avail[3])
+ ht++;
+ *pu1_src_top_left = u1_src_top_left_tmp;
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = au1_src_top_tmp[col];
+ }
+
+}
+
+
+
+
+void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 au1_mask[MAX_CTB_SIZE];
+ UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
+ UWORD8 au1_src_top_left_tmp[2];
+ WORD8 au1_sign_up[MAX_CTB_SIZE];
+ UWORD8 *pu1_src_left_cpy;
+ WORD8 u1_sign_down;
+ WORD32 bit_depth;
+
+ UWORD8 u1_pos_wd_0_tmp_u;
+ UWORD8 u1_pos_wd_0_tmp_v;
+ UWORD8 u1_pos_0_ht_tmp_u;
+ UWORD8 u1_pos_0_ht_tmp_v;
+
+ bit_depth = BIT_DEPTH_CHROMA;
+ pu1_src_left_cpy = pu1_src_left;
+
+ /* Initialize the mask values */
+ memset(au1_mask, 0xFF, MAX_CTB_SIZE);
+
+ /* Update left, top and top-left arrays */
+ au1_src_top_left_tmp[0] = pu1_src_top[wd - 2];
+ au1_src_top_left_tmp[1] = pu1_src_top[wd - 1];
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
+ au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
+ }
+
+
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+
+ /* If Left is not available */
+ if(0 == pu1_avail[0])
+ {
+ au1_mask[0] = 0;
+ }
+
+ /* If Top is not available */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src += src_strd;
+ ht--;
+ pu1_src_left_cpy += 2;
+ for(col = 0; col < wd - 2; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col + 2 - src_strd]);
+ }
+ }
+ else
+ {
+ for(col = 0; col < wd - 2; col++)
+ {
+ au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col + 2]);
+ }
+ }
+
+ /* If Right is not available */
+ if(0 == pu1_avail[1])
+ {
+ au1_mask[(wd - 1) >> 1] = 0;
+ }
+
+ /* If Bottom is not available */
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ /* Processing is done on the intermediate buffer and the output is written to the source buffer */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ au1_sign_up[wd - 2] = SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 + 2 - src_strd]);
+ au1_sign_up[wd - 1] = SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 + 2 - src_strd]);
+ for(col = 0; col < wd; col++)
+ {
+ WORD32 edge_idx;
+ WORD8 *pi1_sao_offset;
+
+ pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
+
+ u1_sign_down = SIGN(pu1_src[col] - ((col < 2) ? pu1_src_left_cpy[2 * (row + 1) + col] :
+ pu1_src[col - 2 + src_strd]));
+ edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
+ if(col > 1)
+ au1_sign_up[col - 2] = -u1_sign_down;
+
+ edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];
+
+ if(0 != edge_idx)
+ {
+ pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+
+ pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 2] = u1_pos_wd_0_tmp_u;
+ pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 1] = u1_pos_wd_0_tmp_v;
+ pu1_src[(pu1_avail[3] ? (-src_strd) : 0)] = u1_pos_0_ht_tmp_u;
+ pu1_src[(pu1_avail[3] ? (-src_strd) : 0) + 1] = u1_pos_0_ht_tmp_v;
+ }
+
+ if(0 == pu1_avail[2])
+ ht++;
+ if(0 == pu1_avail[3])
+ ht++;
+ pu1_src_top_left[0] = au1_src_top_left_tmp[0];
+ pu1_src_top_left[1] = au1_src_top_left_tmp[1];
+ for(row = 0; row < 2 * ht; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ for(col = 0; col < wd; col++)
+ {
+ pu1_src_top[col] = au1_src_top_tmp[col];
+ }
+
+}
diff --git a/common/ihevc_sao.h b/common/ihevc_sao.h
new file mode 100644
index 0000000..7d6fafa
--- /dev/null
+++ b/common/ihevc_sao.h
@@ -0,0 +1,402 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+******************************************************************************
+* @file ihevc_sao.h
+*
+* @brief
+* This file contains enumerations, macros and extern declarations of HEVC
+* SAO functions
+*
+* @author
+* Ittiam
+******************************************************************************
+*/
+
+#ifndef _IHEVC_SAO_H_
+#define _IHEVC_SAO_H_
+
+enum
+{
+ SAO_NONE,
+
+ SAO_BAND,
+
+ SAO_EDGE_0_DEG,
+
+ SAO_EDGE_90_DEG,
+
+ SAO_EDGE_135_DEG,
+
+ SAO_EDGE_45_DEG
+};
+
+static const WORD32 gi4_ihevc_hbd_table_edge_idx[5] = { 1, 2, 0, 3, 4 };
+
+typedef void ihevc_sao_band_offset_luma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos,
+ WORD8 *pi4_sao_offset,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_band_offset_luma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ WORD32 sao_band_pos,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bitdepth);
+
+typedef void ihevc_sao_band_offset_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos_u,
+ WORD32 sao_band_pos_v,
+ WORD8 *pi4_sao_offset_u,
+ WORD8 *pi4_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_band_offset_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ WORD32 sao_band_pos_u,
+ WORD32 sao_band_pos_v,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class0_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class0_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class0_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset_u,
+ WORD8 *pi4_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class0_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class1_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class1_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class1_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset_u,
+ WORD8 *pi4_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class1_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class2_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class2_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class2_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset_u,
+ WORD8 *pi4_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class2_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+
+typedef void ihevc_sao_edge_offset_class3_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class3_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+typedef void ihevc_sao_edge_offset_class3_chroma_ft(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi4_sao_offset_u,
+ WORD8 *pi4_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht);
+
+typedef void ihevc_hbd_sao_edge_offset_class3_chroma_ft(UWORD16 *pu2_src,
+ WORD32 src_strd,
+ UWORD16 *pu2_src_left,
+ UWORD16 *pu2_src_top,
+ UWORD16 *pu2_src_top_left,
+ UWORD16 *pu2_src_top_right,
+ UWORD16 *pu2_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht,
+ UWORD32 bit_depth);
+/* C function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma;
+
+/* NEONINTR function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_neonintr;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_neonintr;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_neonintr;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_neonintr;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_neonintr;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_neonintr;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_neonintr;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_neonintr;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_neonintr;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_neonintr;
+
+/* A9Q function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_a9q;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_a9q;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_a9q;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_a9q;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_a9q;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_a9q;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_a9q;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_a9q;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_a9q;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_a9q;
+
+/* A9A (Apple) function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_a9a;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_a9a;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_a9a;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_a9a;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_a9a;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_a9a;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_a9a;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_a9a;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_a9a;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_a9a;
+
+/* SSSE3 function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_ssse3;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_ssse3;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_ssse3;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_ssse3;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_ssse3;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_ssse3;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_ssse3;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_ssse3;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_ssse3;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_ssse3;
+
+/* SSE4 function declarations */
+
+/* C high bit depth function declarations */
+ihevc_hbd_sao_band_offset_luma_ft ihevc_hbd_sao_band_offset_luma;
+ihevc_hbd_sao_band_offset_chroma_ft ihevc_hbd_sao_band_offset_chroma;
+ihevc_hbd_sao_edge_offset_class0_ft ihevc_hbd_sao_edge_offset_class0;
+ihevc_hbd_sao_edge_offset_class0_chroma_ft ihevc_hbd_sao_edge_offset_class0_chroma;
+ihevc_hbd_sao_edge_offset_class1_ft ihevc_hbd_sao_edge_offset_class1;
+ihevc_hbd_sao_edge_offset_class1_chroma_ft ihevc_hbd_sao_edge_offset_class1_chroma;
+ihevc_hbd_sao_edge_offset_class2_ft ihevc_hbd_sao_edge_offset_class2;
+ihevc_hbd_sao_edge_offset_class2_chroma_ft ihevc_hbd_sao_edge_offset_class2_chroma;
+ihevc_hbd_sao_edge_offset_class3_ft ihevc_hbd_sao_edge_offset_class3;
+ihevc_hbd_sao_edge_offset_class3_chroma_ft ihevc_hbd_sao_edge_offset_class3_chroma;
+
+/* SSE4.2 HBD function declarations */
+ihevc_hbd_sao_band_offset_luma_ft ihevc_hbd_sao_band_offset_luma_sse42;
+ihevc_hbd_sao_band_offset_chroma_ft ihevc_hbd_sao_band_offset_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class0_ft ihevc_hbd_sao_edge_offset_class0_sse42;
+ihevc_hbd_sao_edge_offset_class0_chroma_ft ihevc_hbd_sao_edge_offset_class0_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class1_ft ihevc_hbd_sao_edge_offset_class1_sse42;
+ihevc_hbd_sao_edge_offset_class1_chroma_ft ihevc_hbd_sao_edge_offset_class1_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class2_ft ihevc_hbd_sao_edge_offset_class2_sse42;
+ihevc_hbd_sao_edge_offset_class2_chroma_ft ihevc_hbd_sao_edge_offset_class2_chroma_sse42;
+ihevc_hbd_sao_edge_offset_class3_ft ihevc_hbd_sao_edge_offset_class3_sse42;
+ihevc_hbd_sao_edge_offset_class3_chroma_ft ihevc_hbd_sao_edge_offset_class3_chroma_sse42;
+
+/* armv8 function declarations */
+ihevc_sao_band_offset_luma_ft ihevc_sao_band_offset_luma_av8;
+ihevc_sao_band_offset_chroma_ft ihevc_sao_band_offset_chroma_av8;
+ihevc_sao_edge_offset_class0_ft ihevc_sao_edge_offset_class0_av8;
+ihevc_sao_edge_offset_class0_chroma_ft ihevc_sao_edge_offset_class0_chroma_av8;
+ihevc_sao_edge_offset_class1_ft ihevc_sao_edge_offset_class1_av8;
+ihevc_sao_edge_offset_class1_chroma_ft ihevc_sao_edge_offset_class1_chroma_av8;
+ihevc_sao_edge_offset_class2_ft ihevc_sao_edge_offset_class2_av8;
+ihevc_sao_edge_offset_class2_chroma_ft ihevc_sao_edge_offset_class2_chroma_av8;
+ihevc_sao_edge_offset_class3_ft ihevc_sao_edge_offset_class3_av8;
+ihevc_sao_edge_offset_class3_chroma_ft ihevc_sao_edge_offset_class3_chroma_av8;
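+
+/* Illustrative sketch (not part of this header): the _ft typedefs above allow
+ * an architecture-specific variant to be bound once through a function pointer
+ * and called uniformly thereafter; the pointer name below is hypothetical:
+ *
+ * ihevc_sao_band_offset_luma_ft *pf_sao_band_offset_luma =
+ * use_neon ? ihevc_sao_band_offset_luma_a9q : ihevc_sao_band_offset_luma;
+ * pf_sao_band_offset_luma(pu1_src, src_strd, pu1_src_left, pu1_src_top,
+ * pu1_src_top_left, sao_band_pos, pi1_sao_offset, wd, ht);
+ */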
+
+#endif /* _IHEVC_SAO_H_ */
diff --git a/common/ihevc_structs.h b/common/ihevc_structs.h
new file mode 100644
index 0000000..26857d8
--- /dev/null
+++ b/common/ihevc_structs.h
@@ -0,0 +1,2884 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_structs.h
+ *
+ * @brief
+ * Structure definitions used in the code
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVC_STRUCTS_H_
+#define _IHEVC_STRUCTS_H_
+
+/**
+ * Picture buffer
+ */
+typedef struct
+{
+ UWORD8 *pu1_luma;
+ UWORD8 *pu1_chroma;
+
+ WORD32 i4_abs_poc;
+ WORD32 i4_poc_lsb;
+ /** Used to store display Timestamp for current buffer */
+ WORD32 u4_ts;
+ UWORD8 u1_used_as_ref;
+
+ UWORD8 u1_free_delay_cnt;
+
+ /**
+ * buffer ID from buffer manager
+ */
+ UWORD8 u1_buf_id;
+
+}pic_buf_t;
+
+
+/**
+ * Reference List
+ */
+typedef struct
+{
+ void *pv_pic_buf;
+
+ void *pv_mv_buf;
+
+ UWORD8 u1_used_as_ref;
+
+}ref_list_t;
+
+
+/**
+ * SAO
+ */
+typedef struct
+{
+ /**
+ * sao_type_idx_luma
+ */
+ UWORD32 b3_y_type_idx : 3;
+
+ /**
+ * luma SaoOffsetVal[1]
+ */
+ WORD32 b4_y_offset_1 : 4;
+
+ /**
+ * luma SaoOffsetVal[2]
+ */
+ WORD32 b4_y_offset_2 : 4;
+
+ /**
+ * luma SaoOffsetVal[3]
+ */
+ WORD32 b4_y_offset_3 : 4;
+
+ /**
+ * luma SaoOffsetVal[4]
+ */
+ WORD32 b4_y_offset_4 : 4;
+
+ /**
+ * luma sao_band_position
+ */
+ UWORD32 b5_y_band_pos : 5;
+
+ WORD32 : 0;
+
+ /**
+ * sao_type_idx_chroma
+ */
+ UWORD32 b3_cb_type_idx : 3;
+
+ /**
+ * chroma SaoOffsetVal[1]
+ */
+ WORD32 b4_cb_offset_1 : 4;
+
+ /**
+ * chroma SaoOffsetVal[2]
+ */
+ WORD32 b4_cb_offset_2 : 4;
+
+ /**
+ * chroma SaoOffsetVal[3]
+ */
+ WORD32 b4_cb_offset_3 : 4;
+
+ /**
+ * chroma SaoOffsetVal[4]
+ */
+ WORD32 b4_cb_offset_4 : 4;
+
+ /**
+ * cb sao_band_position
+ */
+ UWORD32 b5_cb_band_pos : 5;
+
+ WORD32 : 0;
+
+ /**
+ * sao_type_idx_chroma
+ */
+ UWORD32 b3_cr_type_idx : 3;
+
+ /**
+ * chroma SaoOffsetVal[1]
+ */
+ WORD32 b4_cr_offset_1 : 4;
+
+ /**
+ * chroma SaoOffsetVal[2]
+ */
+ WORD32 b4_cr_offset_2 : 4;
+
+ /**
+ * chroma SaoOffsetVal[3]
+ */
+ WORD32 b4_cr_offset_3 : 4;
+
+ /**
+ * chroma SaoOffsetVal[4]
+ */
+ WORD32 b4_cr_offset_4 : 4;
+
+ /**
+ * cr sao_band_position
+ */
+ UWORD32 b5_cr_band_pos : 5;
+
+ WORD32 : 0;
+
+}sao_t;
+
+/**
+ * SAO
+ */
+typedef struct
+{
+ /**
+ * sao_type_idx_luma
+ */
+ UWORD32 b3_y_type_idx : 3;
+
+ /**
+ * luma SaoOffsetVal[1]
+ */
+ WORD32 b8_y_offset_1 : 8;
+
+ /**
+ * luma SaoOffsetVal[2]
+ */
+ WORD32 b8_y_offset_2 : 8;
+
+ /**
+ * luma SaoOffsetVal[3]
+ */
+ WORD32 b8_y_offset_3 : 8;
+
+ /**
+ * luma SaoOffsetVal[4]
+ */
+ WORD32 b8_y_offset_4 : 8;
+
+ /**
+ * luma sao_band_position
+ */
+ UWORD32 b5_y_band_pos : 5;
+
+ WORD32 : 0;
+
+ /**
+ * sao_type_idx_chroma
+ */
+ UWORD32 b3_cb_type_idx : 3;
+
+ /**
+ * chroma SaoOffsetVal[1]
+ */
+ WORD32 b8_cb_offset_1 : 8;
+
+ /**
+ * chroma SaoOffsetVal[2]
+ */
+ WORD32 b8_cb_offset_2 : 8;
+
+ /**
+ * chroma SaoOffsetVal[3]
+ */
+ WORD32 b8_cb_offset_3 : 8;
+
+ /**
+ * chroma SaoOffsetVal[4]
+ */
+ WORD32 b8_cb_offset_4 : 8;
+
+ /**
+ * cb sao_band_position
+ */
+ UWORD32 b5_cb_band_pos : 5;
+
+ WORD32 : 0;
+
+ /**
+ * sao_type_idx_chroma
+ */
+ UWORD32 b3_cr_type_idx : 3;
+
+ /**
+ * chroma SaoOffsetVal[1]
+ */
+ WORD32 b8_cr_offset_1 : 8;
+
+ /**
+ * chroma SaoOffsetVal[2]
+ */
+ WORD32 b8_cr_offset_2 : 8;
+
+ /**
+ * chroma SaoOffsetVal[3]
+ */
+ WORD32 b8_cr_offset_3 : 8;
+
+ /**
+ * chroma SaoOffsetVal[4]
+ */
+ WORD32 b8_cr_offset_4 : 8;
+
+ /**
+ * cr sao_band_position
+ */
+ UWORD32 b5_cr_band_pos : 5;
+
+ WORD32 : 0;
+
+}sao_10bd_t;
+
+/**
+ * Motion vector
+ */
+typedef struct
+{
+ /**
+ * Horizontal Motion Vector
+ */
+ WORD16 i2_mvx;
+
+ /**
+ * Vertical Motion Vector
+ */
+ WORD16 i2_mvy;
+}mv_t;
+
+/*****************************************************************************/
+/* Following results in packed 48 bit structure. If mv_t included */
+/* ref_pic_buf_id, then 8 bits will be wasted for each mv for aligning. */
+/* Also, mv_t is used as elements directly instead of pointers to the l0 */
+/* and l1 mvs: since a pointer takes 4 bytes and an MV itself is 4 bytes, */
+/* using pointers does not really help. */
+/*****************************************************************************/
+
+/**
+ * PU Motion Vector info
+ */
+typedef struct
+{
+ /**
+ * L0 Motion Vector
+ */
+ mv_t s_l0_mv;
+
+ /**
+ * L1 Motion Vector
+ */
+ mv_t s_l1_mv;
+
+ /**
+ * L0 Ref index
+ */
+ WORD8 i1_l0_ref_idx;
+
+ /**
+ * L1 Ref index
+ */
+ WORD8 i1_l1_ref_idx;
+
+ /**
+ * L0 Ref Pic Buf ID
+ */
+ WORD8 i1_l0_ref_pic_buf_id;
+
+ /**
+ * L1 Ref Pic Buf ID
+ */
+ WORD8 i1_l1_ref_pic_buf_id;
+
+}pu_mv_t;
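+
+/* Illustrative check (assumes the usual 16-bit WORD16 / 8-bit WORD8 typedefs):
+ * each list packs into 48 bits (32-bit mv_t + 8-bit ref idx + 8-bit buf id),
+ * so mv_t is 4 bytes and pu_mv_t is 12 bytes, verifiable at build time with:
+ *
+ * typedef char chk_mv_t_size[(sizeof(mv_t) == 4) ? 1 : -1];
+ * typedef char chk_pu_mv_t_size[(sizeof(pu_mv_t) == 12) ? 1 : -1];
+ */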
+
+/**
+ * PU information
+ */
+typedef struct
+{
+
+ /**
+ * PU motion vectors
+ */
+ pu_mv_t mv;
+
+ /**
+ * PU X position in terms of min PU (4x4) units
+ */
+ UWORD32 b4_pos_x : 4;
+
+ /**
+ * PU Y position in terms of min PU (4x4) units
+ */
+ UWORD32 b4_pos_y : 4;
+
+ /**
+ * PU width in pixels = (b4_wd + 1) << 2
+ */
+ UWORD32 b4_wd : 4;
+
+ /**
+ * PU height in pixels = (b4_ht + 1) << 2
+ */
+ UWORD32 b4_ht : 4;
+
+ /**
+ * Intra or Inter flag for each partition - 0 or 1
+ */
+ UWORD32 b1_intra_flag : 1;
+
+
+ /**
+ * PRED_L0, PRED_L1, PRED_BI - Initialized in parsing only for MVP case
+ */
+ UWORD32 b2_pred_mode : 2;
+
+
+/**
+ * Merge flag for each partition - 0 or 1
+ */
+ UWORD32 b1_merge_flag : 1;
+
+ /**
+ * Merge index for each partition - 0 to 4
+ */
+ UWORD32 b3_merge_idx : 3;
+
+ /*************************************************************************/
+ /* Following two flags can be overloaded with b3_merge_idx if there */
+ /* is need for additional bits */
+ /*************************************************************************/
+
+ /**
+ * If merge is zero, following gives presence of mvd for L0 MV
+ */
+ UWORD32 b1_l0_mvp_idx : 1;
+
+ /**
+ * If merge is zero, following gives presence of mvd for L1 MV
+ */
+ UWORD32 b1_l1_mvp_idx : 1;
+
+ /**
+ * Partition mode - Needed during MV merge stage
+ * Note: Part mode can be derived using pu_wd, pu_ht and minCB size
+ * If there is a need for bits, the following can be removed at the cost
+ * of more control code in MV Merge
+ */
+ UWORD32 b3_part_mode : 3;
+
+ /**
+ * Partition index - Needed during MV merge stage
+ */
+ UWORD32 b2_part_idx : 2;
+
+
+}pu_t;
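+
+/* Illustrative sketch (not part of the decoder): recovering PU geometry in
+ * pixels from the min-PU (4x4) unit fields above:
+ *
+ * pu_x = ps_pu->b4_pos_x << 2; pu_y = ps_pu->b4_pos_y << 2;
+ * pu_wd = (ps_pu->b4_wd + 1) << 2; pu_ht = (ps_pu->b4_ht + 1) << 2;
+ *
+ * so the 4-bit fields span 4 to 64 pixels (b4_wd = 15 gives the 64-pixel max).
+ */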
+
+/**
+ * TU information
+ */
+typedef struct
+{
+ /**
+ * TU X position in terms of min TU (4x4) units
+ */
+ UWORD32 b4_pos_x : 4;
+
+ /**
+ * TU Y position in terms of min TU (4x4) units
+ */
+ UWORD32 b4_pos_y : 4;
+
+
+ /*************************************************************************/
+ /* Luma TU size (width or height) = 1 << (b3_size + 2) */
+ /* i.e. 0 : 4, 1 : 8, 2: 16, 3: 32, 4: 64 */
+ /* Note: Though 64 x 64 TU is not possible, this size is supported to */
+ /* signal SKIP CUs or PCM CUs etc where transform is not called */
+ /* Chroma width will be half of luma except for 4x4 luma */
+ /*************************************************************************/
+ /**
+ * Luma TU size (width or height)
+ */
+ UWORD32 b3_size : 3; //To be changed.
+
+ /*************************************************************************/
+ /* Chroma present : For 4x4 Luma TUs only the fourth one contains Cb */
+ /* Cr info. For the first three TUs in 8x8 (for 4x4 luma) this will */
+ /* be zero. For all the other cases this will be 1 */
+ /*************************************************************************/
+
+ /**
+ * 4x4 Luma TUs only the fourth one contains cb,cr
+ * TODO: Check if this is really needed, cb_cbf and cr_cbf should be enough
+ */
+ //UWORD32 b1_chroma_present : 1;
+
+ /**
+ * Y CBF
+ */
+ UWORD32 b1_y_cbf : 1;
+
+ /**
+ * Cb CBF
+ */
+ UWORD32 b1_cb_cbf : 1;
+
+ /**
+ * Cr CBF
+ */
+ UWORD32 b1_cr_cbf : 1;
+
+
+ /**
+ * Flag to indicate if it is the first TU in a CU
+ */
+ UWORD32 b1_first_tu_in_cu : 1;
+
+ /**
+ * Transform quant bypass flag
+ */
+ UWORD32 b1_transquant_bypass : 1;
+
+ /**
+ * Y Qp
+ */
+ //UWORD32 b6_qp : 6; // BUG_FIX related to neighbour QPs in case of negative QP for HBD.
+ WORD32 b7_qp : 7;
+
+
+ /**
+ * Luma Intra Mode 0 - 34
+ */
+ UWORD32 b6_luma_intra_mode : 6;
+
+ /*************************************************************************/
+ /* Chroma Intra Mode Index 0 - 4: Actual mode (0, 1, 10, 26, 34, X) to be*/
+ /* derived using luma_intra_mode and the following */
+ /*************************************************************************/
+ /**
+ * Chroma Intra Mode Index 0 - 4
+ */
+ UWORD32 b3_chroma_intra_mode_idx : 3;
+
+
+}tu_t;
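+
+/* Illustrative sketch (not part of the decoder): the luma transform size is
+ * recovered as tu_size = 1 << (ps_tu->b3_size + 2), so b3_size 0..4 maps to
+ * 4, 8, 16, 32 and 64; chroma would be tu_size >> 1 except for 4x4 luma TUs.
+ */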
+
+/**
+ * CU information
+ */
+typedef struct
+{
+
+ /**
+ * CU X position in terms of min CU (8x8) units
+ */
+ UWORD32 b3_cu_pos_x :3;
+
+ /**
+ * CU Y position in terms of min CU (8x8) units
+ */
+ UWORD32 b3_cu_pos_y :3;
+
+ /**
+ * CU size in terms of min CU (8x8) units
+ */
+ UWORD32 b4_cu_size :4;
+
+ /**
+ * transquant bypass flag ; 0 for this encoder
+ */
+ UWORD32 b1_tq_bypass_flag :1;
+
+ /**
+ * CU skip flag
+ */
+ UWORD32 b1_skip_flag :1;
+
+ /**
+ * intra / inter CU flag
+ */
+ UWORD32 b1_pred_mode_flag :1;
+
+ /**
+ * indicates partition information for CU
+ * For intra: 0 for 2Nx2N, 1 for NxN (allowed only when CU = min CB size)
+ * For inter: @sa PART_SIZE_E
+ */
+ UWORD32 b3_part_mode :3;
+
+ /**
+ * 0 for this encoder
+ */
+ UWORD32 b1_pcm_flag :1;
+
+ /**
+ * only applicable for intra cu
+ */
+ UWORD32 b3_chroma_intra_pred_mode :3;
+
+ /**
+ * only applicable for intra cu
+ */
+ UWORD32 b1_prev_intra_luma_pred_flag0 :1;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b1_prev_intra_luma_pred_flag1 :1;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b1_prev_intra_luma_pred_flag2 :1;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b1_prev_intra_luma_pred_flag3 :1;
+
+ /**
+ * only applicable for luma intra cu
+ */
+ UWORD32 b2_mpm_idx0 :2;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b2_mpm_idx1 :2;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b2_mpm_idx2 :2;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b2_mpm_idx3 :2;
+
+ /**
+ * only applicable for intra cu
+ */
+ UWORD32 b5_rem_intra_pred_mode0 :5;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b5_rem_intra_pred_mode1 :5;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b5_rem_intra_pred_mode2 :5;
+
+ /**
+ * only applicable for intra cu and pred_mode=NxN
+ */
+ UWORD32 b5_rem_intra_pred_mode3 :5;
+
+ /**
+ * no residue flag for cu
+ */
+ UWORD32 b1_no_residual_syntax_flag :1;
+
+}cu_t;
+
+/*****************************************************************************/
+/* Since the following data will be accessed linearly (no random access */
+/* is needed for this) there is no need to store a frame level offset for */
+/* each CTB's TU data. Only a pointer to this is stored in CTB's structure */
+/*****************************************************************************/
+
+typedef struct
+{
+ /*************************************************************************/
+ /* Number of TUs filled in as_tu */
+ /* Having the first entry as 32 bit data, helps in keeping each of */
+ /* the structures aligned to 32 bits at CTB level */
+ /*************************************************************************/
+ /**
+ * Number of TUs filled in as_tu
+ */
+ WORD32 i4_tu_cnt;
+
+ /**
+ * Array to map each min TU unit to a corresponding entry in as_tu
+ */
+ UWORD8 au1_tu_map[MAX_TU_IN_CTB];
+
+ /*************************************************************************/
+ /* TU level information */
+ /* Though the allocation for as_pu as done to handle worst case data, */
+ /* only valid number of TUs will be filled in the following array. */
+ /* Next CTB starts after the valid as_tu entries */
+ /*************************************************************************/
+ /**
+ * TU level information
+ */
+ tu_t as_tu[MAX_TU_IN_CTB];
+
+}ctb_tu_list_t;
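+
+/* Illustrative sketch (assuming au1_tu_map is laid out in raster order of min
+ * TU (4x4) units; names are hypothetical): the TU covering pixel (x, y) inside
+ * a CTB of size ctb_size would be found as:
+ *
+ * idx = (y >> 2) * (ctb_size >> 2) + (x >> 2);
+ * ps_tu = &ps_list->as_tu[ps_list->au1_tu_map[idx]];
+ */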
+
+/*****************************************************************************/
+/* Info from last TU row of CTB is stored in a row level neighbour buffer */
+/* , which will be used for Boundary Strength computation */
+/*****************************************************************************/
+/**
+ * CTB neighbor info
+ */
+typedef struct
+{
+ /**
+ * Slice index of the ctb
+ */
+ UWORD16 u2_slice_idx;
+
+ /*************************************************************************/
+ /* CBF of bottom TU row (replicated in 4 pixel boundary) */
+ /* MSB contains CBF of first TU in the last row and LSB contains CBF */
+ /* of last TU in the last row */
+ /*************************************************************************/
+ /**
+ * CBF of bottom TU row
+ */
+ UWORD16 u2_packed_cbf;
+
+ /*************************************************************************/
+ /* QP of bottom TU row (replicated at 8 pixel boundary, since QP can */
+ /* not change at less than min CU granularity) */
+ /*************************************************************************/
+ /**
+ * QP of bottom TU row
+ */
+ UWORD8 au1_qp[MAX_CU_IN_CTB_ROW];
+
+}ctb_top_ny_info_t;
+
+/**
+ * CTB level info
+ */
+typedef struct _ctb_t
+{
+ /*************************************************************************/
+ /* Tile boundary can be detected by looking at tile start x and tile */
+ /* start y. And based on the tile, slice and frame boundary the */
+ /* following will be initialized. */
+ /*************************************************************************/
+ /**
+ * Pointer to left CTB
+ */
+ /* If not available, this will be set to NULL */
+ struct _ctb_t *ps_ctb_left;
+
+ /**
+ * Pointer to top-left CTB
+ */
+ /* If not available, this will be set to NULL */
+ ctb_top_ny_info_t *ps_ctb_ny_topleft;
+
+ /**
+ * Pointer to top CTB
+ */
+ /* If not available, this will be set to NULL */
+ ctb_top_ny_info_t *ps_ctb_ny_top;
+
+ /**
+ * Pointer to top-right CTB
+ */
+ /* If not available, this will be set to NULL */
+ ctb_top_ny_info_t *ps_ctb_ny_topright;
+
+ /*************************************************************************/
+ /* Pointer to PU data. */
+ /* This points to a MV Bank stored at frame level. Though this */
+ /* pointer can be derived by reading offset at frame level, it is */
+ /* stored here for faster access. Can be removed if storage of CTB */
+ /* structure is critical */
+ /*************************************************************************/
+ /**
+ * Pointer to PU data
+ */
+ pu_t *ps_pu;
+
+ /*************************************************************************/
+ /* Pointer to a PU map stored at frame level, */
+ /* Though this pointer can be derived by multiplying CTB address with */
+ /* number of minTUs in a CTB, it is stored here for faster access. */
+ /* Can be removed if storage of CTB structure is critical */
+ /*************************************************************************/
+ /**
+ * Pointer to a PU map stored at frame level
+ */
+ UWORD8 *pu1_pu_map;
+
+ /**
+ * Number of TUs filled in as_tu
+ */
+ /*************************************************************************/
+ /* Having the first entry as 32 bit data, helps in keeping each of */
+ /* the structures aligned to 32 bits at CTB level */
+ /*************************************************************************/
+ WORD32 i4_tu_cnt;
+
+ /**
+ * Array to map each min TU unit to a corresponding entry in as_tu
+ */
+ UWORD8 *pu1_tu_map;
+
+ /**
+ * TU level information
+ */
+ /*************************************************************************/
+ /* Though the allocation for as_pu as done to handle worst case data, */
+ /* only valid number of TUs will be filled in the following array. */
+ /* Next CTB starts after the valid as_tu entries */
+ /*************************************************************************/
+ tu_t *ps_tu;
+
+ /**
+ * Pointer to transform coeff data
+ */
+ /*************************************************************************/
+ /* Following format is repeated for every coded TU */
+ /* Luma Block */
+ /* num_coeffs : 16 bits */
+ /* zero_cols : 8 bits ( 1 bit per 4 columns) */
+ /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */
+ /* coeff_data : Non zero coefficients */
+ /* Cb Block (only for last TU in 4x4 case else for every luma TU) */
+ /* num_coeffs : 16 bits */
+ /* zero_cols : 8 bits ( 1 bit per 4 columns) */
+ /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */
+ /* coeff_data : Non zero coefficients */
+ /* Cr Block (only for last TU in 4x4 case else for every luma TU) */
+ /* num_coeffs : 16 bits */
+ /* zero_cols : 8 bits ( 1 bit per 4 columns) */
+ /* sig_coeff_map : ((TU Size * TU Size) + 31) >> 5 number of WORD32s */
+ /* coeff_data : Non zero coefficients */
+ /*************************************************************************/
+ void *pv_coeff_data;
+
+ /**
+ * Slice to which the CTB belongs to
+ */
+ WORD32 i4_slice_idx;
+
+ /**
+ * CTB column position
+ */
+ WORD32 i4_pos_x;
+
+ /**
+ * CTB row position
+ */
+ WORD32 i4_pos_y;
+
+ /**
+ * Number of PUs filled in ps_pu
+ */
+ WORD32 i4_pu_cnt;
+
+ /**
+ * Index of current PU being processed in ps_pu
+ */
+ /* Scratch variable set to 0 at the start of any PU processing function */
+ WORD32 i4_pu_idx;
+
+ /**
+ * Vertical Boundary strength
+ */
+ /* Two bits per edge,
+ stored in the format BS[15] | BS[14] | .. | BS[0] */
+ UWORD32 *pu4_vert_bs;
+
+ /**
+ * Horizontal Boundary strength
+ */
+
+ /* Two bits per edge,
+ stored in the format BS[15] | BS[14] | .. | BS[0] */
+ UWORD32 *pu4_horz_bs;
+
+ /**
+ * Qp array stored for each 8x8 pixels
+ */
+ UWORD8 *pu1_qp;
+
+ /**
+ * Pointer to current frame's pu_t array
+ */
+ pu_t *ps_frm_pu;
+
+ /**
+ * Pointer to current frame's pu_t index array, which stores starting index
+ * of pu_t for every CTB
+ */
+ UWORD32 *pu4_frm_pu_idx;
+
+ /**
+ * Pointer to current frame's pu map array
+ */
+ UWORD8 *pu1_frm_pu_map;
+
+ /*************************************************************************/
+ /* Need to add encoder specific elements for identifying the order of */
+ /* coding for CU, TU and PU if any */
+ /*************************************************************************/
+}ctb_t;
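+
+/* Illustrative sketch (assuming BS[0] occupies the least significant two bits,
+ * as the layout comments above suggest): the boundary strength of edge i in
+ * pu4_vert_bs or pu4_horz_bs would be extracted as:
+ *
+ * bs = (pu4_bs[i >> 4] >> (2 * (i & 15))) & 3;
+ */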
+
+/*****************************************************************************/
+/* The following can be used to typecast coefficient data that is stored */
+/* per subblock. Note that though i2_level is shown as an array that */
+/* holds 16 coefficients, only the first few entries will be valid. Next */
+/* subblocks data starts after the valid number of coefficients. Number */
+/* of non-zero coefficients will be derived using number of non-zero bits */
+/* in sig coeff map */
+/*****************************************************************************/
+/**
+ * Structure to hold coefficient info for a 4x4 subblock
+ */
+typedef struct
+{
+ /**
+ * sub block position
+ */
+ UWORD16 u2_subblk_pos;
+
+ /**
+ * significant coefficient map
+ */
+ UWORD16 u2_sig_coeff_map;
+
+ /**
+ * holds 16 coefficients
+ */
+ WORD16 ai2_level[SUBBLK_COEFF_CNT];
+}tu_sblk_coeff_data_t;
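+
+/* Illustrative sketch (names hypothetical, assumes tight 16-bit packing with
+ * no padding between subblocks): the number of valid ai2_level entries is the
+ * population count of u2_sig_coeff_map, which also locates the next subblock:
+ *
+ * WORD32 num_coeffs = 0;
+ * UWORD16 map = ps_sblk->u2_sig_coeff_map;
+ * while(map) { num_coeffs += map & 1; map >>= 1; }
+ * ps_next = (tu_sblk_coeff_data_t *)((WORD16 *)ps_sblk + 2 + num_coeffs);
+ */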
+
+
+
+/*************************************************************************/
+/* The following describes how each of the CU cases are handled */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For SKIP CU */
+/* One Inter PU with appropriate MV */
+/* One TU which says Y, Cb and Cr CBF is zero with size equal to CB size */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Inter CU */
+/* M Inter PU with appropriate MVs (M between 1 to 4) */
+/* N TU (N is number of TU in CU) */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Intra CU */
+/* N TU (N is number of TU in CU) */
+/* N Intra PU with appropriate pred modes for luma and chroma */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For Intra PCM CU */
+/* One TU which says transquant bypass is 1 with size equal to CB size */
+/* 1 Intra PU with pcm flag set to 1(which ensures no intra pred is done)*/
+/*************************************************************************/
+
+/*************************************************************************/
+/* For a CU where cu_transquant_bypass_flag is 1 */
+/* One TU which says transquant bypass is 1 with size equal to CB size */
+/* N Intra/Inter PUs */
+/*************************************************************************/
+
+/*************************************************************************/
+/* For a CU where no_residual_syntax_flag is 1 */
+/* One TU which says Y, Cb, Cr CBF is 0 with size equal to CB size */
+/* N Inter PUs */
+/*************************************************************************/
+
+#if 0
+
+/*************************************************************************/
+/* Keeping the following as arrays instead of pointers helps in */
+/* reducing number of redirections and hence faster access to the */
+/* data. But the downside is this results in unused memory holes */
+/* after each array, since the allocation is for worst case but */
+/* number of valid CU, TU and PU will be much lesser than worst case. */
+/* Since the holes are three per CTB, it should not be so much of a */
+/* problem. */
+/*************************************************************************/
+
+/* CU level information */
+/* TODO: If there is not much data that is stored at CU level, then the
+ following will be removed */
+cu_t as_cu[MAX_CU_IN_CTB];
+
+#endif
+
+/**
+ * Structure giving information about the tile
+ */
+typedef struct
+{
+ /* X position of the tile in the current frame in CTB units */
+ UWORD8 u1_pos_x;
+
+ /* Y position of the tile in the current frame in CTB units */
+ UWORD8 u1_pos_y;
+
+ /* Tile width in CTB units */
+ UWORD16 u2_wd;
+
+ /* Tile height in CTB units */
+ UWORD16 u2_ht;
+
+}tile_t;
+
+/**
+ * Structure to hold NAL header info
+ */
+
+typedef struct
+{
+ /**
+ * NAL unit type
+ */
+ WORD8 i1_nal_unit_type;
+
+ /**
+ * NAL temporal id
+ */
+ WORD8 i1_nuh_temporal_id;
+}nal_header_t;
+
+/**
+ * Structure to hold Profile tier level info for a given layer
+ */
+
+typedef struct
+{
+ /**
+ * profile_space
+ */
+ WORD8 i1_profile_space;
+
+ /**
+ * tier_flag
+ */
+ WORD8 i1_tier_flag;
+
+ /**
+ * profile_idc
+ */
+ WORD8 i1_profile_idc;
+
+ /**
+ * profile_compatibility_flag[]
+ */
+ WORD8 ai1_profile_compatibility_flag[MAX_PROFILE_COMPATBLTY];
+
+ /**
+ * progressive_source_flag
+ */
+ WORD8 i1_general_progressive_source_flag;
+
+ /**
+ * interlaced_source_flag
+ */
+ WORD8 i1_general_interlaced_source_flag;
+
+ /**
+ * non_packed_constraint_flag
+ */
+ WORD8 i1_general_non_packed_constraint_flag;
+
+ /**
+ * frame_only_constraint_flag
+ */
+ WORD8 i1_frame_only_constraint_flag;
+
+ /**
+ * level_idc
+ */
+ UWORD8 u1_level_idc;
+}profile_tier_lvl_t;
+
+/**
+ * Structure to hold Profile tier level info for all layers
+ */
+typedef struct
+{
+ /**
+ * Profile and tier information for general
+ */
+ profile_tier_lvl_t s_ptl_gen;
+
+ /**
+ * sub_layer_profile_present_flag[]
+ */
+ WORD8 ai1_sub_layer_profile_present_flag[VPS_MAX_SUB_LAYERS - 1];
+
+ /**
+ * sub_layer_level_present_flag[]
+ */
+ WORD8 ai1_sub_layer_level_present_flag[VPS_MAX_SUB_LAYERS - 1];
+
+ /**
+ * Profile and tier information for sub layers
+ */
+ profile_tier_lvl_t as_ptl_sub[VPS_MAX_SUB_LAYERS - 1];
+
+}profile_tier_lvl_info_t;
+
+/**
+ * Structure to hold short term reference picture set info
+ */
+typedef struct
+{
+ /**
+ * delta_poc_s0_minus1[ i ] and delta_poc_s1_minus1[ i ]
+ */
+ WORD16 ai2_delta_poc[MAX_DPB_SIZE];
+
+ /**
+ * inter_ref_pic_set_prediction_flag
+ */
+ WORD8 i1_inter_ref_pic_set_prediction_flag;
+
+ /**
+ * num_negative_pics
+ */
+ WORD8 i1_num_neg_pics;
+
+ /**
+ * num_positive_pics
+ */
+ WORD8 i1_num_pos_pics;
+
+ /**
+ * used_by_curr_pic_s0_flag[ i ] and used_by_curr_pic_s1_flag[i]
+ */
+ WORD8 ai1_used[MAX_DPB_SIZE];
+
+ /**
+ * Ref Idc
+ */
+ WORD8 ai1_ref_idc[MAX_DPB_SIZE];
+
+ /**
+ * Sum of positive and negative pics for each reference picture set
+ */
+ WORD8 i1_num_delta_pocs;
+
+ /**
+ * Number of ref_idc
+ */
+ WORD8 i1_num_ref_idc;
+}stref_picset_t;
+
+/**
+ * Structure to hold weighted prediction info such as weights and offsets
+ */
+typedef struct
+{
+ /** luma_log2_weight_denom */
+ WORD8 i1_luma_log2_weight_denom;
+
+ /** delta_chroma_log2_weight_denom */
+ WORD8 i1_chroma_log2_weight_denom;
+
+ /** luma_weight_l0_flag[ i ] */
+ WORD8 i1_luma_weight_l0_flag[MAX_DPB_SIZE];
+
+ /** chroma_weight_l0_flag[ i ] */
+ WORD8 i1_chroma_weight_l0_flag[MAX_DPB_SIZE];
+
+ /** delta_luma_weight_l0[ i ] */
+ WORD16 i2_luma_weight_l0[MAX_DPB_SIZE];
+
+ /** luma_offset_l0[ i ] */
+ WORD16 i2_luma_offset_l0[MAX_DPB_SIZE];
+
+ /** delta_chroma_weight_l0[ i ][ j ] */
+ WORD16 i2_chroma_weight_l0_cb[MAX_DPB_SIZE];
+
+ /** delta_chroma_offset_l0[ i ][ j ] */
+ WORD16 i2_chroma_offset_l0_cb[MAX_DPB_SIZE];
+
+ /** delta_chroma_weight_l0[ i ][ j ] */
+ WORD16 i2_chroma_weight_l0_cr[MAX_DPB_SIZE];
+
+ /** delta_chroma_offset_l0[ i ][ j ] */
+ WORD16 i2_chroma_offset_l0_cr[MAX_DPB_SIZE];
+
+ /** luma_weight_l1_flag[ i ] */
+ WORD8 i1_luma_weight_l1_flag[MAX_DPB_SIZE];
+
+ /** chroma_weight_l1_flag[ i ] */
+ WORD8 i1_chroma_weight_l1_flag[MAX_DPB_SIZE];
+
+ /** delta_luma_weight_l1[ i ] */
+ WORD16 i2_luma_weight_l1[MAX_DPB_SIZE];
+
+ /** luma_offset_l1[ i ] */
+ WORD16 i2_luma_offset_l1[MAX_DPB_SIZE];
+
+ /** delta_chroma_weight_l1[ i ][ j ] */
+ WORD16 i2_chroma_weight_l1_cb[MAX_DPB_SIZE];
+
+ /** delta_chroma_offset_l1[ i ][ j ] */
+ WORD16 i2_chroma_offset_l1_cb[MAX_DPB_SIZE];
+
+ /** delta_chroma_weight_l1[ i ][ j ] */
+ WORD16 i2_chroma_weight_l1_cr[MAX_DPB_SIZE];
+
+ /** delta_chroma_offset_l1[ i ][ j ] */
+ WORD16 i2_chroma_offset_l1_cr[MAX_DPB_SIZE];
+
+}pred_wt_ofst_t;
+
+
+/**
+ * Structure to hold Reference picture list modification info
+ */
+typedef struct
+{
+ /* ref_pic_list_modification_flag_l0 */
+ WORD8 i1_ref_pic_list_modification_flag_l0;
+
+ /* list_entry_l0[ i ] */
+ WORD8 i1_list_entry_l0[16];
+
+ /* ref_pic_list_modification_flag_l1 */
+ WORD8 i1_ref_pic_list_modification_flag_l1;
+
+ /* list_entry_l1[ i ] */
+ WORD8 i1_list_entry_l1[16];
+}rplm_t;
+
+
+/**
+ * Structure to hold VPS info
+ */
+typedef struct
+{
+ /**
+ * video_parameter_set_id
+ */
+ WORD8 i1_vps_id;
+
+ /**
+ * vps_temporal_id_nesting_flag
+ */
+ WORD8 i1_vps_temporal_id_nesting_flag;
+ /**
+ * sub_layer_ordering_info_present_flag
+ */
+ WORD8 i1_sub_layer_ordering_info_present_flag;
+ /**
+ * vps_max_sub_layers_minus1
+ */
+ WORD8 i1_vps_max_sub_layers;
+
+ /**
+ * vps_max_dec_pic_buffering
+ */
+ WORD8 ai1_vps_max_dec_pic_buffering[VPS_MAX_SUB_LAYERS];
+
+ /**
+ * vps_max_num_reorder_pics
+ */
+ WORD8 ai1_vps_max_num_reorder_pics[VPS_MAX_SUB_LAYERS];
+
+ /**
+ * vps_max_latency_increase
+ */
+ WORD8 ai1_vps_max_latency_increase[VPS_MAX_SUB_LAYERS];
+
+ /**
+ * vps_num_hrd_parameters
+ */
+ WORD8 i1_vps_num_hrd_parameters;
+
+ /**
+ * vps_max_nuh_reserved_zero_layer_id
+ */
+ WORD8 i1_vps_max_nuh_reserved_zero_layer_id;
+
+ /**
+ * vps_num_op_sets
+ */
+ WORD8 i1_vps_num_op_sets;
+
+ /**
+ * layer_id_included_flag
+ */
+ //WORD8 ai1_layer_id_included_flag[2][MAX_NUH_LAYERS];
+ /**
+ * Profile, Tier and Level info
+ */
+ profile_tier_lvl_info_t s_ptl;
+
+ /**
+ * bit_rate_info_present_flag[i]
+ */
+ WORD8 ai1_bit_rate_info_present_flag[VPS_MAX_SUB_LAYERS];
+
+
+ /**
+ * pic_rate_info_present_flag[i]
+ */
+ WORD8 ai1_pic_rate_info_present_flag[VPS_MAX_SUB_LAYERS];
+
+ /**
+ * avg_bit_rate[i]
+ */
+ UWORD16 au2_avg_bit_rate[VPS_MAX_SUB_LAYERS];
+ /**
+ * max_bit_rate[i]
+ */
+ UWORD16 au2_max_bit_rate[VPS_MAX_SUB_LAYERS];
+ /**
+ * constant_pic_rate_idc[i]
+ */
+ WORD8 ai1_constant_pic_rate_idc[VPS_MAX_SUB_LAYERS];
+ /**
+ * avg_pic_rate[i]
+ */
+ UWORD16 au2_avg_pic_rate[VPS_MAX_SUB_LAYERS];
+}vps_t;
+
+/**
+ * Sub-layer HRD parameters Info
+ */
+typedef struct
+{
+ /**
+ * (together with bit_rate_scale) specifies the
+ * maximum input bit rate for the i-th CPB
+ */
+ UWORD32 au4_bit_rate_value_minus1[32];
+ /**
+ * together with cpb_size_scale to specify the
+ * CPB size when the CPB operates at the access unit level.
+ */
+ UWORD32 au4_cpb_size_value_minus1[32];
+
+ /**
+ * together with cpb_size_du_scale to specify the CPB size
+ * when the CPB operates at sub-picture level
+ */
+ UWORD32 au4_cpb_size_du_value_minus1[32];
+
+ /**
+ * specifies the maximum input bit rate for the i-th CPB when the CPB
+ * operates at the sub-picture level. bit_rate_du_value_minus1[ i ]
+ * shall be in the range of 0 to 2^32 - 2
+ */
+ UWORD32 au4_bit_rate_du_value_minus1[32];
+
+ /**
+ * if 1, specifies that the HSS operates in a constant bit rate (CBR) mode
+ * if 0, specifies that the HSS operates in an intermittent bit rate mode
+ */
+ UWORD8 au1_cbr_flag[32];
+
+}sub_lyr_hrd_params_t;
+
+/**
+ * HRD parameters Info
+ */
+typedef struct
+{
+
+ /**
+ * Indicates the presence of
+ * num_units_in_tick and time_scale
+ */
+ UWORD8 u1_timing_info_present_flag;
+
+ /**
+ * Number of units that
+ * correspond to one increment of the
+ * clock. Indicates the resolution
+ */
+ UWORD32 u4_num_units_in_tick;
+
+ /**
+ * The number of time units that pass in one second
+ */
+ UWORD32 u4_time_scale;
+
+ /**
+ * Nal- hrd parameters flag
+ */
+ UWORD8 u1_nal_hrd_parameters_present_flag;
+
+ /**
+ * VCL- hrd parameters flag
+ */
+ UWORD8 u1_vcl_hrd_parameters_present_flag;
+
+ /**
+ * Indicates the presence of NAL-HRD params or VCL_HRD params
+ * in the bitstream
+ */
+ UWORD8 u1_cpbdpb_delays_present_flag;
+
+ /**
+ * specifies that sub-picture level CPB removal delay parameters are
+ * present in picture timing SEI messages
+ */
+ UWORD8 u1_sub_pic_cpb_params_present_flag;
+
+ /**
+ * specify the clock sub-tick
+ * (the minimum interval of time that can be represented in the coded data when sub_pic_cpb_params_present_flag is equal to 1)
+ */
+ UWORD8 u1_tick_divisor_minus2;
+
+ /**
+ * specifies the length, in bits for the du cpb delay syntax in pt_sei
+ */
+ UWORD8 u1_du_cpb_removal_delay_increment_length_minus1;
+
+ /**
+ * Indicates presence of sub_pic_cpb_params in pic timing sei
+ */
+ UWORD8 u1_sub_pic_cpb_params_in_pic_timing_sei_flag;
+
+ /**
+ * specifies the length, in bits, of the pic_dpb_output_du_delay syntax
+ * element in the picture timing SEI message and the
+ * pic_spt_dpb_output_du_delay syntax element in the decoding unit
+ * information SEI message
+ */
+ UWORD8 u1_dpb_output_delay_du_length_minus1;
+
+ /**
+ * (together with bit_rate_value_minus1) specifies the
+ * maximum input bit rate of the i-th CPB
+ */
+ UWORD32 u4_bit_rate_scale;
+
+ /**
+ * (together with cpb_size_value_minus1) specifies
+ * CPB size of the i-th CPB when the CPB operates
+ * at the access unit level
+ */
+ UWORD32 u4_cpb_size_scale;
+
+ /**
+ * (together with cpb_size_du_value_minus1) specifies
+ * CPB size of the i-th CPB when the CPB operates
+ * at the sub-picture level
+ */
+ UWORD32 u4_cpb_size_du_scale;
+
+
+    /**
+     * specifies the length, in bits, of the initial CPB removal delay (NAL/VCL) syntax elements in the buffering period SEI
+     */
+ UWORD8 u1_initial_cpb_removal_delay_length_minus1;
+
+    /**
+     * specifies the length, in bits, of the AU CPB removal delay syntax element in the picture timing SEI
+     */
+ UWORD8 u1_au_cpb_removal_delay_length_minus1;
+
+ /**
+ * specifies the length, in bits, of the pic_dpb_output_delay syntax element in the pt SEI message
+ */
+ UWORD8 u1_dpb_output_delay_length_minus1;
+
+    /**
+     * if 1, for the highest temporal sub-layer, the temporal distance between the HRD output times
+     * of consecutive pictures in output order is constrained; refer to Table E-6
+     */
+    UWORD8 au1_fixed_pic_rate_general_flag[6];
+
+    /**
+     * same constraint as fixed_pic_rate_general_flag; inferred to be 1
+     * when fixed_pic_rate_general_flag is 1
+     */
+    UWORD8 au1_fixed_pic_rate_within_cvs_flag[6];
+
+    /**
+     * plus 1 specifies, for the highest temporal sub-layer, the temporal distance (in clock ticks)
+     * between the elemental units that specify HRD output times of consecutive pictures
+     * in output order; refer to Table E-6
+     */
+    UWORD8 au1_elemental_duration_in_tc_minus1[6];
+
+ /**
+ * specifies the HRD operational mode
+ */
+ UWORD8 au1_low_delay_hrd_flag[6];
+
+    /**
+     * plus 1 specifies the number of alternative CPB specifications in the
+     * bitstream of the CVS when HighestTid is equal to i
+     */
+ UWORD8 au1_cpb_cnt_minus1[6];
+
+
+ /**
+ * VUI level Sub-layer HRD parameters
+ */
+ sub_lyr_hrd_params_t as_sub_layer_hrd_params[6];
+
+}hrd_params_t;
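+
+/* Illustrative sketch, not part of the decoder: per Annex E, the scale fields
+ * above combine with the sub-layer HRD values to give the actual CPB
+ * parameters. The helper names are hypothetical. */
+static LWORD64 hrd_derive_bit_rate(UWORD32 u4_bit_rate_value_minus1,
+                                   UWORD32 u4_bit_rate_scale)
+{
+    /* BitRate[ i ] = ( bit_rate_value_minus1[ i ] + 1 ) << ( 6 + bit_rate_scale ) */
+    return ((LWORD64)u4_bit_rate_value_minus1 + 1) << (6 + u4_bit_rate_scale);
+}
+
+static LWORD64 hrd_derive_cpb_size(UWORD32 u4_cpb_size_value_minus1,
+                                   UWORD32 u4_cpb_size_scale)
+{
+    /* CpbSize[ i ] = ( cpb_size_value_minus1[ i ] + 1 ) << ( 4 + cpb_size_scale ) */
+    return ((LWORD64)u4_cpb_size_value_minus1 + 1) << (4 + u4_cpb_size_scale);
+}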
+
+
+/**
+ * Structure to hold VUI parameters Info
+ */
+typedef struct
+{
+ /**
+ * indicates the presence of aspect_ratio
+ */
+ UWORD8 u1_aspect_ratio_info_present_flag;
+
+    /**
+     * specifies the sample aspect ratio of the luma samples
+     */
+ UWORD8 u1_aspect_ratio_idc;
+
+    /**
+     * horizontal size of the sample aspect ratio (in arbitrary units)
+     */
+    UWORD16 u2_sar_width;
+
+    /**
+     * vertical size of the sample aspect ratio (in the same arbitrary units as u2_sar_width)
+     */
+ UWORD16 u2_sar_height;
+
+ /**
+ * if 1, specifies that the overscan_appropriate_flag is present
+ * if 0, the preferred display method for the video signal is unspecified
+ */
+ UWORD8 u1_overscan_info_present_flag;
+
+ /**
+ * if 1,indicates that the cropped decoded pictures output
+ * are suitable for display using overscan
+ */
+ UWORD8 u1_overscan_appropriate_flag;
+
+ /**
+ * if 1 specifies that video_format, video_full_range_flag and
+ * colour_description_present_flag are present
+ */
+ UWORD8 u1_video_signal_type_present_flag;
+
+    /**
+     * indicates the representation of the pictures as listed in Table E-2
+     */
+ UWORD8 u1_video_format;
+
+ /**
+ * indicates the black level and range of the luma and chroma signals
+ */
+ UWORD8 u1_video_full_range_flag;
+
+    /**
+     * if 1, specifies that colour_primaries, transfer_characteristics
+     * and matrix_coefficients are present
+     */
+ UWORD8 u1_colour_description_present_flag;
+
+ /**
+ * indicates the chromaticity coordinates of the source primaries
+ */
+ UWORD8 u1_colour_primaries;
+
+ /**
+ * indicates the opto-electronic transfer characteristic of the source picture
+ */
+ UWORD8 u1_transfer_characteristics;
+
+ /**
+ * the matrix coefficients used in deriving luma and chroma signals
+ * from the green, blue, and red primaries
+ */
+ UWORD8 u1_matrix_coefficients;
+
+ /**
+ * if 1, specifies that chroma_sample_loc_type_top_field and
+ * chroma_sample_loc_type_bottom_field are present
+ */
+ UWORD8 u1_chroma_loc_info_present_flag;
+
+ /**
+ * location of chroma samples
+ */
+ UWORD8 u1_chroma_sample_loc_type_top_field;
+
+ UWORD8 u1_chroma_sample_loc_type_bottom_field;
+
+ /**
+ * if 1, indicates that the value of all decoded chroma samples is
+ * equal to 1 << ( BitDepthC - 1 )
+ */
+ UWORD8 u1_neutral_chroma_indication_flag;
+
+    /**
+     * 1 indicates that the coded video sequence conveys pictures that represent fields
+     * 0 indicates that it conveys pictures that represent frames
+     */
+ UWORD8 u1_field_seq_flag;
+
+ /**
+ * specifies that picture timing SEI messages are present for every picture
+ */
+ UWORD8 u1_frame_field_info_present_flag;
+
+ /**
+ * 1 indicates that the default display window parameters follow next in the VUI
+ */
+ UWORD8 u1_default_display_window_flag;
+
+ /**
+ * specify the samples of the pictures in the coded video sequence
+ * that are within the default display window,
+ * in terms of a rectangular region specified in picture coordinates for display
+ */
+ UWORD32 u4_def_disp_win_left_offset;
+
+ UWORD32 u4_def_disp_win_right_offset;
+
+ UWORD32 u4_def_disp_win_top_offset;
+
+ UWORD32 u4_def_disp_win_bottom_offset;
+
+    /**
+     * if 1, specifies that the hrd_parameters syntax structure is present in the vui_parameters syntax structure
+     */
+ UWORD8 u1_vui_hrd_parameters_present_flag;
+
+ /**
+ * VUI level HRD parameters
+ */
+ hrd_params_t s_vui_hrd_parameters;
+
+    /**
+     * Indicates the presence of the
+     * num_units_in_tick and time_scale syntax elements
+     */
+ UWORD8 u1_vui_timing_info_present_flag;
+
+ /**
+ * Number of units that
+ * correspond to one increment of the
+ * clock. Indicates the resolution
+ */
+ UWORD32 u4_vui_num_units_in_tick;
+
+ /**
+ * The number of time units that pass in one second
+ */
+ UWORD32 u4_vui_time_scale;
+    /**
+     * if 1, indicates that the POC of each picture in the coded video sequence (CVS), other than
+     * the first picture in decoding order, is proportional to the output time of the picture
+     * relative to that of the first picture in the CVS
+     */
+ UWORD8 u1_poc_proportional_to_timing_flag;
+
+ /**
+ * num_ticks_poc_diff_one_minus1 plus 1 specifies the number of clock ticks
+ * corresponding to a difference of poc values equal to 1
+ */
+ UWORD8 u1_num_ticks_poc_diff_one_minus1;
+
+    /**
+     * if 1, specifies that the following CVS bitstream restriction parameters are present
+     */
+ UWORD8 u1_bitstream_restriction_flag;
+
+ /**
+ * if 1, indicates that each pps that is active in the cvs has
+ * the same value of the tile syntax elements
+ */
+ UWORD8 u1_tiles_fixed_structure_flag;
+
+    /**
+     * if 0, indicates that no pel outside the picture boundaries, and no sub-pel
+     * derived using pels outside the picture boundaries, is used for inter prediction
+     */
+ UWORD8 u1_motion_vectors_over_pic_boundaries_flag;
+
+ /**
+ * if 1, indicates
+ * all P/B slices belonging to the same pic have an identical refpic list0,
+ * all B slices that belong to the same picture have an identical refpic list1.
+ */
+ UWORD8 u1_restricted_ref_pic_lists_flag;
+
+    /**
+     * min_spatial_segmentation_idc, when not equal to 0, establishes a bound on the maximum possible size of distinct
+     * coded spatial segmentation regions in the pictures of the CVS. When min_spatial_segmentation_idc is not present, it is
+     * inferred to be equal to 0. The value of min_spatial_segmentation_idc shall be in the range of 0 to 4095, inclusive.
+     *
+     * can be used by a decoder to calculate the maximum number of luma samples to be processed by one processing thread
+     * (see the illustrative helper after this struct)
+     *
+     * If tiles=0 and entropy_sync=0 then
+     *     no slice shall exceed ( 4 * PicSizeInSamplesY ) / minSpatialSegmentationTimes4 luma samples
+     *
+     * If tiles=1 and entropy_sync=0 then
+     *     no tile shall exceed ( 4 * PicSizeInSamplesY ) / minSpatialSegmentationTimes4 luma samples
+     *
+     * If tiles=0 and entropy_sync=1 then
+     *     ( 2 * pic_height_in_luma_samples + pic_width_in_luma_samples ) * CtbSizeY
+     *             <= ( 4 * PicSizeInSamplesY ) / minSpatialSegmentationTimes4
+     */
+ UWORD32 u4_min_spatial_segmentation_idc;
+ /**
+ * Indicates a number of bytes not exceeded by the sum of the sizes of the VCL NAL units
+ * associated with any coded picture
+ */
+ UWORD8 u1_max_bytes_per_pic_denom;
+
+ /**
+ * Indicates an upper bound for the number of bits of coding_unit() data
+ */
+ UWORD8 u1_max_bits_per_mincu_denom;
+
+    /**
+     * Indicates the maximum absolute value of a decoded horizontal MV component
+     * in quarter-pel luma units
+     */
+    UWORD8 u1_log2_max_mv_length_horizontal;
+
+    /**
+     * Indicates the maximum absolute value of a decoded vertical MV component
+     * in quarter-pel luma units
+     */
+    UWORD8 u1_log2_max_mv_length_vertical;
+
+
+}vui_t;
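+
+/* Illustrative sketch, not part of the decoder: the per-segment luma sample
+ * bound implied by u4_min_spatial_segmentation_idc, as described in the
+ * comment inside vui_t above. Per the spec,
+ * minSpatialSegmentationTimes4 = min_spatial_segmentation_idc + 4.
+ * Returns 0 when the idc is 0, i.e. when no bound is expressed. */
+static LWORD64 vui_max_luma_samples_per_segment(UWORD32 u4_min_spatial_segmentation_idc,
+                                                LWORD64 pic_size_in_samples_y)
+{
+    if(0 == u4_min_spatial_segmentation_idc)
+        return 0;
+
+    return (4 * pic_size_in_samples_y) / ((LWORD64)u4_min_spatial_segmentation_idc + 4);
+}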
+
+
+/**
+ * Structure to hold SPS info
+ */
+typedef struct
+{
+ /**
+ * pic_width_in_luma_samples
+ */
+ WORD16 i2_pic_width_in_luma_samples;
+
+ /**
+ * pic_height_in_luma_samples
+ */
+ WORD16 i2_pic_height_in_luma_samples;
+
+ /**
+ * pic_crop_left_offset
+ */
+ WORD16 i2_pic_crop_left_offset;
+
+ /**
+ * pic_crop_right_offset
+ */
+ WORD16 i2_pic_crop_right_offset;
+
+ /**
+ * pic_crop_top_offset
+ */
+ WORD16 i2_pic_crop_top_offset;
+
+ /**
+ * pic_crop_bottom_offset
+ */
+ WORD16 i2_pic_crop_bottom_offset;
+
+ /**
+ * seq_parameter_set_id
+ */
+ WORD8 i1_sps_id;
+
+ /**
+ * video_parameter_set_id
+ */
+ WORD8 i1_vps_id;
+
+ /**
+ * sps_max_sub_layers_minus1
+ */
+ WORD8 i1_sps_max_sub_layers;
+
+ /**
+ * chroma_format_idc
+ */
+ WORD8 i1_chroma_format_idc;
+
+ /**
+ * Bit depth of luma samples
+ */
+ WORD8 i1_bit_depth_luma_minus8;
+
+    /**
+     * Bit depth of chroma samples
+     */
+ WORD8 i1_bit_depth_chroma_minus8;
+
+ /* separate_colour_plane_flag */
+ WORD8 i1_separate_colour_plane_flag;
+
+ /**
+ * pic_cropping_flag
+ */
+ WORD8 i1_pic_cropping_flag;
+
+ /**
+ * pcm_enabled_flag
+ */
+ WORD8 i1_pcm_enabled_flag;
+
+ /**
+ * pcm_sample_bit_depth_luma
+ */
+ WORD8 i1_pcm_sample_bit_depth_luma;
+
+ /**
+ * pcm_sample_bit_depth_chroma
+ */
+ WORD8 i1_pcm_sample_bit_depth_chroma;
+
+ /**
+ * log2_max_pic_order_cnt_lsb_minus4
+ */
+ WORD8 i1_log2_max_pic_order_cnt_lsb;
+ /**
+ * sps_sub_layer_ordering_info_present_flag
+ */
+ WORD8 i1_sps_sub_layer_ordering_info_present_flag;
+ /**
+ * sps_max_dec_pic_buffering
+ */
+ WORD8 ai1_sps_max_dec_pic_buffering[SPS_MAX_SUB_LAYERS];
+
+ /**
+ * sps_max_num_reorder_pics
+ */
+ WORD8 ai1_sps_max_num_reorder_pics[SPS_MAX_SUB_LAYERS];
+
+ /**
+ * sps_max_latency_increase
+ */
+ WORD8 ai1_sps_max_latency_increase[SPS_MAX_SUB_LAYERS];
+
+ /**
+ * log2_min_coding_block_size_minus3
+ */
+ WORD8 i1_log2_min_coding_block_size;
+
+ /**
+ * log2_diff_max_min_coding_block_size
+ */
+ WORD8 i1_log2_diff_max_min_coding_block_size;
+
+ /**
+ * log2_min_transform_block_size_minus2
+ */
+ WORD8 i1_log2_min_transform_block_size;
+
+ /**
+ * log2_diff_max_min_transform_block_size
+ */
+ WORD8 i1_log2_diff_max_min_transform_block_size;
+
+ /**
+ * log2_min_pcm_coding_block_size_minus3
+ */
+ WORD8 i1_log2_min_pcm_coding_block_size;
+
+ /**
+ * log2_diff_max_min_pcm_coding_block_size
+ */
+ WORD8 i1_log2_diff_max_min_pcm_coding_block_size;
+
+ /**
+ * max_transform_hierarchy_depth_inter
+ */
+ WORD8 i1_max_transform_hierarchy_depth_inter;
+
+ /**
+ * max_transform_hierarchy_depth_intra
+ */
+ WORD8 i1_max_transform_hierarchy_depth_intra;
+
+ /**
+ * scaling_list_enable_flag
+ */
+ WORD8 i1_scaling_list_enable_flag;
+
+ /**
+ * sps_scaling_list_data_present_flag
+ */
+ WORD8 i1_sps_scaling_list_data_present_flag;
+
+ /**
+ * amp_enabled_flag
+ */
+ WORD8 i1_amp_enabled_flag;
+
+ /**
+ * sample_adaptive_offset_enabled_flag
+ */
+ WORD8 i1_sample_adaptive_offset_enabled_flag;
+
+ /**
+ * pcm_loop_filter_disable_flag
+ */
+ WORD8 i1_pcm_loop_filter_disable_flag;
+
+ /**
+ * sps_temporal_id_nesting_flag
+ */
+ WORD8 i1_sps_temporal_id_nesting_flag;
+
+ /**
+ * num_short_term_ref_pic_sets
+ */
+ WORD8 i1_num_short_term_ref_pic_sets;
+
+ /**
+ * long_term_ref_pics_present_flag
+ */
+ WORD8 i1_long_term_ref_pics_present_flag;
+
+ /**
+ * num_long_term_ref_pics_sps
+ */
+ WORD8 i1_num_long_term_ref_pics_sps;
+
+ /**
+ * lt_ref_pic_poc_lsb_sps[]
+ */
+ WORD8 ai1_lt_ref_pic_poc_lsb_sps[MAX_LTREF_PICS_SPS];
+
+ /**
+ * used_by_curr_pic_lt_sps_flag[]
+ */
+ WORD8 ai1_used_by_curr_pic_lt_sps_flag[MAX_LTREF_PICS_SPS];
+
+ /**
+ * sps_temporal_mvp_enable_flag
+ */
+ WORD8 i1_sps_temporal_mvp_enable_flag;
+
+ /**
+ * strong_intra_smoothing_enable_flag
+ */
+ WORD8 i1_strong_intra_smoothing_enable_flag;
+
+ /**
+ * vui_parameters_present_flag
+ */
+ WORD8 i1_vui_parameters_present_flag;
+
+ /**
+ * vui parameters Structure info
+ */
+ vui_t s_vui_parameters;
+
+ /**
+ * Log2(CTB Size) in luma units
+ */
+
+ WORD8 i1_log2_ctb_size;
+
+ /**
+ * Maximum transform block size
+ */
+ WORD8 i1_log2_max_transform_block_size;
+
+ /**
+ * Picture width in CTB units
+ */
+
+ WORD16 i2_pic_wd_in_ctb;
+
+ /**
+ * Picture height in CTB units
+ */
+
+ WORD16 i2_pic_ht_in_ctb;
+
+ /**
+ * Picture width in min CB units
+ */
+
+ WORD16 i2_pic_wd_in_min_cb;
+
+ /**
+ * Picture height in min CB units
+ */
+
+ WORD16 i2_pic_ht_in_min_cb;
+
+ /**
+ * Picture size in CTB units
+ */
+ WORD32 i4_pic_size_in_ctb;
+
+ /**
+ * Profile, Tier and Level info
+ */
+
+ profile_tier_lvl_info_t s_ptl;
+
+ /**
+ * Short term reference pic set
+ */
+ stref_picset_t as_stref_picset[MAX_STREF_PICS_SPS];
+
+ /**
+ * Pointer to scaling matrix
+ */
+ /*************************************************************************/
+    /* Contains the matrices in the following order in a 1D buffer           */
+ /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V */
+ /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V */
+ /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V */
+ /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V */
+ /* Intra 16x16 Y, 16x16 U, 16x16 V */
+ /* Inter 16x16 Y, 16x16 U, 16x16 V */
+ /* Intra 32x32 Y */
+ /* Inter 32x32 Y */
+ /*************************************************************************/
+ WORD16 *pi2_scaling_mat;
+
+ /*
+ * Flag indicating if the SPS is parsed
+ */
+ WORD8 i1_sps_valid;
+
+}sps_t;
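+
+/* Illustrative sketch, not part of the decoder: how the derived fields at the
+ * end of sps_t follow from the parsed log2 syntax elements, assuming the
+ * i1_log2_* members already hold the full values (minus3/minus2 added back). */
+static void sps_derive_ctb_dimensions(sps_t *ps_sps)
+{
+    WORD32 ctb_size;
+
+    /* CtbLog2SizeY = log2 min CB size + log2 diff max/min CB size */
+    ps_sps->i1_log2_ctb_size = ps_sps->i1_log2_min_coding_block_size +
+                               ps_sps->i1_log2_diff_max_min_coding_block_size;
+    ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+
+    /* Picture dimensions in CTB units, rounded up */
+    ps_sps->i2_pic_wd_in_ctb = (WORD16)((ps_sps->i2_pic_width_in_luma_samples + ctb_size - 1) >> ps_sps->i1_log2_ctb_size);
+    ps_sps->i2_pic_ht_in_ctb = (WORD16)((ps_sps->i2_pic_height_in_luma_samples + ctb_size - 1) >> ps_sps->i1_log2_ctb_size);
+    ps_sps->i4_pic_size_in_ctb = ps_sps->i2_pic_wd_in_ctb * ps_sps->i2_pic_ht_in_ctb;
+}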
+
+/**
+ * Structure to hold PPS info
+ */
+typedef struct
+{
+ /**
+ * Pointer to scaling matrix
+ */
+ /*************************************************************************/
+    /* Contains the matrices in the following order in a 1D buffer           */
+ /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V */
+ /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V */
+ /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V */
+ /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V */
+ /* Intra 16x16 Y, 16x16 U, 16x16 V */
+ /* Inter 16x16 Y, 16x16 U, 16x16 V */
+ /* Intra 32x32 Y */
+ /* Inter 32x32 Y */
+ /*************************************************************************/
+ WORD16 *pi2_scaling_mat;
+
+ /**
+ * Pointer to an array containing tile info such as position, width, height
+ * of each tile
+ */
+
+ /* column_width_minus1[ i ] and row_height_minus1[ i ] */
+ tile_t *ps_tile;
+
+ /**
+ * pic_parameter_set_id
+ */
+ WORD8 i1_pps_id;
+
+ /**
+ * seq_parameter_set_id
+ */
+ WORD8 i1_sps_id;
+
+ /**
+ * sign_data_hiding_flag
+ */
+ WORD8 i1_sign_data_hiding_flag;
+
+ /**
+ * cabac_init_present_flag
+ */
+ WORD8 i1_cabac_init_present_flag;
+
+ /**
+ * num_ref_idx_l0_default_active_minus1
+ */
+ WORD8 i1_num_ref_idx_l0_default_active;
+
+ /**
+ * num_ref_idx_l1_default_active_minus1
+ */
+ WORD8 i1_num_ref_idx_l1_default_active;
+
+ /**
+ * pic_init_qp_minus26
+ */
+ WORD8 i1_pic_init_qp;
+
+ /**
+ * constrained_intra_pred_flag
+ */
+ WORD8 i1_constrained_intra_pred_flag;
+
+ /**
+ * transform_skip_enabled_flag
+ */
+ WORD8 i1_transform_skip_enabled_flag;
+
+ /**
+ * cu_qp_delta_enabled_flag
+ */
+ WORD8 i1_cu_qp_delta_enabled_flag;
+
+ /**
+ * diff_cu_qp_delta_depth
+ */
+ WORD8 i1_diff_cu_qp_delta_depth;
+
+ /**
+ * pic_cb_qp_offset
+ */
+ WORD8 i1_pic_cb_qp_offset;
+
+ /**
+ * pic_cr_qp_offset
+ */
+ WORD8 i1_pic_cr_qp_offset;
+
+ /**
+ * pic_slice_level_chroma_qp_offsets_present_flag
+ */
+ WORD8 i1_pic_slice_level_chroma_qp_offsets_present_flag;
+
+ /**
+ * weighted_pred_flag
+ */
+ WORD8 i1_weighted_pred_flag;
+
+ /**
+ * weighted_bipred_flag
+ */
+ WORD8 i1_weighted_bipred_flag;
+
+ /**
+ * output_flag_present_flag
+ */
+ WORD8 i1_output_flag_present_flag;
+
+ /**
+ * transquant_bypass_enable_flag
+ */
+ WORD8 i1_transquant_bypass_enable_flag;
+
+ /**
+ * dependent_slice_enabled_flag
+ */
+ WORD8 i1_dependent_slice_enabled_flag;
+
+ /**
+ * tiles_enabled_flag
+ */
+ WORD8 i1_tiles_enabled_flag;
+
+ /**
+ * entropy_coding_sync_enabled_flag
+ */
+ WORD8 i1_entropy_coding_sync_enabled_flag;
+
+ /**
+ * entropy_slice_enabled_flag
+ */
+ WORD8 i1_entropy_slice_enabled_flag;
+
+ /**
+ * num_tile_columns_minus1
+ */
+ WORD8 i1_num_tile_columns;
+
+ /**
+ * num_tile_rows_minus1
+ */
+ WORD8 i1_num_tile_rows;
+
+ /**
+ * uniform_spacing_flag
+ */
+ WORD8 i1_uniform_spacing_flag;
+
+ /**
+ * loop_filter_across_tiles_enabled_flag
+ */
+ WORD8 i1_loop_filter_across_tiles_enabled_flag;
+
+ /**
+ * loop_filter_across_slices_enabled_flag
+ */
+ WORD8 i1_loop_filter_across_slices_enabled_flag;
+
+ /**
+ * deblocking_filter_control_present_flag
+ */
+ WORD8 i1_deblocking_filter_control_present_flag;
+
+ /**
+ * deblocking_filter_override_enabled_flag
+ */
+ WORD8 i1_deblocking_filter_override_enabled_flag;
+
+ /**
+ * pic_disable_deblocking_filter_flag
+ */
+ WORD8 i1_pic_disable_deblocking_filter_flag;
+
+ /**
+ * beta_offset_div2
+ */
+ WORD8 i1_beta_offset_div2;
+
+ /**
+ * tc_offset_div2
+ */
+ WORD8 i1_tc_offset_div2;
+
+ /**
+ * pps_scaling_list_data_present_flag
+ */
+ WORD8 i1_pps_scaling_list_data_present_flag;
+
+ /**
+ * lists_modification_present_flag
+ */
+ WORD8 i1_lists_modification_present_flag;
+
+ /**
+ * num_extra_slice_header_bits
+ */
+ WORD8 i1_num_extra_slice_header_bits;
+
+ /**
+ * log2_parallel_merge_level_minus2
+ */
+ WORD8 i1_log2_parallel_merge_level;
+
+ /**
+ * slice_header_extension_present_flag
+ */
+ WORD8 i1_slice_header_extension_present_flag;
+
+ /**
+ * slice_extension_present_flag
+ */
+ WORD8 i1_slice_extension_present_flag;
+
+ /**
+ * scaling_list_dc_coef_minus8
+ */
+ /*************************************************************************/
+ /* DC value of the scaling list */
+ /* Only 16 x 16 and 32 x 32 scaling lists have valid entries. */
+ /* Entries stored for all sizes for uniformity. */
+ /* Remaining will be initialized to default values if used */
+ /*************************************************************************/
+ UWORD8 au1_scaling_list_dc_coef[TOTAL_SCALE_MAT_COUNT];
+
+ /**
+ * Log2MinCuQpDeltaSize
+ */
+ WORD8 i1_log2_min_cu_qp_delta_size;
+
+ /*
+ * Flag indicating if the PPS is parsed
+ */
+ WORD8 i1_pps_valid;
+
+}pps_t;
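+
+/* Illustrative sketch, not part of the decoder: derivation of the
+ * i1_log2_min_cu_qp_delta_size field above, following the spec relation
+ * Log2MinCuQpDeltaSize = CtbLog2SizeY - diff_cu_qp_delta_depth. */
+static WORD8 pps_derive_log2_min_cu_qp_delta_size(WORD8 i1_log2_ctb_size,
+                                                  WORD8 i1_diff_cu_qp_delta_depth)
+{
+    return (WORD8)(i1_log2_ctb_size - i1_diff_cu_qp_delta_depth);
+}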
+
+
+
+/**
+ * Buffering Period SEI parameters Info
+ */
+typedef struct
+{
+ /**
+     * specifies the SPS Id active for the coded picture associated
+     * with the buffering period SEI message
+ */
+ UWORD8 u1_sps_id;
+
+ /**
+ * Derived from Hrd parameters
+ */
+ UWORD8 u1_sub_pic_cpb_params_present_flag;
+
+ /**
+ * specifies the presence of the initial_alt_cpb_removal_delay[ i ]
+ * and initial_alt_cpb_removal_offset[ i ] syntax elements
+ */
+ UWORD8 u1_rap_cpb_params_present_flag;
+
+ /**
+     * CPB delay offset used in the buffering period SEI
+ */
+ UWORD32 cpb_delay_offset;
+
+ /**
+     * DPB delay offset used in the buffering period SEI
+ */
+ UWORD32 dpb_delay_offset;
+
+ /**
+     * concatenation flag
+ */
+ UWORD8 concatenation_flag;
+
+ /**
+     * delta AU CPB removal delay
+ */
+ UWORD32 au_cpb_removal_delay_delta_minus1;
+
+ /**
+ * specify the default initial CPB removal delays, respectively,
+ * for the CPB when the NAL HRD parameters are in use
+ */
+ UWORD32 au4_nal_initial_cpb_removal_delay[32];
+
+ /**
+ * specify the alternate initial CPB removal delays, respectively,
+ * for the CPB when the NAL HRD parameters are in use
+ */
+ UWORD32 au4_nal_initial_alt_cpb_removal_delay[32];
+
+ /**
+ * specify the initial CPB removal delay offset, respectively,
+ * for the CPB when the NAL HRD parameters are in use
+ */
+ UWORD32 au4_nal_initial_cpb_removal_delay_offset[32];
+
+ /**
+ * specify the alternate initial CPB removal delays offsets, respectively,
+ * for the CPB when the NAL HRD parameters are in use
+ */
+ UWORD32 au4_nal_initial_alt_cpb_removal_delay_offset[32];
+
+ /**
+ * specify the default initial CPB removal delays, respectively,
+ * for the CPB when the VCL HRD parameters are in use
+ */
+ UWORD32 au4_vcl_initial_cpb_removal_delay[32];
+
+ /**
+     * specify the alternate initial CPB removal delays, respectively,
+ * for the CPB when the VCL HRD parameters are in use
+ */
+ UWORD32 au4_vcl_initial_alt_cpb_removal_delay[32];
+
+ /**
+ * specify the initial CPB removal delay offset, respectively,
+ * for the CPB when the VCL HRD parameters are in use
+ */
+ UWORD32 au4_vcl_initial_cpb_removal_delay_offset[32];
+
+ /**
+ * specify the alternate initial CPB removal delays offsets, respectively,
+ * for the CPB when the VCL HRD parameters are in use
+ */
+ UWORD32 au4_vcl_initial_alt_cpb_removal_delay_offset[32];
+
+ /**
+     * Initial CPB removal delay length
+ */
+ UWORD32 u4_initial_cpb_removal_delay_length;
+
+ /**
+     * CPB count for the corresponding sub-layer
+ */
+ UWORD32 u4_cpb_cnt;
+
+
+ /**
+ * VBV buffer size used in buffering period SEI
+ */
+ UWORD32 u4_buffer_size_sei;
+
+ /**
+ * Encoder buffer fullness used in buffering period SEI
+ */
+ UWORD32 u4_ebf_sei;
+
+ /**
+ * target bitrate used in buffering period SEI
+ */
+ UWORD32 u4_target_bit_rate_sei;
+
+
+
+
+}buf_period_sei_params_t;
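+
+/* Illustrative sketch, not part of the decoder: the initial CPB removal
+ * delays above are expressed in units of a 90 kHz clock, so the wall-clock
+ * delay is simply the coded value divided by 90000. The name is
+ * hypothetical. */
+static double bp_initial_removal_delay_sec(UWORD32 u4_initial_cpb_removal_delay)
+{
+    return (double)u4_initial_cpb_removal_delay / 90000.0;
+}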
+
+
+/**
+ * Picture Timing SEI parameters Info
+ */
+typedef struct
+{
+ /**
+ * derived from vui parameters
+ */
+ UWORD8 u1_frame_field_info_present_flag;
+
+ /**
+ * indicates whether a picture should be displayed as a
+ * frame or as one or more fields
+ */
+ UWORD32 u4_pic_struct;
+
+ UWORD8 u1_num_clk_ticks;
+
+ /**
+     * indicates whether the scan type of the picture should be interpreted
+ * as progressive or interlaced
+ */
+ UWORD8 u1_progressive_source_idc;
+
+ /**
+     * if 1, indicates that the current picture is a duplicate picture in output order
+ */
+ UWORD8 u1_duplicate_flag;
+
+ /**
+     * specifies the number of clock ticks between the nominal CPB removal time
+     * of the AU associated with the picture timing SEI message and
+     * the preceding AU in decoding order that contained a buffering period SEI message
+ */
+ UWORD32 u4_au_cpb_removal_delay_minus1;
+
+ /**
+ * compute the DPB output time of the picture
+ */
+ UWORD32 u4_pic_dpb_output_delay;
+
+ UWORD32 u4_pic_dpb_output_du_delay;
+
+ /**
+ * specifies the number of decoding units in the access unit
+ * the picture timing SEI message is associated with
+ */
+ UWORD32 u4_num_decoding_units_minus1;
+
+ /**
+ * if 1 specifies that the du_common_cpb_removal_delay_increment_minus1 is present
+ */
+ UWORD32 u4_du_common_cpb_removal_delay_flag;
+
+ /**
+ * specifies the duration, in units of clock sub-ticks,
+ * between the nominal CPB removal times of any two consecutive decoding units
+ * in decoding order in the access unit associated with the pt_SEI message
+ */
+ UWORD32 u4_du_common_cpb_removal_delay_increment_minus1; //same as u4_du_cpb_removal_delay_increment_minus1
+
+ /**
+ * specifies the number of NAL units in the decoding unit of the access unit
+ * the picture timing SEI message is associated with.
+     * ranges from 0 to PicSizeInCtbsY - 1
+ */
+ UWORD32 u4_num_nalus_in_du_minus1;
+
+ /**
+ * specifies the duration, in units of clock sub-ticks,
+ * between the nominal CPB removal times of the ( i + 1 )-th decoding unit and the i-th decoding unit,
+ * in decoding order, in the access unit associated with the pt_SEI message
+ */
+ UWORD32 u4_du_cpb_removal_delay_increment_minus1;
+
+
+}pic_timing_sei_params_t;
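+
+/* Illustrative sketch, not part of the decoder: the DPB output time implied
+ * by the picture timing SEI, per the Annex C relation
+ * DpbOutputTime[ n ] = CpbRemovalTime[ n ] + ClockTick * pic_dpb_output_delay,
+ * where ClockTick = num_units_in_tick / time_scale. */
+static double pt_dpb_output_time(double cpb_removal_time,
+                                 UWORD32 u4_num_units_in_tick,
+                                 UWORD32 u4_time_scale,
+                                 UWORD32 u4_pic_dpb_output_delay)
+{
+    double clock_tick = (double)u4_num_units_in_tick / (double)u4_time_scale;
+    return cpb_removal_time + clock_tick * (double)u4_pic_dpb_output_delay;
+}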
+
+/**
+ * Structure to hold Recovery point SEI parameters Info
+ */
+typedef struct
+{
+ /**
+ * specifies the recovery point of output pictures in output order
+ */
+ WORD32 i4_recovery_poc_cnt;
+
+    /**
+     * indicates whether decoded pictures at and after the recovery point
+     * are an exact match to the pictures that would be produced by decoding
+     * from the beginning of the stream
+     */
+    UWORD8 u1_exact_match_flag;
+
+ /**
+ * indicates the presence or absence of a broken link in the NAL unit
+ * stream at the location of the recovery point SEI message
+ */
+
+ UWORD8 u1_broken_link_flag;
+
+}recovery_point_sei_params_t;
+/**
+ * Structure to hold active parameter set SEI parameters Info
+ */
+typedef struct
+{
+ /*
+ * active vps id
+ */
+
+ UWORD8 u1_active_video_parameter_set_id;
+
+ /*
+ * default set to zero.
+ */
+ UWORD8 u1_self_contained_cvs_flag;
+
+ UWORD8 u1_no_parameter_set_update_flag;
+
+ UWORD8 u1_num_sps_ids_minus1;
+
+ /*
+ * active sps id
+ */
+ UWORD8 au1_active_seq_parameter_set_id[15];
+
+}active_parameter_set_sei_param_t;
+
+/**
+ * Structure to hold SEI parameters Info
+ */
+typedef struct
+{
+
+ WORD8 i1_sei_parameters_present_flag;
+
+ WORD8 i1_aud_present_flag;
+
+ WORD8 i1_buf_period_params_present_flag;
+
+ WORD8 i1_pic_timing_params_present_flag;
+
+ WORD8 i1_recovery_point_params_present_flag;
+
+ buf_period_sei_params_t s_buf_period_sei_params;
+
+ pic_timing_sei_params_t s_pic_timing_sei_params;
+
+ recovery_point_sei_params_t s_recovery_point_params;
+
+ active_parameter_set_sei_param_t s_active_parameter_set_sei_params;
+
+
+}sei_params_t;
+
+
+
+/**
+ * Structure to hold slice header info
+ */
+typedef struct
+{
+ /**
+ * entry_point_offset[ i ]
+ */
+ WORD32 *pi4_entry_point_offset;
+
+ /**
+ * poc_lsb_lt[ i ]
+ */
+ WORD32 ai4_poc_lsb_lt[MAX_DPB_SIZE];
+
+ /**
+ * slice_header_extension_length
+ */
+ WORD16 i2_slice_header_extension_length;
+
+ /**
+ * slice_address
+ */
+ WORD16 i2_slice_address;
+
+ /**
+ * first_slice_in_pic_flag
+ */
+ WORD8 i1_first_slice_in_pic_flag;
+
+ /* PPS id */
+ WORD8 i1_pps_id;
+ /**
+ * no_output_of_prior_pics_flag
+ */
+ WORD8 i1_no_output_of_prior_pics_flag;
+
+ /**
+ * dependent_slice_flag
+ */
+ WORD8 i1_dependent_slice_flag;
+
+ /**
+ * slice_type
+ */
+ WORD8 i1_slice_type;
+
+ /**
+ * pic_output_flag
+ */
+ WORD8 i1_pic_output_flag;
+
+ /**
+ * colour_plane_id
+ */
+ WORD8 i1_colour_plane_id;
+
+ /**
+ * pic_order_cnt_lsb
+ */
+ WORD32 i4_pic_order_cnt_lsb;
+
+ /**
+ * absolute pic_order_cnt
+ */
+ WORD32 i4_abs_pic_order_cnt;
+
+ /**
+ * short_term_ref_pic_set_sps_flag
+ */
+ WORD8 i1_short_term_ref_pic_set_sps_flag;
+
+ /**
+ * short_term_ref_pic_set_idx
+ */
+ WORD8 i1_short_term_ref_pic_set_idx;
+
+ /**
+ * num_long_term_sps
+ */
+ WORD8 i1_num_long_term_sps;
+
+ /**
+ * num_long_term_pics
+ */
+ WORD8 i1_num_long_term_pics;
+
+ /**
+ * lt_idx_sps[ i ]
+ */
+ WORD8 ai1_lt_idx_sps[MAX_DPB_SIZE];
+
+ /**
+ * used_by_curr_pic_lt_flag[ i ]
+ */
+ WORD8 ai1_used_by_curr_pic_lt_flag[MAX_DPB_SIZE];
+
+ /**
+ * delta_poc_msb_present_flag[ i ]
+ */
+ WORD8 ai1_delta_poc_msb_present_flag[MAX_DPB_SIZE];
+
+ /**
+ * delta_poc_msb_cycle_lt[ i ]
+ */
+ WORD8 ai1_delta_poc_msb_cycle_lt[MAX_DPB_SIZE];
+
+ /**
+ * slice_sao_luma_flag
+ */
+ WORD8 i1_slice_sao_luma_flag;
+
+ /**
+ * slice_sao_chroma_flag
+ */
+ WORD8 i1_slice_sao_chroma_flag;
+
+ /**
+ * slice_temporal_mvp_enable_flag
+ */
+ WORD8 i1_slice_temporal_mvp_enable_flag;
+
+ /**
+ * num_ref_idx_active_override_flag
+ */
+ WORD8 i1_num_ref_idx_active_override_flag;
+
+ /**
+ * num_ref_idx_l0_active_minus1
+ */
+ WORD8 i1_num_ref_idx_l0_active;
+
+ /**
+ * num_ref_idx_l1_active_minus1
+ */
+ WORD8 i1_num_ref_idx_l1_active;
+
+ /**
+ * mvd_l1_zero_flag
+ */
+ WORD8 i1_mvd_l1_zero_flag;
+
+ /**
+ * cabac_init_flag
+ */
+ WORD8 i1_cabac_init_flag;
+
+ /**
+ * collocated_from_l0_flag
+ */
+ WORD8 i1_collocated_from_l0_flag;
+
+ /**
+ * collocated_ref_idx
+ */
+ WORD8 i1_collocated_ref_idx;
+
+ /**
+ * five_minus_max_num_merge_cand
+ */
+ WORD8 i1_max_num_merge_cand;
+
+ /**
+ * slice_qp_delta
+ */
+ WORD8 i1_slice_qp_delta;
+
+ /**
+ * slice_cb_qp_offset
+ */
+ WORD8 i1_slice_cb_qp_offset;
+
+ /**
+ * slice_cr_qp_offset
+ */
+ WORD8 i1_slice_cr_qp_offset;
+
+ /**
+ * deblocking_filter_override_flag
+ */
+ WORD8 i1_deblocking_filter_override_flag;
+
+ /**
+ * slice_disable_deblocking_filter_flag
+ */
+ WORD8 i1_slice_disable_deblocking_filter_flag;
+
+ /**
+ * beta_offset_div2
+ */
+ WORD8 i1_beta_offset_div2;
+
+ /**
+ * tc_offset_div2
+ */
+ WORD8 i1_tc_offset_div2;
+
+ /**
+ * slice_loop_filter_across_slices_enabled_flag
+ */
+ WORD8 i1_slice_loop_filter_across_slices_enabled_flag;
+
+ /**
+     * Number of entry point offsets
+ */
+ WORD32 i4_num_entry_point_offsets;
+
+ /**
+ * offset_len_minus1
+ */
+ WORD8 i1_offset_len;
+
+ /**
+ * Entry point offsets
+ */
+ WORD32 *pu4_entry_point_offset;
+
+ /**
+ * Short term reference picture set
+ */
+ stref_picset_t s_stref_picset;
+
+ /**
+ * Weight and offset info for Weighted prediction
+ */
+ pred_wt_ofst_t s_wt_ofst;
+
+ /**
+ * Reference prediction list modification
+ */
+ rplm_t s_rplm;
+
+ /**
+     * First CTB's X pos : slice_address % i2_pic_wd_in_ctb
+ */
+ WORD16 i2_ctb_x;
+
+ /**
+     * First CTB's Y pos : slice_address / i2_pic_wd_in_ctb
+ */
+ WORD16 i2_ctb_y;
+
+ /**
+ * L0 Reference pic lists
+ */
+ ref_list_t as_ref_pic_list0[MAX_DPB_SIZE];
+
+ /**
+ * L1 Reference pic lists
+ */
+ ref_list_t as_ref_pic_list1[MAX_DPB_SIZE];
+
+ /**
+ * NAL unit type of the slice
+ */
+ WORD8 i1_nal_unit_type;
+
+ /**
+ * Low delay check flag
+ */
+ WORD8 i1_low_delay_flag;
+
+ /**
+ * The last independent slice's start ctb_x
+     * If the current slice is independent, it is the same as the current slice's ctb_x
+ */
+ WORD16 i2_independent_ctb_x;
+
+ /**
+ * The last independent slice's start ctb_y
+     * If the current slice is independent, it is the same as the current slice's ctb_y
+ */
+ WORD16 i2_independent_ctb_y;
+
+ UWORD8 u1_parse_data_init_done;
+
+}slice_header_t;
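+
+/* Illustrative sketch, not part of the decoder: i2_ctb_x and i2_ctb_y are
+ * derived from the slice address exactly as their comments above state. */
+static void slice_derive_ctb_xy(slice_header_t *ps_slice_hdr, WORD16 i2_pic_wd_in_ctb)
+{
+    ps_slice_hdr->i2_ctb_x = ps_slice_hdr->i2_slice_address % i2_pic_wd_in_ctb;
+    ps_slice_hdr->i2_ctb_y = ps_slice_hdr->i2_slice_address / i2_pic_wd_in_ctb;
+}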
+
+
+#if 0
+
+typedef struct
+{
+
+ /* scaling_list_pred_mode_flag */
+ WORD8 i1_scaling_list_pred_mode_flag;
+
+ /* scaling_list_pred_matrix_id_delta */
+ WORD8 i1_scaling_list_pred_matrix_id_delta;
+
+}sld_t;
+
+typedef struct
+{
+ /* scaling_list_dc_coef_minus8[ sizeID - 2 ][ matrixID ] */
+ WORD8 i1_scaling_list_dc_coef[ sizeID - 2 ][ matrixID ];
+
+ /* scaling_list_delta_coef */
+ WORD8 i1_scaling_list_delta_coef;
+
+}slm_t;
+
+typedef struct
+{
+
+ /* last_payload_type_byte */
+ UWORD8 i1_last_payload_type_byte;
+
+ /* last_payload_size_byte */
+ UWORD8 last_payload_size_byte;
+}sei_t;
+
+typedef struct
+{
+ /* pic_type*/
+ WORD8 pic_type;
+}aud_t;
+
+typedef struct
+{
+ /* slice_extention_flag */
+ WORD8 i1_slice_extention_flag;
+
+ /* slice_extension_data_flag */
+ WORD8 i1_slice_extension_data_flag;
+
+}slr_t;
+
+typedef struct
+{
+ /* op_num_layer_id_values_minus1[ opIdx ] */
+ WORD8 i1_op_num_layer_id_values_minus1[VPS_MAX_HRD_PARAMS];
+
+ /* op_layer_id[ opIdx ][ i ] */
+ WORD8 i1_op_layer_id[VPS_MAX_HRD_PARAMS][VPS_MAX_OP_LAYERS];
+}op_point_t;
+
+
+typedef struct
+{
+}sds_t;
+
+#endif
+
+
+
+
+
+
+#endif /* _IHEVC_STRUCTS_H_ */
diff --git a/common/ihevc_tables_x86_intr.h b/common/ihevc_tables_x86_intr.h
new file mode 100644
index 0000000..4ded3ea
--- /dev/null
+++ b/common/ihevc_tables_x86_intr.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_tables_x86_intr.h
+*
+* @brief
+* Declarations for the tables used by the x86 intrinsic functions
+*
+* @author
+* Mamatha
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_TABLES_X86_INTR_H_
+#define IHEVC_TABLES_X86_INTR_H_
+
+
+//Luma intra pred
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY1[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY2[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY3[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASK4[16];
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASK5[16];
+//Chroma intra pred
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY7[16];
+
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY8[16];
+
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY9[16];
+
+extern MEM_ALIGN16 const UWORD8 IHEVCE_SHUFFLEMASKY11[16];
+
+extern MEM_ALIGN16 const UWORD8 inv_angle_shuffle[7][32];
+// DEBLOCK TABLES
+extern MEM_ALIGN16 const WORD8 coef_d[16];
+extern MEM_ALIGN16 const WORD8 coef_de1[16];
+extern MEM_ALIGN16 const WORD8 coef_dep1[16];
+extern MEM_ALIGN16 const WORD32 shuffle_d[4];
+extern const WORD32 shuffle0[2];
+extern MEM_ALIGN16 const WORD32 shuffle1[4];
+extern MEM_ALIGN16 const WORD32 shuffle2[4];
+extern MEM_ALIGN16 const WORD32 shuffle3[4];
+
+extern MEM_ALIGN16 const WORD8 delta0[16];
+extern MEM_ALIGN16 const WORD8 delta1[16];
+extern MEM_ALIGN16 const WORD32 shuffle_uv[4];
+
+#endif /*IHEVC_TABLES_X86_INTR_H_*/
diff --git a/common/ihevc_trans.h b/common/ihevc_trans.h
new file mode 100644
index 0000000..45cc6b8
--- /dev/null
+++ b/common/ihevc_trans.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_trans.h
+*
+* @brief
+* Functions declarations for forward transform
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_TRANS_H_
+#define _IHEVC_TRANS_H_
+
+typedef void ihevc_trans_4x4_ttype1_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_rows);
+typedef void ihevc_trans_4x4_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_rows);
+typedef void ihevc_trans_8x8_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_rows);
+typedef void ihevc_trans_16x16_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_rows);
+typedef void ihevc_trans_32x32_ft(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 i4_src_strd,
+ WORD32 i4_dst_strd,
+ WORD32 i4_shift,
+ WORD32 i4_zero_rows);
+
+ihevc_trans_4x4_ttype1_ft ihevc_trans_4x4_ttype1;
+ihevc_trans_4x4_ft ihevc_trans_4x4;
+ihevc_trans_8x8_ft ihevc_trans_8x8;
+ihevc_trans_16x16_ft ihevc_trans_16x16;
+ihevc_trans_32x32_ft ihevc_trans_32x32;
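+
+/* Illustrative sketch, not part of the decoder: the function typedefs above
+ * let C and SIMD variants that share a signature be selected through a
+ * single pointer at init time. The selector below is hypothetical; only the
+ * C variant is referenced here. */
+static ihevc_trans_8x8_ft *ihevc_select_trans_8x8(WORD32 i4_have_simd)
+{
+    (void)i4_have_simd; /* a real selector would pick a SIMD variant when non-zero */
+    return &ihevc_trans_8x8;
+}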
+
+
+#endif /*_IHEVC_TRANS_H_*/
diff --git a/common/ihevc_trans_macros.h b/common/ihevc_trans_macros.h
new file mode 100644
index 0000000..079784d
--- /dev/null
+++ b/common/ihevc_trans_macros.h
@@ -0,0 +1,182 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_trans_macros.h
+*
+* @brief
+* Macros used in the forward transform and inverse transform functions
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef IHEVC_TRANS_MACROS_H_
+#define IHEVC_TRANS_MACROS_H_
+
+#define QUANT(out, inp, quant_coeff, qp_div, log2_trans_size, q_add) \
+{ \
+ LWORD64 tmp; \
+ WORD32 sign; \
+ WORD32 bit_depth,transform_shift; \
+ WORD32 q_bits, quant_multiplier; \
+ \
+ /* q_bits and q_add calculation*/ \
+    /* To be moved outside in neon. To be computed once per transform call */ \
+ bit_depth = 8; \
+ transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size; \
+ quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */ \
+ q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier ; \
+ \
+ sign = (inp)<0 ? -1:1; \
+ \
+ tmp = (LWORD64)(abs(inp)); \
+ tmp = tmp * (quant_coeff); \
+ tmp = tmp + (((LWORD64)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)); \
+ tmp = tmp >> q_bits; \
+ \
+ tmp = tmp * sign; \
+ out = (WORD16) CLIP_S16(tmp); \
+}
+
+#define QUANT_HBD(out, inp, quant_coeff, qp_div, log2_trans_size, q_add, bit_depth) \
+{ \
+ LWORD64 tmp; \
+ WORD32 sign; \
+ WORD32 transform_shift; \
+ WORD32 q_bits, quant_multiplier; \
+ \
+ /* q_bits and q_add calculation*/ \
+    /* To be moved outside in neon. To be computed once per transform call */ \
+ \
+ transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size; \
+ quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */ \
+ q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier ; \
+ \
+ sign = (inp)<0 ? -1:1; \
+ \
+ tmp = (LWORD64)(abs(inp)); \
+ tmp = tmp * (quant_coeff); \
+ tmp = tmp + (((LWORD64)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)); \
+ tmp = tmp >> q_bits; \
+ \
+ tmp = tmp * sign; \
+ out = (WORD16) CLIP_S16(tmp); \
+}
+/* added by 100028 */
+#define QUANT_NO_WEIGHTMAT(out, inp, quant_coeff, qp_div, log2_trans_size, q_add) \
+{ \
+ WORD32 tmp; \
+ WORD32 sign; \
+ WORD32 bit_depth,transform_shift; \
+ WORD32 q_bits, quant_multiplier; \
+ \
+ /* q_bits and q_add calculation*/ \
+    /* To be moved outside in neon. To be computed once per transform call */ \
+ bit_depth = 8; \
+ transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size; \
+ quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */ \
+ q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */; \
+ \
+ sign = (inp)<0 ? -1:1; \
+ \
+ tmp = (WORD32)(abs(inp)); \
+ tmp = tmp * (quant_coeff); \
+ tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)); \
+ tmp = tmp >> q_bits; \
+ \
+ tmp = tmp * sign; \
+ out = (WORD16) CLIP_S16(tmp); \
+}
+
+#define QUANT_NO_WEIGHTMAT_HBD(out, inp, quant_coeff, qp_div, log2_trans_size, q_add, bit_depth) \
+{ \
+ WORD32 tmp; \
+ WORD32 sign; \
+ WORD32 transform_shift; \
+ WORD32 q_bits, quant_multiplier; \
+ \
+ /* q_bits and q_add calculation*/ \
+    /* To be moved outside in neon. To be computed once per transform call */ \
+ \
+ transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size; \
+ quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */ \
+ q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */; \
+ \
+ sign = (inp)<0 ? -1:1; \
+ \
+ tmp = (WORD32)(abs(inp)); \
+ tmp = tmp * (quant_coeff); \
+ tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)); \
+ tmp = tmp >> q_bits; \
+ \
+ tmp = tmp * sign; \
+ out = (WORD16) CLIP_S16(tmp); \
+}
+/* Reference inverse quantization: "pi2_src" (coefficients) will be clipped to 15 or 14 bits when (qp_div > shift_iq). The spec does not mention any such clipping. */
+
+/* Inverse quantization other than 4x4 */
+/* No clipping is needed for "pi2_src"(coefficients) */
+#define IQUANT(res, coeff /*pi2_src[index*src_strd]*/, dequant_coeff /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */, shift_iq, qp_div) \
+{ \
+ WORD32 tmp, add_iq; \
+ \
+ add_iq = SHL_NEG(1 , (shift_iq - qp_div - 1)); /* To be moved outside in neon. To be computed once per transform call */ \
+ \
+ tmp = coeff * dequant_coeff ; \
+ tmp = tmp + add_iq; \
+ tmp = SHR_NEG(tmp,(shift_iq - qp_div)); \
+ \
+ res = CLIP_S16(tmp); \
+}
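+
+/* Illustrative sketch, not part of the decoder: the IQUANT macro above as a
+ * plain function, with SHL_NEG/SHR_NEG expanded (a shift by a negative
+ * amount becomes a shift in the opposite direction). The name is
+ * hypothetical. */
+static WORD16 ihevc_iquant_one_coeff(WORD32 coeff, WORD32 dequant_coeff,
+                                     WORD32 shift_iq, WORD32 qp_div)
+{
+    WORD32 shift = shift_iq - qp_div;
+    WORD32 add_iq = (shift > 0) ? (1 << (shift - 1)) : 0;
+    WORD32 tmp = coeff * dequant_coeff + add_iq;
+
+    tmp = (shift >= 0) ? (tmp >> shift) : (tmp << (-shift));
+
+    /* CLIP_S16 */
+    if(tmp > 32767)
+        tmp = 32767;
+    if(tmp < -32768)
+        tmp = -32768;
+    return (WORD16)tmp;
+}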
+
+/* 4x4 inverse quantization */
+/* Options : */
+/* 1. Clip "pi2_src"(coefficients) to 10 bits if "(qp_div >= shift_iq)" or 16 bits if "(qp_div < shift_iq)"*/
+/* 2. Increasing precision of "pi2_src"(coefficients) to 64 bits */
+
+#define IQUANT_4x4(res, coeff /*pi2_src[index*src_strd]*/, dequant_coeff /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */, shift_iq, qp_div) \
+{ \
+ WORD32 clip_coeff, tmp; \
+ WORD32 coeff_min,coeff_max; \
+ WORD32 coeff_bit_range; \
+ WORD32 add_iq; \
+ add_iq = SHL_NEG(1 , (shift_iq - qp_div - 1)); /* To be moved outside in neon. To be computed once per transform call */ \
+ \
+ coeff_bit_range = 16; \
+ if(qp_div > shift_iq) \
+ coeff_bit_range = 10; \
+ \
+ coeff_min = -(1<<(coeff_bit_range-1)); \
+ coeff_max = (1<<(coeff_bit_range-1)) - 1; \
+ \
+ clip_coeff = CLIP3(coeff,coeff_min,coeff_max); \
+ \
+ tmp = clip_coeff * dequant_coeff ; \
+ tmp = tmp + add_iq; \
+ tmp = SHR_NEG(tmp,(shift_iq - qp_div)); \
+ \
+ res = CLIP_S16(tmp); \
+}
+
+#endif /* IHEVC_TRANS_MACROS_H_ */
diff --git a/common/ihevc_trans_tables.c b/common/ihevc_trans_tables.c
new file mode 100644
index 0000000..139699a
--- /dev/null
+++ b/common/ihevc_trans_tables.c
@@ -0,0 +1,926 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_trans_tables.c
+*
+* @brief
+* Contains tables used for forward and inverse transform
+*
+* @author
+* 100470
+*
+* @par List of Tables:
+* g_ihevc_iquant_scales
+* g_ihevc_iquant_intr_scales
+* g_ihevc_quant_scales
+* g_ai4_ihevc_trans_4_ttype1
+* g_ai4_ihevc_trans_4_ttype0
+* g_ai2_ihevc_trans_dst_4
+* g_ai4_ihevc_trans_dst_intr_4
+* g_ai2_ihevc_trans_4
+* g_ai2_ihevc_trans_4_transpose
+* g_ai4_ihevc_trans_4_intr
+* g_ai2_ihevc_trans_4_intr
+* g_ai2_ihevc_trans_8
+* g_ai2_ihevc_trans_8_transpose
+* g_ai4_ihevc_trans_8_intr
+* g_ai2_ihevc_trans_8_intr
+* g_ai4_ihevc_trans_intr_even_8
+* g_ai4_ihevc_trans_intr_odd_8
+* g_ai2_ihevc_trans_16
+* g_ai2_ihevc_trans_16_transpose
+* g_ai2_ihevc_trans_32_intr_8
+* g_ai4_ihevc_trans_16_even
+* g_ai4_ihevc_trans_16_odd
+* g_ai2_ihevc_trans_32_transpose
+* g_ai2_ihevc_trans_32
+* g_ai2_ihevc_trans_32_intr_16
+* g_ai2_ihevc_trans_16_intr_odd
+* g_ai2_ihevc_trans_16_intr_even
+* g_ai2_ihevc_trans_32_intr_even
+* g_ai2_ihevc_trans_32_intr_odd
+* g_ai2_ihevc_trans_16_even_packed
+* g_ai2_ihevc_trans_32_intr_packed
+* g_ai2_ihevc_trans_32_intr_odd_packed
+* g_ai2_ihevc_trans_16_even
+* g_ai2_ihevc_trans_16_odd
+* g_ai2_ihevc_trans_intr_even_8
+* g_ai2_ihevc_trans_intr_odd_8
+* g_ai2_ihevc_trans_intr_4
+* IHEVCE_CHROMA_SHUFFLEMASK_HBD
+* g_ai4_ihevc_trans_8_intr_avx2
+* g_ai2_ihevc_trans_8_intr_avx2
+* g_ai2_ihevc_trans_32_intr_8_avx2
+* g_ai2_ihevc_trans_32_intr_16_avx2
+* g_ai2_ihevc_trans_16_intr_odd_avx2
+* g_ai2_ihevc_trans_16_intr_even_avx2
+
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_macros.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_defs.h"
+
+const WORD32 g_ihevc_iquant_scales[6] =
+{
+ 40, 45, 51, 57, 64, 72
+};
+
+const WORD16 g_ihevc_iquant_intr_scales[6][8] =
+{
+ { 40, 40, 40, 40, 40, 40, 40, 40 },
+ { 45, 45, 45, 45, 45, 45, 45, 45 },
+ { 51, 51, 51, 51, 51, 51, 51, 51 },
+ { 57, 57, 57, 57, 57, 57, 57, 57 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 72, 72, 72, 72, 72, 72, 72, 72 }
+};
+
+const WORD32 g_ihevc_quant_scales[6] =
+{
+ 26214, 23302, 20560, 18396, 16384, 14564
+};
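+
+/* Illustrative note, not part of the decoder: the two scale tables are
+ * inverses in Q20, i.e. g_ihevc_quant_scales[i] * g_ihevc_iquant_scales[i]
+ * is approximately 1 << 20 for every qp remainder i (e.g. 26214 * 40 =
+ * 1048560, against 1 << 20 = 1048576). A hypothetical sanity check: */
+static WORD32 ihevc_quant_scale_q20_product(WORD32 qp_rem)
+{
+    return g_ihevc_quant_scales[qp_rem] * g_ihevc_iquant_scales[qp_rem];
+}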
+
+//DST coeffs
+const WORD32 g_ai4_ihevc_trans_4_ttype1[3][4] =
+{
+ { 55, 55, 55, 55 },
+ { 29, 29, 29, 29 },
+ { 74, 74, 74, 74 }
+};
+
+//DCT coeffs
+const WORD32 g_ai4_ihevc_trans_4_ttype0[3][4] =
+{
+ { 36, 36, 36, 36 },
+ { 64, 64, 64, 64 },
+ { 83, 83, 83, 83 }
+};
+
+const WORD16 g_ai2_ihevc_trans_dst_4[4][4] =
+{
+ { 29, 55, 74, 84 },
+ { 74, 74, 0, -74 },
+ { 84, -29, -74, 55 },
+ { 55, -84, 74, -29 }
+};
+
+const WORD32 g_ai4_ihevc_trans_dst_intr_4[3][4] =
+{ /* 4*32 = 128 bit */
+ { 29, 29, 29, 29 },
+ { 55, 55, 55, 55 },
+ { 74, 74, 74, 74 }
+};
+
+const WORD16 g_ai2_ihevc_trans_4[4][4] =
+{
+ { 64, 64, 64, 64 },
+ { 83, 36, -36, -83 },
+ { 64, -64, -64, 64 },
+ { 36, -83, 83, -36 }
+};
+
+const WORD16 g_ai2_ihevc_trans_4_transpose[4][4] =
+{
+ { 64, 83, 64, 36 },
+ { 64, 36, -64, -83 },
+ { 64, -36, -64, 83 },
+ { 64, -83, 64, -36 }
+};
+
+const WORD32 g_ai4_ihevc_trans_4_intr[3][4] =
+{ /* 4*32 = 128 bit */
+ { 64, 64, 64, 64 },
+ { 83, 83, 83, 83 },
+ { 36, 36, 36, 36 }
+};
+
+const WORD16 g_ai2_ihevc_trans_4_intr[8] = { 64, 64, 83, 36, 64, -64, 36, -83 };
+
+
+const WORD16 g_ai2_ihevc_trans_8[8][8] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 89, 75, 50, 18, -18, -50, -75, -89 },
+ { 83, 36, -36, -83, -83, -36, 36, 83 },
+ { 75, -18, -89, -50, 50, 89, 18, -75 },
+ { 64, -64, -64, 64, 64, -64, -64, 64 },
+ { 50, -89, 18, 75, -75, -18, 89, -50 },
+ { 36, -83, 83, -36, -36, 83, -83, 36 },
+ { 18, -50, 75, -89, 89, -75, 50, -18 }
+};
+
+/* Used by itrans_recon_8x8 */
+const WORD16 g_ai2_ihevc_trans_8_transpose[8][8] =
+{
+ { 64, 89, 83, 75, 64, 50, 36, 18 },
+ { 64, 75, 36, -18, -64, -89, -83, -50 },
+ { 64, 50, -36, -89, -64, 18, 83, 75 },
+ { 64, 18, -83, -50, 64, 75, -36, -89 },
+ { 64, -18, -83, 50, 64, -75, -36, 89 },
+ { 64, -50, -36, 89, -64, -18, 83, -75 },
+ { 64, -75, 36, 18, -64, 89, -83, 50 },
+ { 64, -89, 83, -75, 64, -50, 36, -18 }
+};
+
+const WORD32 g_ai4_ihevc_trans_8_intr[7][4] =
+{ /* 4*32 = 128 bit */
+ { 64, 64, 64, 64 },
+ { 83, 83, 83, 83 },
+ { 36, 36, 36, 36 },
+ { 75, 75, 75, 75 },
+ { 18, 18, 18, 18 },
+ { 89, 89, 89, 89 },
+ { 50, 50, 50, 50 },
+};
+
+
+const WORD16 g_ai2_ihevc_trans_8_intr[8][8] =
+{ /* 4*32 = 128 bit */
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 89, 75, 18, 50, 89, 75, 18, 50 },
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 75, -18, -50, -89, 75, -18, -50, -89 },
+ { 64, -64, 64, -64, 64, -64, 64, -64 },
+ { 50, -89, 75, 18, 50, -89, 75, 18 },
+ { 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 18, -50, -89, 75, 18, -50, -89, 75 }
+};
+
+
+const WORD32 g_ai4_ihevc_trans_intr_even_8[3][4] =
+{
+ { 64, 64, 64, 64 },
+ { 83, 83, 83, 83 },
+ { 36, 36, 36, 36 },
+};
+
+const WORD32 g_ai4_ihevc_trans_intr_odd_8[4][4] =
+{
+ { 89, 89, 89, 89 },
+ { 75, 75, 75, 75 },
+ { 50, 50, 50, 50 },
+ { 18, 18, 18, 18 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16[16][16] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 },
+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 },
+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 },
+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 },
+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 },
+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 },
+ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 },
+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_transpose[1][16] =
+{
+ { 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9 }
+};
+
+const WORD32 g_ai2_ihevc_trans_32_intr_8[8][4] =
+{ /* 4*32 = 128 bit */
+ { 90, 90, 90, 90 },
+ { 87, 87, 87, 87 },
+ { 80, 80, 80, 80 },
+ { 70, 70, 70, 70 },
+ { 57, 57, 57, 57 },
+ { 43, 43, 43, 43 },
+ { 25, 25, 25, 25 },
+ { 9, 9, 9, 9 }
+};
+
+const WORD32 g_ai4_ihevc_trans_16_even[7][4] =
+{
+ { 64, 64, 64, 64 },
+ { 89, 89, 89, 89 },
+ { 75, 75, 75, 75 },
+ { 83, 83, 83, 83 },
+ { 36, 36, 36, 36 },
+ { 18, 18, 18, 18 },
+ { 50, 50, 50, 50 },
+};
+
+const WORD32 g_ai4_ihevc_trans_16_odd[8][4] =
+{
+ { 90, 90, 90, 90 },
+ { 87, 87, 87, 87 },
+ { 80, 80, 80, 80 },
+ { 70, 70, 70, 70 },
+ { 57, 57, 57, 57 },
+ { 43, 43, 43, 43 },
+ { 25, 25, 25, 25 },
+ { 9, 9, 9, 9 }
+};
+
+const WORD16 g_ai2_ihevc_trans_32_transpose[1][32] =
+{
+ { 64, 90, 90, 90, 89, 88, 87, 85, 83, 82, 80, 78, 75, 73, 70, 67, 64, 61, 57, 54, 50, 46, 43, 38, 36, 31, 25, 22, 18, 13, 9, 4 }
+};
+const WORD16 g_ai2_ihevc_trans_32[32][32] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
+ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
+ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
+ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
+ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
+ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
+ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
+ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
+ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
+ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
+ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
+ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
+ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
+ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
+ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
+ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
+ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }
+};
+
+
+
+const WORD32 g_ai2_ihevc_trans_32_intr_16[15][4] =
+{ /* 4*32 = 128 bit */
+ { 90, 90, 90, 90 },
+ { 88, 88, 88, 88 },
+ { 85, 85, 85, 85 },
+ { 82, 82, 82, 82 },
+ { 78, 78, 78, 78 },
+ { 73, 73, 73, 73 },
+ { 67, 67, 67, 67 },
+ { 61, 61, 61, 61 },
+ { 54, 54, 54, 54 },
+ { 46, 46, 46, 46 },
+ { 38, 38, 38, 38 },
+ { 31, 31, 31, 31 },
+ { 22, 22, 22, 22 },
+ { 13, 13, 13, 13 },
+ { 4, 4, 4, 4 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_intr_odd[32][8] =
+{
+ { 90, 87, 90, 87, 90, 87, 90, 87 },
+ { 70, 80, 70, 80, 70, 80, 70, 80 },
+ { 57, 43, 57, 43, 57, 43, 57, 43 },
+ { 9, 25, 9, 25, 9, 25, 9, 25 },
+ { 87, 57, 87, 57, 87, 57, 87, 57 },
+ { -43, 9, -43, 9, -43, 9, -43, 9 },
+ { -80, -90, -80, -90, -80, -90, -80, -90 },
+ { -25, -70, -25, -70, -25, -70, -25, -70 },
+ { 80, 9, 80, 9, 80, 9, 80, 9 },
+ { -87, -70, -87, -70, -87, -70, -87, -70 },
+ { -25, 57, -25, 57, -25, 57, -25, 57 },
+ { 43, 90, 43, 90, 43, 90, 43, 90 },
+ { 70, -43, 70, -43, 70, -43, 70, -43 },
+ { 9, -87, 9, -87, 9, -87, 9, -87 },
+ { 90, 25, 90, 25, 90, 25, 90, 25 },
+ { -57, -80, -57, -80, -57, -80, -57, -80 },
+ { 57, -80, 57, -80, 57, -80, 57, -80 },
+ { 90, -25, 90, -25, 90, -25, 90, -25 },
+ { -9, -87, -9, -87, -9, -87, -9, -87 },
+ { 70, 43, 70, 43, 70, 43, 70, 43 },
+ { 43, -90, 43, -90, 43, -90, 43, -90 },
+ { 25, 57, 25, 57, 25, 57, 25, 57 },
+ { -87, 70, -87, 70, -87, 70, -87, 70 },
+ { -80, 9, -80, 9, -80, 9, -80, 9 },
+ { 25, -70, 25, -70, 25, -70, 25, -70 },
+ { -80, 90, -80, 90, -80, 90, -80, 90 },
+ { 43, 9, 43, 9, 43, 9, 43, 9 },
+ { 87, -57, 87, -57, 87, -57, 87, -57 },
+ { 9, -25, 9, -25, 9, -25, 9, -25 },
+ { -57, 43, -57, 43, -57, 43, -57, 43 },
+ { 70, -80, 70, -80, 70, -80, 70, -80 },
+ { -90, 87, -90, 87, -90, 87, -90, 87 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_intr_even[12][8] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 89, 75, 89, 75, 89, 75, 89, 75 },
+ { 18, 50, 18, 50, 18, 50, 18, 50 },
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 75, -18, 75, -18, 75, -18, 75, -18 },
+ { -50, -89, -50, -89, -50, -89, -50, -89 },
+ { 64, -64, 64, -64, 64, -64, 64, -64 },
+ { 50, -89, 50, -89, 50, -89, 50, -89 },
+ { 75, 18, 75, 18, 75, 18, 75, 18 },
+ { 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 18, -50, 18, -50, 18, -50, 18, -50 },
+ { -89, 75, -89, 75, -89, 75, -89, 75 }
+};
+
+
+const WORD16 g_ai2_ihevc_trans_32_intr_even[22][8] =
+{
+ { 64, 64, 64, 64, 83, 36, 83, 36 },
+ { 64, -64, 64, -64, 36, -83, 36, -83 },
+ { 89, 18, 89, 18, 75, 50, 75, 50 },
+ { 75, -50, 75, -50, -18, -89, -18, -89 },
+ { 50, 75, 50, 75, -89, 18, -89, 18 },
+ { 18, -89, 18, -89, -50, 75, -50, 75 },
+
+ { 90, 70, 90, 70, 87, 80, 87, 80 },
+ { 9, 57, 9, 57, 25, 43, 25, 43 },
+ { 87, -43, 87, -43, 57, 9, 57, 9 },
+ { -25, -80, -25, -80, -70, -90, -70, -90 },
+ { 80, -87, 80, -87, 9, -70, 9, -70 },
+ { 43, -25, 43, -25, 90, 57, 90, 57 },
+ { 70, 9, 70, 9, -43, -87, -43, -87 },
+ { -57, 90, -57, 90, -80, 25, -80, 25 },
+ { 57, 90, 57, 90, -80, -25, -80, -25 },
+ { 70, -9, 70, -9, 43, -87, 43, -87 },
+ { 43, 25, 43, 25, -90, 57, -90, 57 },
+ { -80, -87, -80, -87, 9, 70, 9, 70 },
+ { 25, -80, 25, -80, -70, 90, -70, 90 },
+ { 87, 43, 87, 43, -57, 9, -57, 9 },
+ { 9, -57, 9, -57, -25, 43, -25, 43 },
+ { -90, 70, -90, 70, 87, -80, 87, -80 }
+};
+
+
+const WORD16 g_ai2_ihevc_trans_32_intr_odd[32][16] =
+{
+ { 90, 85, 90, 85, 90, 88, 90, 88, 61, 82, 61, 82, -73, -46, -73, -46 },
+ { 67, 82, 67, 82, 73, 78, 73, 78, 90, 31, 90, 31, -13, -88, -13, -88 },
+ { 61, 38, 61, 38, 54, 46, 54, 46, -4, 85, -4, 85, -90, 22, -90, 22 },
+ { 4, 31, 4, 31, 13, 22, 13, 22, 67, -38, 67, -38, 54, -78, 54, -78 },
+
+ { 90, 46, 90, 46, 82, 67, 82, 67, 54, 88, 54, 88, -85, -4, -85, -4 },
+ { -54, 22, -54, 22, -31, -4, -31, -4, 13, -46, 13, -46, 82, -61, 82, -61 },
+ { -73, -88, -73, -88, -85, -90, -85, -90, -90, -78, -90, -78, 38, 67, 38, 67 },
+ { -13, -78, -13, -78, -38, -61, -38, -61, -73, -22, -73, -22, -31, 90, -31, 90 },
+
+ { 88, -13, 88, -13, 67, 31, 67, 31, 46, 54, 46, 54, -90, 38, -90, 38 },
+ { -78, -54, -78, -54, -90, -82, -90, -82, -88, -90, -88, -90, 61, 31, 61, 31 },
+ { -46, 73, -46, 73, -4, 38, -4, 38, 22, 13, 22, 13, 67, -85, 67, -85 },
+ { 22, 90, 22, 90, 61, 85, 61, 85, 78, 73, 78, 73, 4, -82, 4, -82 },
+
+ { 85, -67, 85, -67, 46, -13, 46, -13, 38, -4, 38, -4, -88, 73, -88, 73 },
+ { 38, -90, 38, -90, -22, -73, -22, -73, -31, -67, -31, -67, -46, 90, -46, 90 },
+ { 82, -4, 82, -4, 88, 54, 88, 54, 85, 61, 85, 61, -78, 13, -78, 13 },
+ { -31, -61, -31, -61, -78, -90, -78, -90, -82, -90, -82, -90, 22, 54, 22, 54 },
+
+ { 82, -90, 82, -90, 22, -54, 22, -54, 31, -61, 31, -61, -78, 90, -78, 90 },
+ { 85, -61, 85, -61, 78, 13, 78, 13, 82, 4, 82, 4, -88, 54, -88, 54 },
+ { 31, -67, 31, -67, -46, -90, -46, -90, -38, -90, -38, -90, -22, 73, -22, 73 },
+ { 38, 4, 38, 4, 88, 73, 88, 73, 85, 67, 85, 67, -46, -13, -46, -13 },
+
+ { 78, -73, 78, -73, -4, -82, -4, -82, 22, -90, 22, -90, -61, 85, -61, 85 },
+ { -22, 13, -22, 13, 67, 85, 67, 85, 46, 73, 46, 73, -4, -38, -4, -38 },
+ { -88, 90, -88, 90, -61, 31, -61, 31, -78, 54, -78, 54, 90, -82, 90, -82 },
+ { -46, 54, -46, 54, -90, -38, -90, -38, -88, -13, -88, -13, 67, -31, 67, -31 },
+
+ { 73, -22, 73, -22, -31, -90, -31, -90, 13, -78, 13, -78, -38, 61, -38, 61 },
+ { -90, 78, -90, 78, -38, 67, -38, 67, -73, 88, -73, 88, 85, -90, 85, -90 },
+ { -13, -46, -13, -46, 82, 61, 82, 61, 54, 22, 54, 22, -31, 4, -31, 4 },
+ { 54, -88, 54, -88, 85, -4, 85, -4, 90, -46, 90, -46, -82, 67, -82, 67 },
+
+ { 67, 38, 67, 38, -54, -78, -54, -78, 4, -31, 4, -31, -13, 22, -13, 22 },
+ { 4, 85, 4, 85, -90, -22, -90, -22, -61, 38, -61, 38, 54, -46, 54, -46 },
+ { 90, -31, 90, -31, 13, -88, 13, -88, 67, -82, 67, -82, -73, 78, -73, 78 },
+ { -61, 82, -61, 82, -73, 46, -73, 46, -90, 85, -90, 85, 90, -88, 90, -88 }
+
+};
+
+
+/*Tables for itrans_recon functions*/
+const WORD16 g_ai2_ihevc_trans_16_even_packed[12][8] =
+{
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+
+ { 36, -83, 36, -83, 36, -83, 36, -83 },
+
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+
+ { 64, -64, 64, -64, 64, -64, 64, -64 },
+
+ { 89, 75, 89, 75, 89, 75, 89, 75 },
+
+ { 50, 18, 50, 18, 50, 18, 50, 18 },
+
+ { 75, -18, 75, -18, 75, -18, 75, -18 },
+
+ { 89, 50, 89, 50, 89, 50, 89, 50 },
+
+ { 50, -89, 50, -89, 50, -89, 50, -89 },
+
+ { 18, 75, 18, 75, 18, 75, 18, 75 },
+
+ { 18, -50, 18, -50, 18, -50, 18, -50 },
+
+ { 75, -89, 75, -89, 75, -89, 75, -89 },
+
+
+};
+
+const WORD16 g_ai2_ihevc_trans_32_intr_packed[32][8] =
+{
+ { 90, 87, 90, 87, 90, 87, 90, 87 },
+
+ { 80, 70, 80, 70, 80, 70, 80, 70 },
+
+ { 57, 43, 57, 43, 57, 43, 57, 43 },
+
+ { 25, 9, 25, 9, 25, 9, 25, 9 },
+
+ { 87, 57, 87, 57, 87, 57, 87, 57 },
+
+ { 9, -43, 9, -43, 9, -43, 9, -43 },
+
+ { 80, 90, 80, 90, 80, 90, 80, 90 },
+
+ { 70, 25, 70, 25, 70, 25, 70, 25 },
+
+ { 80, 9, 80, 9, 80, 9, 80, 9 },
+
+ { 70, 87, 70, 87, 70, 87, 70, 87 },
+
+ { -25, 57, -25, 57, -25, 57, -25, 57 },
+
+ { 90, 43, 90, 43, 90, 43, 90, 43 },
+
+ { 70, -43, 70, -43, 70, -43, 70, -43 },
+
+ { -87, 9, -87, 9, -87, 9, -87, 9 },
+
+ { 90, 25, 90, 25, 90, 25, 90, 25 },
+
+ { 80, 57, 80, 57, 80, 57, 80, 57 },
+
+ { 57, -80, 57, -80, 57, -80, 57, -80 },
+
+ { -25, 90, -25, 90, -25, 90, -25, 90 },
+
+ { 9, 87, 9, 87, 9, 87, 9, 87 },
+
+ { 43, 70, 43, 70, 43, 70, 43, 70 },
+
+ { 43, -90, 43, -90, 43, -90, 43, -90 },
+
+ { 57, 25, 57, 25, 57, 25, 57, 25 },
+
+ { -87, 70, -87, 70, -87, 70, -87, 70 },
+
+ { 9, -80, 9, -80, 9, -80, 9, -80 },
+
+ { 25, -70, 25, -70, 25, -70, 25, -70 },
+
+ { 90, -80, 90, -80, 90, -80, 90, -80 },
+
+ { 43, 9, 43, 9, 43, 9, 43, 9 },
+
+ { -57, 87, -57, 87, -57, 87, -57, 87 },
+
+ { 9, -25, 9, -25, 9, -25, 9, -25 },
+
+ { 43, -57, 43, -57, 43, -57, 43, -57 },
+
+ { 70, -80, 70, -80, 70, -80, 70, -80 },
+
+ { 87, -90, 87, -90, 87, -90, 87, -90 },
+
+};
+
+const WORD16 g_ai2_ihevc_trans_32_intr_odd_packed[128][8] =
+{
+ /*o0*/
+ { 90, 90, 90, 90, 90, 90, 90, 90 },
+ { 88, 85, 88, 85, 88, 85, 88, 85 },
+ { 82, 78, 82, 78, 82, 78, 82, 78 },
+ { 73, 67, 73, 67, 73, 67, 73, 67 },
+ { 61, 54, 61, 54, 61, 54, 61, 54 },
+ { 46, 38, 46, 38, 46, 38, 46, 38 },
+ { 31, 22, 31, 22, 31, 22, 31, 22 },
+ { 13, 4, 13, 4, 13, 4, 13, 4 },
+
+ /*o1*/
+
+ { 90, 82, 90, 82, 90, 82, 90, 82 },
+ { 67, 46, 67, 46, 67, 46, 67, 46 },
+ { -22, 4, -22, 4, -22, 4, -22, 4 },
+ { 31, 54, 31, 54, 31, 54, 31, 54 },
+ { 73, 85, 73, 85, 73, 85, 73, 85 },
+ { 90, 88, 90, 88, 90, 88, 90, 88 },
+ { 78, 61, 78, 61, 78, 61, 78, 61 },
+ { 38, 13, 38, 13, 38, 13, 38, 13 },
+
+ /*o2*/
+ { 88, 67, 88, 67, 88, 67, 88, 67 },
+ { -31, 13, -31, 13, -31, 13, -31, 13 },
+ { 54, 82, 54, 82, 54, 82, 54, 82 },
+ { 90, 78, 90, 78, 90, 78, 90, 78 },
+ { 46, 4, 46, 4, 46, 4, 46, 4 },
+ { 38, 73, 38, 73, 38, 73, 38, 73 },
+ { 90, 85, 90, 85, 90, 85, 90, 85 },
+ { 61, 22, 61, 22, 61, 22, 61, 22 },
+
+ /*o3*/
+ { 85, 46, 85, 46, 85, 46, 85, 46 },
+ { 13, 67, 13, 67, 13, 67, 13, 67 },
+ { 90, 73, 90, 73, 90, 73, 90, 73 },
+ { 22, -38, 22, -38, 22, -38, 22, -38 },
+ { 82, 88, 82, 88, 82, 88, 82, 88 },
+ { -54, 4, -54, 4, -54, 4, -54, 4 },
+ { 61, 90, 61, 90, 61, 90, 61, 90 },
+ { 78, 31, 78, 31, 78, 31, 78, 31 },
+
+ /*o4*/
+ { -82, -22, -82, -22, -82, -22, -82, -22 },
+ { 54, 90, 54, 90, 54, 90, 54, 90 },
+ { 61, -13, 61, -13, 61, -13, 61, -13 },
+ { -78, -85, -78, -85, -78, -85, -78, -85 },
+ { -31, 46, -31, 46, -31, 46, -31, 46 },
+ { 90, 67, 90, 67, 90, 67, 90, 67 },
+ { -4, -73, -4, -73, -4, -73, -4, -73 },
+ { -88, -38, -88, -38, -88, -38, -88, -38 },
+
+ /*o5*/
+ { -78, 4, -78, 4, -78, 4, -78, 4 },
+ { 82, 73, 82, 73, 82, 73, 82, 73 },
+ { -13, -85, -13, -85, -13, -85, -13, -85 },
+ { -67, 22, -67, 22, -67, 22, -67, 22 },
+ { 88, 61, 88, 61, 88, 61, 88, 61 },
+ { -31, -90, -31, -90, -31, -90, -31, -90 },
+ { -54, 38, -54, 38, -54, 38, -54, 38 },
+ { 90, 46, 90, 46, 90, 46, 90, 46 },
+
+ /*o6*/
+ { -73, 31, -73, 31, -73, 31, -73, 31 },
+ { 90, 22, 90, 22, 90, 22, 90, 22 },
+ { -78, -67, -78, -67, -78, -67, -78, -67 },
+ { 38, 90, 38, 90, 38, 90, 38, 90 },
+ { 13, -82, 13, -82, 13, -82, 13, -82 },
+ { -61, 46, -61, 46, -61, 46, -61, 46 },
+ { 88, 4, 88, 4, 88, 4, 88, 4 },
+ { -85, -54, -85, -54, -85, -54, -85, -54 },
+
+ /*o7*/
+ { -67, 54, -67, 54, -67, 54, -67, 54 },
+ { 78, -38, 78, -38, 78, -38, 78, -38 },
+ { -85, 22, -85, 22, -85, 22, -85, 22 },
+ { 90, -4, 90, -4, 90, -4, 90, -4 },
+ { -90, -13, -90, -13, -90, -13, -90, -13 },
+ { 88, 31, 88, 31, 88, 31, 88, 31 },
+ { -82, -46, -82, -46, -82, -46, -82, -46 },
+ { 73, 61, 73, 61, 73, 61, 73, 61 },
+
+ /*o8*/
+ { -61, 73, -61, 73, -61, 73, -61, 73 },
+ { 46, -82, 46, -82, 46, -82, 46, -82 },
+ { -31, 88, -31, 88, -31, 88, -31, 88 },
+ { 13, -90, 13, -90, 13, -90, 13, -90 },
+ { 4, 90, 4, 90, 4, 90, 4, 90 },
+ { -22, -85, -22, -85, -22, -85, -22, -85 },
+ { 38, 78, 38, 78, 38, 78, 38, 78 },
+ { -54, -67, -54, -67, -54, -67, -54, -67 },
+
+ /*o9*/
+ { -54, 85, -54, 85, -54, 85, -54, 85 },
+ { 4, -88, 4, -88, 4, -88, 4, -88 },
+ { 46, 61, 46, 61, 46, 61, 46, 61 },
+ { -82, -13, -82, -13, -82, -13, -82, -13 },
+ { 90, -38, 90, -38, 90, -38, 90, -38 },
+ { -67, 78, -67, 78, -67, 78, -67, 78 },
+ { 22, -90, 22, -90, 22, -90, 22, -90 },
+ { 31, 73, 31, 73, 31, 73, 31, 73 },
+
+ /*o10*/
+ { -46, 90, -46, 90, -46, 90, -46, 90 },
+ { -38, -54, -38, -54, -38, -54, -38, -54 },
+ { 90, -31, 90, -31, 90, -31, 90, -31 },
+ { -61, 88, -61, 88, -61, 88, -61, 88 },
+ { -22, -67, -22, -67, -22, -67, -22, -67 },
+ { 85, -13, 85, -13, 85, -13, 85, -13 },
+ { -73, 82, -73, 82, -73, 82, -73, 82 },
+ { -4, -78, -4, -78, -4, -78, -4, -78 },
+
+ /*o11*/
+ { -38, 88, -38, 88, -38, 88, -38, 88 },
+ { -73, 4, -73, 4, -73, 4, -73, 4 },
+ { 67, -90, 67, -90, 67, -90, 67, -90 },
+ { 46, 31, 46, 31, 46, 31, 46, 31 },
+ { -85, 78, -85, 78, -85, 78, -85, 78 },
+ { -13, -61, -13, -61, -13, -61, -13, -61 },
+ { 90, -54, 90, -54, 90, -54, 90, -54 },
+ { -22, 82, -22, 82, -22, 82, -22, 82 },
+
+
+    /*o12*/
+ { -31, 78, -31, 78, -31, 78, -31, 78 },
+ { -90, 61, -90, 61, -90, 61, -90, 61 },
+ { -4, -54, -4, -54, -4, -54, -4, -54 },
+ { 88, -82, 88, -82, 88, -82, 88, -82 },
+ { 38, 22, 38, 22, 38, 22, 38, 22 },
+ { -73, 90, -73, 90, -73, 90, -73, 90 },
+ { -67, 13, -67, 13, -67, 13, -67, 13 },
+ { 46, -85, 46, -85, 46, -85, 46, -85 },
+
+ /*o13*/
+ { -22, 61, -22, 61, -22, 61, -22, 61 },
+ { -85, 90, -85, 90, -85, 90, -85, 90 },
+ { -73, 38, -73, 38, -73, 38, -73, 38 },
+ { 4, -46, 4, -46, 4, -46, 4, -46 },
+ { 78, -90, 78, -90, 78, -90, 78, -90 },
+ { 82, -54, 82, -54, 82, -54, 82, -54 },
+ { 13, 31, 13, 31, 13, 31, 13, 31 },
+ { -67, 88, -67, 88, -67, 88, -67, 88 },
+
+ /*o14*/
+ { -13, 38, -13, 38, -13, 38, -13, 38 },
+ { -61, 78, -61, 78, -61, 78, -61, 78 },
+ { -88, 90, -88, 90, -88, 90, -88, 90 },
+ { -85, 73, -85, 73, -85, 73, -85, 73 },
+ { -54, 31, -54, 31, -54, 31, -54, 31 },
+ { -4, -22, -4, -22, -4, -22, -4, -22 },
+ { 46, -67, 46, -67, 46, -67, 46, -67 },
+ { 82, -90, 82, -90, 82, -90, 82, -90 },
+
+ /*o15*/
+ { -4, 13, -4, 13, -4, 13, -4, 13 },
+ { -22, 31, -22, 31, -22, 31, -22, 31 },
+ { -38, 46, -38, 46, -38, 46, -38, 46 },
+ { -54, 61, -54, 61, -54, 61, -54, 61 },
+ { -67, 73, -67, 73, -67, 73, -67, 73 },
+ { -78, 82, -78, 82, -78, 82, -78, 82 },
+ { -85, 88, -85, 88, -85, 88, -85, 88 },
+ { -90, 90, -90, 90, -90, 90, -90, 90 },
+
+};
+const WORD16 g_ai2_ihevc_trans_16_even[12][8] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, -64, 64, -64, 64, -64, 64, -64 },
+ { 89, 75, 89, 75, 89, 75, 89, 75 },
+ { 75, -18, 75, -18, 75, -18, 75, -18 },
+ { 50, 18, 50, 18, 50, 18, 50, 18 },
+ { 89, 50, 89, 50, 89, 50, 89, 50 },
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 50, -89, 50, -89, 50, -89, 50, -89 },
+ { 18, -50, 18, -50, 18, -50, 18, -50 },
+ { 18, 75, 18, 75, 18, 75, 18, 75 },
+ { 75, -89, 75, -89, 75, -89, 75, -89 },
+};
+const WORD16 g_ai2_ihevc_trans_16_odd[32][8] =
+{
+ { 90, 87, 90, 87, 90, 87, 90, 87 },
+ { 80, 70, 80, 70, 80, 70, 80, 70 },
+ { 57, 43, 57, 43, 57, 43, 57, 43 },
+ { 25, 9, 25, 9, 25, 9, 25, 9 },
+ { 87, 57, 87, 57, 87, 57, 87, 57 },
+ { 9, -43, 9, -43, 9, -43, 9, -43 },
+ { 80, 90, 80, 90, 80, 90, 80, 90 },
+ { 70, 25, 70, 25, 70, 25, 70, 25 },
+ { 80, 9, 80, 9, 80, 9, 80, 9 },
+ { 70, 87, 70, 87, 70, 87, 70, 87 },
+ { 25, -57, 25, -57, 25, -57, 25, -57 },
+ { 90, 43, 90, 43, 90, 43, 90, 43 },
+ { 70, -43, 70, -43, 70, -43, 70, -43 },
+ { 87, -9, 87, -9, 87, -9, 87, -9 },
+ { 90, 25, 90, 25, 90, 25, 90, 25 },
+ { 80, 57, 80, 57, 80, 57, 80, 57 },
+ { 57, -80, 57, -80, 57, -80, 57, -80 },
+ { 25, -90, 25, -90, 25, -90, 25, -90 },
+ { 9, 87, 9, 87, 9, 87, 9, 87 },
+ { 43, 70, 43, 70, 43, 70, 43, 70 },
+ { 43, -90, 43, -90, 43, -90, 43, -90 },
+ { 57, 25, 57, 25, 57, 25, 57, 25 },
+ { 87, -70, 87, -70, 87, -70, 87, -70 },
+ { 9, -80, 9, -80, 9, -80, 9, -80 },
+ { 25, -70, 25, -70, 25, -70, 25, -70 },
+ { 90, -80, 90, -80, 90, -80, 90, -80 },
+ { 43, 9, 43, 9, 43, 9, 43, 9 },
+ { 57, -87, 57, -87, 57, -87, 57, -87 },
+ { 9, -25, 9, -25, 9, -25, 9, -25 },
+ { 43, -57, 43, -57, 43, -57, 43, -57 },
+ { 70, -80, 70, -80, 70, -80, 70, -80 },
+ { 87, -90, 87, -90, 87, -90, 87, -90 },
+};
+const WORD16 g_ai2_ihevc_trans_intr_even_8[4][8] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 64, -64, 64, -64, 64, -64, 64, -64 }
+};
+const WORD16 g_ai2_ihevc_trans_intr_odd_8[8][8] =
+{
+ { 89, 75, 89, 75, 89, 75, 89, 75 },
+ { 50, 18, 50, 18, 50, 18, 50, 18 },
+ { 75, -18, 75, -18, 75, -18, 75, -18 },
+ { 89, 50, 89, 50, 89, 50, 89, 50 },
+ { 50, -89, 50, -89, 50, -89, 50, -89 },
+ { 18, 75, 18, 75, 18, 75, 18, 75 },
+ { 18, -50, 18, -50, 18, -50, 18, -50 },
+ { 75, -89, 75, -89, 75, -89, 75, -89 },
+};
+const WORD16 g_ai2_ihevc_trans_intr_4[4][8] =
+{
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, -64, 64, -64, 64, -64, 64, -64 }
+};
+
+const UWORD8 IHEVCE_CHROMA_SHUFFLEMASK_HBD[8] = { 0x00, 0x01, 0x04, 0x05,
+ 0x08, 0x09, 0x0C, 0x0D };
+#ifndef DISABLE_AVX2
+const WORD32 g_ai4_ihevc_trans_8_intr_avx2[7][8] =
+{ /* 8*32 = 256 bit */
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 83, 83, 83, 83, 83, 83, 83, 83 },
+ { 36, 36, 36, 36, 36, 36, 36, 36 },
+ { 75, 75, 75, 75, 75, 75, 75, 75 },
+ { 18, 18, 18, 18, 18, 18, 18, 18 },
+ { 89, 89, 89, 89, 89, 89, 89, 89 },
+ { 50, 50, 50, 50, 50, 50, 50, 50 },
+};
+const WORD16 g_ai2_ihevc_trans_8_intr_avx2[8][16] =
+{ /* 16*16 = 256 bit */
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 89, 75, 18, 50, 89, 75, 18, 50, 89, 75, 18, 50, 89, 75, 18, 50 },
+ { 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 75, -18, -50, -89, 75, -18, -50, -89, 75, -18, -50, -89, 75, -18, -50, -89 },
+ { 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 },
+ { 50, -89, 75, 18, 50, -89, 75, 18, 50, -89, 75, 18, 50, -89, 75, 18 },
+ { 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 18, -50, -89, 75, 18, -50, -89, 75, 18, -50, -89, 75, 18, -50, -89, 75 }
+};
+
+const WORD32 g_ai2_ihevc_trans_32_intr_8_avx2[8][8] =
+{ /* 8*32 = 256 bit */
+ { 90, 90, 90, 90, 90, 90, 90, 90 },
+ { 87, 87, 87, 87, 87, 87, 87, 87 },
+ { 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 70, 70, 70, 70, 70, 70, 70, 70 },
+ { 57, 57, 57, 57, 57, 57, 57, 57 },
+ { 43, 43, 43, 43, 43, 43, 43, 43 },
+ { 25, 25, 25, 25, 25, 25, 25, 25 },
+ { 9, 9, 9, 9, 9, 9, 9, 9 }
+};
+const WORD32 g_ai2_ihevc_trans_32_intr_16_avx2[15][8] =
+{ /* 8*32 = 256 bit */
+ { 90, 90, 90, 90, 90, 90, 90, 90, },
+ { 88, 88, 88, 88, 88, 88, 88, 88, },
+ { 85, 85, 85, 85, 85, 85, 85, 85, },
+ { 82, 82, 82, 82, 82, 82, 82, 82, },
+ { 78, 78, 78, 78, 78, 78, 78, 78, },
+ { 73, 73, 73, 73, 73, 73, 73, 73, },
+ { 67, 67, 67, 67, 67, 67, 67, 67, },
+ { 61, 61, 61, 61, 61, 61, 61, 61, },
+ { 54, 54, 54, 54, 54, 54, 54, 54, },
+ { 46, 46, 46, 46, 46, 46, 46, 46, },
+ { 38, 38, 38, 38, 38, 38, 38, 38, },
+ { 31, 31, 31, 31, 31, 31, 31, 31, },
+ { 22, 22, 22, 22, 22, 22, 22, 22, },
+ { 13, 13, 13, 13, 13, 13, 13, 13, },
+ { 4, 4, 4, 4, 4, 4, 4, 4, }
+};
+const WORD16 g_ai2_ihevc_trans_16_intr_odd_avx2[32][16] =
+{
+ { 90, 87, 90, 87, 90, 87, 90, 87, 90, 87, 90, 87, 90, 87, 90, 87 },
+ { 70, 80, 70, 80, 70, 80, 70, 80, 70, 80, 70, 80, 70, 80, 70, 80 },
+ { 57, 43, 57, 43, 57, 43, 57, 43, 57, 43, 57, 43, 57, 43, 57, 43 },
+ { 9, 25, 9, 25, 9, 25, 9, 25, 9, 25, 9, 25, 9, 25, 9, 25 },
+ { 87, 57, 87, 57, 87, 57, 87, 57, 87, 57, 87, 57, 87, 57, 87, 57 },
+ { -43, 9, -43, 9, -43, 9, -43, 9, -43, 9, -43, 9, -43, 9, -43, 9 },
+ { -80, -90, -80, -90, -80, -90, -80, -90, -80, -90, -80, -90, -80, -90, -80, -90 },
+ { -25, -70, -25, -70, -25, -70, -25, -70, -25, -70, -25, -70, -25, -70, -25, -70 },
+ { 80, 9, 80, 9, 80, 9, 80, 9, 80, 9, 80, 9, 80, 9, 80, 9 },
+ { -87, -70, -87, -70, -87, -70, -87, -70, -87, -70, -87, -70, -87, -70, -87, -70 },
+ { -25, 57, -25, 57, -25, 57, -25, 57, -25, 57, -25, 57, -25, 57, -25, 57 },
+ { 43, 90, 43, 90, 43, 90, 43, 90, 43, 90, 43, 90, 43, 90, 43, 90 },
+ { 70, -43, 70, -43, 70, -43, 70, -43, 70, -43, 70, -43, 70, -43, 70, -43 },
+ { 9, -87, 9, -87, 9, -87, 9, -87, 9, -87, 9, -87, 9, -87, 9, -87 },
+ { 90, 25, 90, 25, 90, 25, 90, 25, 90, 25, 90, 25, 90, 25, 90, 25 },
+ { -57, -80, -57, -80, -57, -80, -57, -80, -57, -80, -57, -80, -57, -80, -57, -80 },
+ { 57, -80, 57, -80, 57, -80, 57, -80, 57, -80, 57, -80, 57, -80, 57, -80 },
+ { 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25 },
+ { -9, -87, -9, -87, -9, -87, -9, -87, -9, -87, -9, -87, -9, -87, -9, -87 },
+ { 70, 43, 70, 43, 70, 43, 70, 43, 70, 43, 70, 43, 70, 43, 70, 43 },
+ { 43, -90, 43, -90, 43, -90, 43, -90, 43, -90, 43, -90, 43, -90, 43, -90 },
+ { 25, 57, 25, 57, 25, 57, 25, 57, 25, 57, 25, 57, 25, 57, 25, 57 },
+ { -87, 70, -87, 70, -87, 70, -87, 70, -87, 70, -87, 70, -87, 70, -87, 70 },
+ { -80, 9, -80, 9, -80, 9, -80, 9, -80, 9, -80, 9, -80, 9, -80, 9 },
+ { 25, -70, 25, -70, 25, -70, 25, -70, 25, -70, 25, -70, 25, -70, 25, -70 },
+ { -80, 90, -80, 90, -80, 90, -80, 90, -80, 90, -80, 90, -80, 90, -80, 90 },
+ { 43, 9, 43, 9, 43, 9, 43, 9, 43, 9, 43, 9, 43, 9, 43, 9 },
+ { 87, -57, 87, -57, 87, -57, 87, -57, 87, -57, 87, -57, 87, -57, 87, -57 },
+ { 9, -25, 9, -25, 9, -25, 9, -25, 9, -25, 9, -25, 9, -25, 9, -25 },
+ { -57, 43, -57, 43, -57, 43, -57, 43, -57, 43, -57, 43, -57, 43, -57, 43 },
+ { 70, -80, 70, -80, 70, -80, 70, -80, 70, -80, 70, -80, 70, -80, 70, -80 },
+ { -90, 87, -90, 87, -90, 87, -90, 87, -90, 87, -90, 87, -90, 87, -90, 87 }
+};
+
+const WORD16 g_ai2_ihevc_trans_16_intr_even_avx2[12][16] =
+{
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75 },
+ { 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50 },
+ { 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18 },
+ { -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89 },
+ { 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 },
+ { 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89 },
+ { 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18 },
+ { 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83 },
+ { 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50 },
+ { -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75 }
+};
+
+
+#endif
diff --git a/common/ihevc_trans_tables.h b/common/ihevc_trans_tables.h
new file mode 100644
index 0000000..7295967
--- /dev/null
+++ b/common/ihevc_trans_tables.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_trans_tables.h
+*
+* @brief
+* Tables for forward and inverse transform
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_TRANS_TABLES_H_
+#define _IHEVC_TRANS_TABLES_H_
+
+
+#include "ihevc_defs.h"
+
+extern const WORD32 g_ihevc_iquant_scales[6];
+
+extern const WORD16 g_ihevc_iquant_intr_scales[6][8];
+
+extern const WORD32 g_ihevc_quant_scales[6];
+
+extern const WORD16 g_ai2_ihevc_trans_dst_4[4][4];
+
+extern const WORD16 g_ai2_ihevc_trans_4[4][4];
+
+extern const WORD16 g_ai2_ihevc_trans_4_transpose[4][4];
+
+extern const WORD16 g_ai2_ihevc_trans_8[8][8];
+
+extern const WORD16 g_ai2_ihevc_trans_16[16][16];
+extern const WORD16 g_ai2_ihevc_trans_16_transpose[1][16];
+extern const WORD16 g_ai2_ihevc_trans_32_transpose[1][32];
+extern const WORD16 g_ai2_ihevc_trans_32[32][32];
+
+
+extern const WORD32 g_ai4_ihevc_trans_dst_intr_4[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_4_intr[3][4];
+extern const WORD16 g_ai2_ihevc_trans_4_intr[8];
+
+extern const WORD32 g_ai4_ihevc_trans_8_intr[7][4];
+extern const WORD16 g_ai2_ihevc_trans_8_intr[8][8];
+
+
+extern const WORD32 g_ai4_ihevc_trans_4_ttype1[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_4_ttype0[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_intr_even_8[3][4];
+
+extern const WORD32 g_ai4_ihevc_trans_intr_odd_8[4][4];
+
+extern const WORD32 g_ai4_ihevc_trans_16_even[7][4];
+
+extern const WORD32 g_ai4_ihevc_trans_16_odd[8][4];
+
+extern const WORD32 g_ai2_ihevc_trans_32_intr_8[8][4];
+extern const WORD32 g_ai2_ihevc_trans_32_intr_16[15][4];
+
+extern const WORD16 g_ai2_ihevc_trans_16_intr_even[12][8];
+
+extern const WORD16 g_ai2_ihevc_trans_16_intr_odd[32][8];
+
+
+extern const WORD16 g_ai2_ihevc_trans_32_intr_odd[32][16];
+
+extern const WORD16 g_ai2_ihevc_trans_32_intr_even[22][8];
+
+#ifndef DISABLE_AVX2
+extern const WORD16 g_ai2_ihevc_trans_8_intr_avx2[8][16];
+extern const WORD32 g_ai4_ihevc_trans_8_intr_avx2[7][8];
+extern const WORD16 g_ai2_ihevc_trans_16_intr_odd_avx2[32][16];
+extern const WORD16 g_ai2_ihevc_trans_16_intr_even_avx2[12][16];
+extern const WORD32 g_ai2_ihevc_trans_32_intr_8_avx2[8][8];
+extern const WORD32 g_ai2_ihevc_trans_32_intr_16_avx2[15][8];
+#endif
+
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_16_even_packed[12][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_32_intr_packed[32][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_32_intr_odd_packed[128][8];
+
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_16_even[12][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_16_odd[32][8];
+
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_intr_even_8[4][8];
+extern MEM_ALIGN16 const WORD16 g_ai2_ihevc_trans_intr_odd_8[8][8];
+
+extern const WORD16 g_ai2_ihevc_trans_intr_4[4][8];
+
+extern const UWORD8 IHEVCE_CHROMA_SHUFFLEMASK_HBD[8];
+
+#endif /*_IHEVC_TRANS_TABLES_H_*/
diff --git a/common/ihevc_typedefs.h b/common/ihevc_typedefs.h
new file mode 100644
index 0000000..47a7a2f
--- /dev/null
+++ b/common/ihevc_typedefs.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_typedefs.h
+*
+* @brief
+* Type definitions used in the code
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVC_TYPEDEFS_H_
+#define _IHEVC_TYPEDEFS_H_
+
+
+typedef unsigned char UWORD8;
+typedef unsigned short UWORD16;
+typedef unsigned int UWORD32;
+
+typedef signed char WORD8;
+typedef signed short WORD16;
+typedef signed int WORD32;
+
+typedef char CHAR;
+
+typedef double DOUBLE;
+
+
+
+
+#ifndef MSVC
+
+typedef unsigned long long ULWORD64;
+typedef signed long long LWORD64;
+
+#else
+typedef unsigned __int64 ULWORD64;
+typedef __int64 LWORD64;
+
+
+#endif
+#endif /* _IHEVC_TYPEDEFS_H_ */
diff --git a/common/ihevc_weighted_pred.c b/common/ihevc_weighted_pred.c
new file mode 100644
index 0000000..a806293
--- /dev/null
+++ b/common/ihevc_weighted_pred.c
@@ -0,0 +1,604 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_weighted_pred.c
+*
+* @brief
+* Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_weighted_pred_uni()
+* - ihevc_weighted_pred_bi()
+* - ihevc_weighted_pred_bi_default()
+* - ihevc_weighted_pred_chroma_uni()
+* - ihevc_weighted_pred_chroma_bi()
+* - ihevc_weighted_pred_chroma_bi_default()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+
+#include "ihevc_inter_pred.h"
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does uni-weighted prediction on the array pointed to by pi2_src and stores
+* the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_uni(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
+ i4_tmp += 1 << (shift - 1);
+ i4_tmp = (i4_tmp >> shift) + off0;
+
+ pu1_dst[col] = CLIP_U8(i4_tmp);
+ }
+
+ pi2_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+}
+//WEIGHTED_PRED_UNI
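+
+/* Illustrative usage sketch (arbitrary values, kept out of the build): for an
+   8-bit 4x4 block with weight 2 and log2_weight_denominator 1, shift is
+   (14 - 8) + 1 = 7, so each sample becomes (((src * 2) + 64) >> 7) + 16,
+   clipped to [0, 255]. */
+#if 0
+static void example_weighted_pred_uni(WORD16 *pi2_pred, UWORD8 *pu1_out)
+{
+    ihevc_weighted_pred_uni(pi2_pred, pu1_out,
+                            4, 4,  /* src_strd, dst_strd: packed 4x4 block */
+                            2, 16, /* wgt0, off0 */
+                            7, 0,  /* shift, lvl_shift */
+                            4, 4); /* ht, wd */
+}
+#endif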
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_uni(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
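+        /* Cb and Cr samples are interleaved; wd counts samples per colour component */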
+ for(col = 0; col < 2 * wd; col += 2)
+ {
+ i4_tmp = (pi2_src[col] + lvl_shift) * wgt0_cb;
+ i4_tmp += 1 << (shift - 1);
+ i4_tmp = (i4_tmp >> shift) + off0_cb;
+
+ pu1_dst[col] = CLIP_U8(i4_tmp);
+
+ i4_tmp = (pi2_src[col + 1] + lvl_shift) * wgt0_cr;
+ i4_tmp += 1 << (shift - 1);
+ i4_tmp = (i4_tmp >> shift) + off0_cr;
+
+ pu1_dst[col + 1] = CLIP_U8(i4_tmp);
+ }
+
+ pi2_src += src_strd;
+ pu1_dst += dst_strd;
+ }
+}
+//WEIGHTED_PRED_CHROMA_UNI
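+
+/* Illustrative sketch (arbitrary values): the chroma buffers are semi-planar
+   (CbCr interleaved), so a block that is 4 chroma samples wide per component
+   occupies 8 entries per row and the strides are given in interleaved
+   samples. */
+#if 0
+static void example_weighted_pred_chroma_uni(WORD16 *pi2_pred, UWORD8 *pu1_out)
+{
+    ihevc_weighted_pred_chroma_uni(pi2_pred, pu1_out,
+                                   8, 8,   /* src_strd, dst_strd (2 * wd) */
+                                   2, 2,   /* wgt0_cb, wgt0_cr */
+                                   16, 16, /* off0_cb, off0_cr */
+                                   7, 0,   /* shift, lvl_shift */
+                                   4, 4);  /* ht, wd (per component) */
+}
+#endif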
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
+* off1 + 1) << (shift - 1) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied to source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 wgt1,
+ WORD32 off1,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0;
+ i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
+ i4_tmp += (off0 + off1 + 1) << (shift - 1);
+
+ pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+ }
+
+ pi2_src1 += src_strd1;
+ pi2_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+//WEIGHTED_PRED_BI
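+
+/* Illustrative sketch (arbitrary values): both offsets are folded into a
+   single rounding term, (off0 + off1 + 1) << (shift - 1), so with shift = 7
+   and zero lvl_shifts each sample becomes
+   ((src1 * wgt0 + src2 * wgt1 + (off0 + off1 + 1) * 64) >> 7),
+   clipped to [0, 255]. */
+#if 0
+static void example_weighted_pred_bi(WORD16 *pi2_l0, WORD16 *pi2_l1,
+                                     UWORD8 *pu1_out)
+{
+    ihevc_weighted_pred_bi(pi2_l0, pi2_l1, pu1_out,
+                           4, 4, 4,    /* src_strd1, src_strd2, dst_strd */
+                           1, 0, 1, 0, /* wgt0, off0, wgt1, off1 */
+                           7, 0, 0,    /* shift, lvl_shift1, lvl_shift2 */
+                           4, 4);      /* ht, wd */
+}
+#endif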
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
+* off1 + 1) << (shift - 1) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied to source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - Bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 wgt1_cb,
+ WORD32 wgt1_cr,
+ WORD32 off1_cb,
+ WORD32 off1_cr,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD32 i4_tmp;
+
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col += 2)
+ {
+ i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0_cb;
+ i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1_cb;
+ i4_tmp += (off0_cb + off1_cb + 1) << (shift - 1);
+
+ pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+
+ i4_tmp = (pi2_src1[col + 1] + lvl_shift1) * wgt0_cr;
+ i4_tmp += (pi2_src2[col + 1] + lvl_shift2) * wgt1_cr;
+ i4_tmp += (off0_cr + off1_cr + 1) << (shift - 1);
+
+ pu1_dst[col + 1] = CLIP_U8(i4_tmp >> shift);
+ }
+
+ pi2_src1 += src_strd1;
+ pi2_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+//WEIGHTED_PRED_CHROMA_BI
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does default bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
+* >> shift where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_default(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD32 i4_tmp;
+ WORD32 shift;
+
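+    /* shift = 15 - BitDepth: one bit more than 14 - BitDepth, so the two
+       14-bit predictions are averaged with rounding */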
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < wd; col++)
+ {
+ i4_tmp = pi2_src1[col] + lvl_shift1;
+ i4_tmp += pi2_src2[col] + lvl_shift2;
+ i4_tmp += 1 << (shift - 1);
+
+ pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+ }
+
+ pi2_src1 += src_strd1;
+ pi2_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+//WEIGHTED_PRED_BI_DEFAULT
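+
+/* Illustrative sketch (arbitrary values): with shift = 15 - BitDepth
+   (7 at 8 bits) and zero lvl_shifts, the default mode reduces to a rounded
+   average of the two predictions: dst = (src1 + src2 + 64) >> 7. */
+#if 0
+static void example_weighted_pred_bi_default(WORD16 *pi2_l0, WORD16 *pi2_l1,
+                                             UWORD8 *pu1_out)
+{
+    ihevc_weighted_pred_bi_default(pi2_l0, pi2_l1, pu1_out,
+                                   4, 4, 4, /* src_strd1, src_strd2, dst_strd */
+                                   0, 0,    /* lvl_shift1, lvl_shift2 */
+                                   4, 4);   /* ht, wd */
+}
+#endif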
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma default bi-weighted prediction on the arrays pointed to by
+* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
+* pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
+* >> shift where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_default(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD32 i4_tmp;
+ WORD32 shift;
+
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ for(row = 0; row < ht; row++)
+ {
+ for(col = 0; col < 2 * wd; col++)
+ {
+ i4_tmp = pi2_src1[col] + lvl_shift1;
+ i4_tmp += pi2_src2[col] + lvl_shift2;
+ i4_tmp += 1 << (shift - 1);
+
+ pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
+ }
+
+ pi2_src1 += src_strd1;
+ pi2_src2 += src_strd2;
+ pu1_dst += dst_strd;
+ }
+}
+//WEIGHTED_PRED_CHROMA_BI_DEFAULT
diff --git a/common/ihevc_weighted_pred.h b/common/ihevc_weighted_pred.h
new file mode 100644
index 0000000..aaf9797
--- /dev/null
+++ b/common/ihevc_weighted_pred.h
@@ -0,0 +1,178 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_weighted_pred.h
+*
+* @brief
+* Function declarations and function-pointer typedefs for weighted prediction
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVC_WEIGHTED_PRED_H_
+#define IHEVC_WEIGHTED_PRED_H_
+
+typedef void ihevc_weighted_pred_uni_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_weighted_pred_chroma_uni_ft(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_weighted_pred_bi_ft(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 wgt1,
+ WORD32 off1,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_weighted_pred_chroma_bi_ft(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 wgt1_cb,
+ WORD32 wgt1_cr,
+ WORD32 off1_cb,
+ WORD32 off1_cr,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_weighted_pred_bi_default_ft(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd);
+
+typedef void ihevc_weighted_pred_chroma_bi_default_ft(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd);
+/* C function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default;
+
+/* A9 Q function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_a9q;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_a9q;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_a9q;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_a9q;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_a9q;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_a9q;
+
+/* A9 A function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_a9a;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_a9a;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_a9a;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_a9a;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_a9a;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_a9a;
+
+/* NEONINTR function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_neonintr;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_neonintr;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_neonintr;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_neonintr;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_neonintr;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_neonintr;
+
+/* SSSE3 function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_ssse3;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_ssse3;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_ssse3;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_ssse3;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_ssse3;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_ssse3;
+
+/* SSE42 function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_sse42;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_sse42;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_sse42;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_sse42;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_sse42;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_sse42;
+
+/* AVX2 function declarations */
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_avx2;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_avx2;
+
+/* armv8 function declarations */
+ihevc_weighted_pred_uni_ft ihevc_weighted_pred_uni_av8;
+ihevc_weighted_pred_chroma_uni_ft ihevc_weighted_pred_chroma_uni_av8;
+ihevc_weighted_pred_bi_ft ihevc_weighted_pred_bi_av8;
+ihevc_weighted_pred_chroma_bi_ft ihevc_weighted_pred_chroma_bi_av8;
+ihevc_weighted_pred_bi_default_ft ihevc_weighted_pred_bi_default_av8;
+ihevc_weighted_pred_chroma_bi_default_ft ihevc_weighted_pred_chroma_bi_default_av8;
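+
+/* The *_ft typedefs above describe one signature per operation so the decoder
+   can bind an architecture-specific variant through a function pointer at
+   init time. A hypothetical binding (the example_* names are illustrative,
+   not part of this interface; the real selector lives elsewhere) might look
+   like: */
+#if 0
+typedef struct
+{
+    ihevc_weighted_pred_uni_ft *pf_weighted_pred_uni;
+    ihevc_weighted_pred_bi_ft *pf_weighted_pred_bi;
+} example_wp_func_map_t;
+
+static void example_bind_wp_funcs(example_wp_func_map_t *ps_map, WORD32 has_neon)
+{
+    if(has_neon)
+    {
+        ps_map->pf_weighted_pred_uni = &ihevc_weighted_pred_uni_a9q;
+        ps_map->pf_weighted_pred_bi = &ihevc_weighted_pred_bi_a9q;
+    }
+    else
+    {
+        ps_map->pf_weighted_pred_uni = &ihevc_weighted_pred_uni;
+        ps_map->pf_weighted_pred_bi = &ihevc_weighted_pred_bi;
+    }
+}
+#endif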
+
+#endif /* IHEVC_WEIGHTED_PRED_H_ */
diff --git a/common/ithread.c b/common/ithread.c
new file mode 100644
index 0000000..232ecfa
--- /dev/null
+++ b/common/ithread.c
@@ -0,0 +1,454 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/* */
+/* File Name : ithread.c */
+/* */
+/* Description : Contains abstraction for threads, mutexes and semaphores */
+/* */
+/* List of Functions : */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 Harish Initial Version */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ithread.h"
+#include <sys/types.h>
+
+#ifndef X86_MSVC
+//#define PTHREAD_AFFINITY
+//#define SYSCALL_AFFINITY
+
+#ifdef PTHREAD_AFFINITY
+#define _GNU_SOURCE
+#define __USE_GNU
+#endif
+
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <unistd.h>
+
+
+#endif
+
+
+#if 0
+#include <sys/syscall.h>
+#endif
+
+
+#ifdef X86_MSVC
+
+#include <windows.h>
+#define SEM_MAX_COUNT 100
+#define SEM_INCREMENT_COUNT 1
+
+UWORD32 ithread_get_handle_size(void)
+{
+ return (sizeof(HANDLE));
+}
+
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+ return (sizeof(HANDLE));
+}
+
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+ HANDLE *ppv_thread_handle;
+ HANDLE thread_handle_value;
+
+ if(0 == thread_handle)
+ return -1;
+
+ ppv_thread_handle = (HANDLE *)thread_handle;
+ thread_handle_value = (void *)CreateThread
+ (NULL, /* Attributes */
+ 1024 * 128, /* Stack size */
+ (LPTHREAD_START_ROUTINE)strt, /* Thread function */
+ argument, /* Parameters */
+ 0, /* Creation flags */
+ NULL); /* Thread ID */
+ *ppv_thread_handle = (HANDLE)thread_handle_value;
+
+ return 0;
+}
+
+WORD32 ithread_join(void *thread_handle, void **val_ptr)
+{
+ HANDLE *ppv_thread_handle;
+ HANDLE thread_handle_value;
+
+ if(0 == thread_handle)
+ return -1;
+
+ ppv_thread_handle = (HANDLE *)thread_handle;
+ thread_handle_value = *ppv_thread_handle;
+
+ if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE))
+ {
+ CloseHandle(thread_handle_value);
+ }
+
+ return 0;
+}
+
+void ithread_exit(void *thread_handle)
+{
+ HANDLE *ppv_thread_handle;
+ HANDLE thread_handle_value;
+ DWORD thread_exit_code;
+
+ if(0 == thread_handle)
+ return;
+
+ ppv_thread_handle = (HANDLE *)thread_handle;
+ thread_handle_value = *ppv_thread_handle;
+    /* Query the thread's exit code; if the query succeeds, force-terminate the thread with that code */
+ if(0 != GetExitCodeThread(thread_handle_value, &thread_exit_code))
+ {
+ TerminateThread(thread_handle_value, thread_exit_code);
+ }
+
+ return;
+}
+
+WORD32 ithread_get_mutex_struct_size(void)
+{
+ return (sizeof(HANDLE));
+}
+
+WORD32 ithread_mutex_init(void *mutex)
+{
+ HANDLE *ppv_mutex_handle;
+ HANDLE mutex_handle_value;
+
+ if(0 == mutex)
+ return -1;
+
+ ppv_mutex_handle = (HANDLE *)mutex;
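+    /* A binary semaphore (initial count 1, max count 1) stands in for the mutex */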
+ mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL);
+ *ppv_mutex_handle = mutex_handle_value;
+ return 0;
+}
+
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+ HANDLE *ppv_mutex_handle;
+ HANDLE mutex_handle_value;
+
+ if(0 == mutex)
+ return -1;
+
+ ppv_mutex_handle = (HANDLE *)mutex;
+ mutex_handle_value = *ppv_mutex_handle;
+ CloseHandle(mutex_handle_value);
+ return 0;
+}
+
+WORD32 ithread_mutex_lock(void *mutex)
+{
+ HANDLE *ppv_mutex_handle;
+ HANDLE mutex_handle_value;
+ DWORD result = 0;
+
+ if(0 == mutex)
+ return -1;
+
+ ppv_mutex_handle = (HANDLE *)mutex;
+ mutex_handle_value = *ppv_mutex_handle;
+ result = WaitForSingleObject(mutex_handle_value, INFINITE);
+
+ if(WAIT_OBJECT_0 == result)
+ return 0;
+
+ return 1;
+
+}
+
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+ HANDLE *ppv_mutex_handle;
+ HANDLE mutex_handle_value;
+ DWORD result = 0;
+
+ if(0 == mutex)
+ return -1;
+
+ ppv_mutex_handle = (HANDLE *)mutex;
+ mutex_handle_value = *ppv_mutex_handle;
+ result = ReleaseSemaphore(mutex_handle_value, 1, NULL);
+
+ if(0 == result)
+ return -1;
+
+ return 0;
+}
+
+void ithread_yield(void) { }
+
+void ithread_usleep(UWORD32 u4_time_us)
+{
+ UWORD32 u4_time_ms = u4_time_us / 1000;
+ Sleep(u4_time_ms);
+}
+
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+ Sleep(u4_time_ms);
+}
+
+
+void ithread_sleep(UWORD32 u4_time)
+{
+ UWORD32 u4_time_ms = u4_time * 1000;
+ Sleep(u4_time_ms);
+}
+
+UWORD32 ithread_get_sem_struct_size(void)
+{
+ return (sizeof(HANDLE));
+}
+
+WORD32 ithread_sem_init(void *sem, WORD32 pshared, UWORD32 value)
+{
+ HANDLE *sem_handle = (HANDLE *)sem;
+ HANDLE sem_handle_value;
+
+ if(0 == sem)
+ return -1;
+
+ sem_handle_value = CreateSemaphore(NULL, /* Security Attribute*/
+ value, /* Initial count */
+ SEM_MAX_COUNT, /* Max value */
+ NULL); /* Name, not used */
+ *sem_handle = sem_handle_value;
+ return 0;
+}
+
+WORD32 ithread_sem_post(void *sem)
+{
+ HANDLE *sem_handle = (HANDLE *)sem;
+ HANDLE sem_handle_value;
+
+ if(0 == sem)
+ return -1;
+
+ sem_handle_value = *sem_handle;
+
+    /* Post on the semaphore by incrementing its count */
+ if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL))
+ return 0;
+
+ return -1;
+}
+
+WORD32 ithread_sem_wait(void *sem)
+{
+ DWORD result = 0;
+ HANDLE *sem_handle = (HANDLE *)sem;
+ HANDLE sem_handle_value;
+
+ if(0 == sem)
+ return -1;
+
+ sem_handle_value = *sem_handle;
+
+    /* Wait on the semaphore object indefinitely */
+ result = WaitForSingleObject(sem_handle_value, INFINITE);
+
+ /* If lock on semaphore is acquired, return SUCCESS */
+ if(WAIT_OBJECT_0 == result)
+ return 0;
+
+    /* If the wait timed out, return FAILURE */
+ if(WAIT_TIMEOUT == result)
+ return -1;
+
+ return 0;
+}
+
+WORD32 ithread_sem_destroy(void *sem)
+{
+ HANDLE *sem_handle = (HANDLE *)sem;
+ HANDLE sem_handle_value;
+
+ if(0 == sem)
+ return -1;
+
+ sem_handle_value = *sem_handle;
+
+ if(FALSE == CloseHandle(sem_handle_value))
+ {
+ return -1;
+ }
+ return 0;
+}
+
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+ return 1;
+}
+
+#else
+UWORD32 ithread_get_handle_size(void)
+{
+ return sizeof(pthread_t);
+}
+
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+ return sizeof(pthread_mutex_t);
+}
+
+
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+ return pthread_create((pthread_t *)thread_handle, attribute, (void * (*)(void *))strt, argument);
+}
+
+WORD32 ithread_join(void *thread_handle, void **val_ptr)
+{
+ pthread_t *pthread_handle = (pthread_t *)thread_handle;
+ return pthread_join(*pthread_handle, val_ptr);
+}
+
+void ithread_exit(void *val_ptr)
+{
+    pthread_exit(val_ptr);
+}
+
+WORD32 ithread_get_mutex_struct_size(void)
+{
+ return (sizeof(pthread_mutex_t));
+}
+WORD32 ithread_mutex_init(void *mutex)
+{
+ return pthread_mutex_init((pthread_mutex_t *)mutex, NULL);
+}
+
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+ return pthread_mutex_destroy((pthread_mutex_t *)mutex);
+}
+
+WORD32 ithread_mutex_lock(void *mutex)
+{
+ return pthread_mutex_lock((pthread_mutex_t *)mutex);
+}
+
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+ return pthread_mutex_unlock((pthread_mutex_t *)mutex);
+}
+
+void ithread_yield(void)
+{
+ sched_yield();
+}
+
+void ithread_sleep(UWORD32 u4_time)
+{
+ usleep(u4_time * 1000 * 1000);
+}
+
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+ usleep(u4_time_ms * 1000);
+}
+
+void ithread_usleep(UWORD32 u4_time_us)
+{
+ usleep(u4_time_us);
+}
+
+UWORD32 ithread_get_sem_struct_size(void)
+{
+ return (sizeof(sem_t));
+}
+
+
+WORD32 ithread_sem_init(void *sem, WORD32 pshared, UWORD32 value)
+{
+ return sem_init((sem_t *)sem, pshared, value);
+}
+
+WORD32 ithread_sem_post(void *sem)
+{
+ return sem_post((sem_t *)sem);
+}
+
+
+WORD32 ithread_sem_wait(void *sem)
+{
+ return sem_wait((sem_t *)sem);
+}
+
+
+WORD32 ithread_sem_destroy(void *sem)
+{
+ return sem_destroy((sem_t *)sem);
+}
+
+
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+
+#ifdef PTHREAD_AFFINITY
+ cpu_set_t cpuset;
+ int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+ pthread_t cur_thread = pthread_self();
+
+ if(core_id >= num_cores)
+ return -1;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(core_id, &cpuset);
+
+ return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset);
+
+#elif defined(SYSCALL_AFFINITY)
+    WORD32 i4_sys_res;
+    /* Affinity mask with only the requested core's bit set */
+    WORD32 i4_mask = 1 << core_id;
+
+    pid_t pid = gettid();
+
+    i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask);
+ if(i4_sys_res)
+ {
+ //WORD32 err;
+ //err = errno;
+ //perror("Error in setaffinity syscall PERROR : ");
+ //LOG_ERROR("Error in the syscall setaffinity: mask=0x%x err=0x%x", i4_mask, i4_sys_res);
+ return -1;
+ }
+#endif
+
+ return core_id;
+
+}
+#endif
diff --git a/common/ithread.h b/common/ithread.h
new file mode 100644
index 0000000..f435e78
--- /dev/null
+++ b/common/ithread.h
@@ -0,0 +1,78 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ithread.h
+*
+* @brief
+* This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface(API) of the
+* Thread Abstraction Layer
+*
+* @author
+* Harish
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef __ITHREAD_H__
+#define __ITHREAD_H__
+
+UWORD32 ithread_get_handle_size(void);
+
+UWORD32 ithread_get_mutex_lock_size(void);
+
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument);
+
+void ithread_exit(void *val_ptr);
+
+WORD32 ithread_join(void *thread_id, void **val_ptr);
+
+WORD32 ithread_get_mutex_struct_size(void);
+
+WORD32 ithread_mutex_init(void *mutex);
+
+WORD32 ithread_mutex_destroy(void *mutex);
+
+WORD32 ithread_mutex_lock(void *mutex);
+
+WORD32 ithread_mutex_unlock(void *mutex);
+
+void ithread_yield(void);
+
+void ithread_sleep(UWORD32 u4_time);
+
+void ithread_msleep(UWORD32 u4_time_ms);
+
+void ithread_usleep(UWORD32 u4_time_us);
+
+UWORD32 ithread_get_sem_struct_size(void);
+
+WORD32 ithread_sem_init(void *sem, WORD32 pshared, UWORD32 value);
+
+WORD32 ithread_sem_post(void *sem);
+
+WORD32 ithread_sem_wait(void *sem);
+
+WORD32 ithread_sem_destroy(void *sem);
+
+WORD32 ithread_set_affinity(WORD32 core_id);
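+
+/* Illustrative usage sketch: the handles are opaque, so callers query the
+   required sizes and supply their own storage. Error handling and the worker
+   body are elided; the example_* names are illustrative. */
+#if 0
+#include <stdlib.h>
+static void example_run_worker(void *(*worker)(void *), void *arg)
+{
+    void *pv_thread = malloc(ithread_get_handle_size());
+
+    ithread_create(pv_thread, NULL, (void *)worker, arg);
+    ithread_join(pv_thread, NULL);
+
+    free(pv_thread);
+}
+#endif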
+#endif /* __ITHREAD_H__ */
diff --git a/common/iv.h b/common/iv.h
new file mode 100644
index 0000000..a60cf47
--- /dev/null
+++ b/common/iv.h
@@ -0,0 +1,418 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* iv.h
+*
+* @brief
+* This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface (API) of the
+* Ittiam Video and Image codecs
+*
+* @author
+* 100239(RCY)
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IV_H
+#define _IV_H
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Typedefs */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+
+
+/* IV_API_CALL_STATUS_T: This is only to return the FAIL/PASS status to the */
+/* application for the current API call */
+
+typedef enum {
+ IV_STATUS_NA = 0x7FFFFFFF,
+ IV_SUCCESS = 0x0,
+ IV_FAIL = 0x1,
+}IV_API_CALL_STATUS_T;
+
+/* IV_MEM_TYPE_T: This enumeration defines the type of memory */
+/* (internal/external) along with the cacheable/non-cacheable attributes */
+
+typedef enum {
+ IV_NA_MEM_TYPE = 0x7FFFFFFF,
+ IV_INTERNAL_CACHEABLE_PERSISTENT_MEM = 0x1,
+ IV_INTERNAL_CACHEABLE_SCRATCH_MEM = 0x2,
+ IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM = 0x3,
+ IV_EXTERNAL_CACHEABLE_SCRATCH_MEM = 0x4,
+ IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x5,
+ IV_INTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x6,
+ IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x7,
+ IV_EXTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x8
+}IV_MEM_TYPE_T;
+
+/* IV_COLOR_FORMAT_T: This enumeration lists all the color formats which */
+/* find usage in video/image codecs */
+
+typedef enum {
+ IV_CHROMA_NA = 0x7FFFFFFF,
+ IV_YUV_420P = 0x1,
+ IV_YUV_422P = 0x2,
+ IV_420_UV_INTL = 0x3,
+ IV_YUV_422IBE = 0x4,
+ IV_YUV_422ILE = 0x5,
+ IV_YUV_444P = 0x6,
+ IV_YUV_411P = 0x7,
+ IV_GRAY = 0x8,
+ IV_RGB_565 = 0x9,
+ IV_RGB_24 = 0xa,
+ IV_YUV_420SP_UV = 0xb,
+ IV_YUV_420SP_VU = 0xc,
+ IV_RGBA_8888 = 0xd
+}IV_COLOR_FORMAT_T;
+
+/* IV_PICTURE_CODING_TYPE_T: VOP/Frame coding type Enumeration */
+
+typedef enum {
+ IV_NA_FRAME = 0x7FFFFFFF,
+ IV_I_FRAME = 0x0,
+ IV_P_FRAME = 0x1,
+ IV_B_FRAME = 0x2,
+ IV_IDR_FRAME = 0x3,
+ IV_II_FRAME = 0x4,
+ IV_IP_FRAME = 0x5,
+ IV_IB_FRAME = 0x6,
+ IV_PI_FRAME = 0x7,
+ IV_PP_FRAME = 0x8,
+ IV_PB_FRAME = 0x9,
+ IV_BI_FRAME = 0xa,
+ IV_BP_FRAME = 0xb,
+ IV_BB_FRAME = 0xc,
+ IV_MBAFF_I_FRAME = 0xd,
+ IV_MBAFF_P_FRAME = 0xe,
+ IV_MBAFF_B_FRAME = 0xf,
+ IV_MBAFF_IDR_FRAME = 0x10,
+ IV_NOT_CODED_FRAME = 0x11,
+ IV_FRAMETYPE_DEFAULT = IV_I_FRAME
+}IV_PICTURE_CODING_TYPE_T;
+
+/* IV_FLD_TYPE_T: field type Enumeration */
+
+typedef enum {
+ IV_NA_FLD = 0x7FFFFFFF,
+ IV_TOP_FLD = 0x0,
+ IV_BOT_FLD = 0x1,
+ IV_FLD_TYPE_DEFAULT = IV_TOP_FLD
+}IV_FLD_TYPE_T;
+
+/* IV_CONTENT_TYPE_T: Video content type */
+
+typedef enum {
+ IV_CONTENTTYPE_NA = 0x7FFFFFFF,
+ IV_PROGRESSIVE = 0x0,
+ IV_INTERLACED = 0x1,
+ IV_PROGRESSIVE_FRAME = 0x2,
+ IV_INTERLACED_FRAME = 0x3,
+ IV_INTERLACED_TOPFIELD = 0x4,
+ IV_INTERLACED_BOTTOMFIELD = 0x5,
+ IV_CONTENTTYPE_DEFAULT = IV_PROGRESSIVE,
+}IV_CONTENT_TYPE_T;
+
+/* IV_API_COMMAND_TYPE_T:API command type */
+typedef enum {
+ IV_CMD_NA = 0x7FFFFFFF,
+ IV_CMD_GET_NUM_MEM_REC = 0x0,
+ IV_CMD_FILL_NUM_MEM_REC = 0x1,
+ IV_CMD_RETRIEVE_MEMREC = 0x2,
+ IV_CMD_INIT = 0x3,
+ IV_CMD_DUMMY_ELEMENT = 0x4,
+}IV_API_COMMAND_TYPE_T;
+
+/*****************************************************************************/
+/* Structure */
+/*****************************************************************************/
+
+/* IV_OBJ_T: This structure defines the handle for the codec instance */
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Pointer to the API function pointer table of the codec
+ */
+ void *pv_fxns;
+
+ /**
+ * Pointer to the handle of the codec
+ */
+ void *pv_codec_handle;
+}iv_obj_t;
+
+/* iv_mem_rec_t: This structure defines the memory record holder which will */
+/* be used by the codec to communicate its memory requirements to the */
+/* application through appropriate API functions */
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Pointer to the memory allocated by the application
+ */
+ void *pv_base;
+
+ /**
+ * u4_size of the memory to be allocated
+ */
+ UWORD32 u4_mem_size;
+
+ /**
+ * Alignment of the memory pointer
+ */
+ UWORD32 u4_mem_alignment;
+ /**
+ * Nature of the memory to be allocated
+ */
+ IV_MEM_TYPE_T e_mem_type;
+}iv_mem_rec_t;
+
+/* IV_YUV_BUF_T: This structure defines attributes for the yuv buffer */
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Pointer to Luma (Y) Buffer
+ */
+
+ void *pv_y_buf;
+ /**
+ * Pointer to Chroma (Cb) Buffer
+ */
+ void *pv_u_buf;
+
+ /**
+ * Pointer to Chroma (Cr) Buffer
+ */
+ void *pv_v_buf;
+
+ /**
+ * Width of the Luma (Y) Buffer
+ */
+ UWORD32 u4_y_wd;
+
+ /**
+ * Height of the Luma (Y) Buffer
+ */
+ UWORD32 u4_y_ht;
+
+ /**
+ * Stride/Pitch of the Luma (Y) Buffer
+ */
+ UWORD32 u4_y_strd;
+
+ /**
+ * Width of the Chroma (Cb) Buffer
+ */
+ UWORD32 u4_u_wd;
+
+ /**
+ * Height of the Chroma (Cb) Buffer
+ */
+ UWORD32 u4_u_ht;
+
+ /**
+ * Stride/Pitch of the Chroma (Cb) Buffer
+ */
+ UWORD32 u4_u_strd;
+
+ /**
+ * Width of the Chroma (Cr) Buffer
+ */
+ UWORD32 u4_v_wd;
+
+ /**
+ * Height of the Chroma (Cr) Buffer
+ */
+ UWORD32 u4_v_ht;
+
+ /**
+ * Stride/Pitch of the Chroma (Cr) Buffer
+ */
+ UWORD32 u4_v_strd;
+}iv_yuv_buf_t;
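+
+/* Worked example of the stride convention above: for 8-bit data, the luma
+   sample at column x, row y of the picture is addressed as
+       ((UWORD8 *)pv_y_buf)[y * u4_y_strd + x]
+   where u4_y_strd may exceed u4_y_wd when rows carry alignment padding */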
+
+/*****************************************************************************/
+/* Get Number of Memory Records */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_GET_NUM_MEM_REC */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IV_API_COMMAND_TYPE_T e_cmd;
+}iv_num_mem_rec_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * num_mem_rec
+ */
+ UWORD32 u4_num_mem_rec;
+}iv_num_mem_rec_op_t;
+
+
+/*****************************************************************************/
+/* Fill Memory Records */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_FILL_NUM_MEM_REC */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IV_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+     * pointer to an array of memory record structures, to be filled by the
+     * codec with details of its memory resource requirements
+ */
+ iv_mem_rec_t *pv_mem_rec_location;
+
+ /**
+     * maximum frame width for which the codec should compute memory requirements
+ */
+ UWORD32 u4_max_frm_wd;
+
+ /**
+     * maximum frame height for which the codec should compute memory requirements
+ */
+ UWORD32 u4_max_frm_ht;
+}iv_fill_mem_rec_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * no of memory record structures which are filled by codec
+ */
+ UWORD32 u4_num_mem_rec_filled;
+}iv_fill_mem_rec_op_t;
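+
+/*
+ * Illustrative calling sequence for the GET_NUM_MEM_REC / FILL_NUM_MEM_REC
+ * handshake above (a sketch: error handling is elided, and iv_api_function
+ * stands in for the codec-specific API entry point, which this sketch
+ * assumes):
+ *
+ *     iv_num_mem_rec_ip_t s_num_ip;
+ *     iv_num_mem_rec_op_t s_num_op;
+ *     s_num_ip.u4_size = sizeof(s_num_ip);
+ *     s_num_ip.e_cmd   = IV_CMD_GET_NUM_MEM_REC;
+ *     s_num_op.u4_size = sizeof(s_num_op);
+ *     iv_api_function(NULL, &s_num_ip, &s_num_op);
+ *
+ * The application then allocates s_num_op.u4_num_mem_rec records, sets
+ * u4_size on each, issues IV_CMD_FILL_NUM_MEM_REC, and finally allocates
+ * each pv_base honoring u4_mem_size and u4_mem_alignment.
+ */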
+
+
+/*****************************************************************************/
+/* Retrieve Memory Records */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_RETRIEVE_MEMREC */
+
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IV_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+     * pointer to an array of structures that the codec fills with details of all the memory resources held by it
+ */
+ iv_mem_rec_t *pv_mem_rec_location;
+}iv_retrieve_mem_rec_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * no of memory records filled by codec
+ */
+ UWORD32 u4_num_mem_rec_filled;
+}iv_retrieve_mem_rec_op_t;
+
+
+
+#endif /* _IV_H */
+
diff --git a/common/ivd.h b/common/ivd.h
new file mode 100644
index 0000000..812da18
--- /dev/null
+++ b/common/ivd.h
@@ -0,0 +1,946 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ivd.h
+*
+* @brief
+* This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface (API) of the
+* Ittiam Video Decoders
+*
+* @author
+* 100239(RCY)
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IVD_H
+#define _IVD_H
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+#define IVD_VIDDEC_MAX_IO_BUFFERS 64
+/*****************************************************************************/
+/* Typedefs */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+
+/* IVD_ARCH_T: Architecture Enumeration */
+typedef enum
+{
+ ARCH_NA = 0x7FFFFFFF,
+ ARCH_ARM_NONEON = 0x0,
+ ARCH_ARM_A9Q,
+ ARCH_ARM_A9A,
+ ARCH_ARM_A9,
+ ARCH_ARM_A7,
+ ARCH_ARM_A5,
+ ARCH_ARM_A15,
+ ARCH_ARM_NEONINTR,
+ ARCH_ARMV8_GENERIC,
+ ARCH_X86_GENERIC = 0x100,
+ ARCH_X86_SSSE3,
+ ARCH_X86_SSE42,
+ ARCH_X86_AVX2,
+ ARCH_MIPS_GENERIC = 0x200,
+ ARCH_MIPS_32
+}IVD_ARCH_T;
+
+/* IVD_SOC_T: SOC Enumeration */
+typedef enum
+{
+ SOC_NA = 0x7FFFFFFF,
+ SOC_GENERIC = 0x0,
+ SOC_HISI_37X = 0x100,
+}IVD_SOC_T;
+
+/* IVD_FRAME_SKIP_MODE_T:Skip mode Enumeration */
+
+typedef enum {
+ IVD_SKIP_NONE = 0x7FFFFFFF,
+ IVD_SKIP_P = 0x1,
+ IVD_SKIP_B = 0x2,
+ IVD_SKIP_I = 0x3,
+ IVD_SKIP_IP = 0x4,
+ IVD_SKIP_IB = 0x5,
+ IVD_SKIP_PB = 0x6,
+ IVD_SKIP_IPB = 0x7,
+ IVD_SKIP_IDR = 0x8,
+ IVD_SKIP_DEFAULT = IVD_SKIP_NONE,
+}IVD_FRAME_SKIP_MODE_T;
+
+/* IVD_VIDEO_DECODE_MODE_T: Set decoder to decode either a frame's worth of */
+/* data or only a header's worth of data */
+
+typedef enum {
+ IVD_DECODE_MODE_NA = 0x7FFFFFFF,
+
+ /* This enables the codec to process all decodable units */
+ IVD_DECODE_FRAME = 0x0,
+
+ /* This enables the codec to decode header only */
+ IVD_DECODE_HEADER = 0x1,
+
+
+
+}IVD_VIDEO_DECODE_MODE_T;
+
+
+/* IVD_DISPLAY_FRAME_OUT_MODE_T: Video Display Frame Output Mode */
+
+typedef enum {
+
+ IVD_DISPLAY_ORDER_NA = 0x7FFFFFFF,
+ /* To set codec to fill output buffers in display order */
+ IVD_DISPLAY_FRAME_OUT = 0x0,
+
+ /* To set codec to fill output buffers in decode order */
+ IVD_DECODE_FRAME_OUT = 0x1,
+}IVD_DISPLAY_FRAME_OUT_MODE_T;
+
+
+/* IVD_API_COMMAND_TYPE_T:API command type */
+typedef enum {
+ IVD_CMD_VIDEO_NA = 0x7FFFFFFF,
+ IVD_CMD_VIDEO_CTL = IV_CMD_DUMMY_ELEMENT + 1,
+ IVD_CMD_VIDEO_DECODE,
+ IVD_CMD_GET_DISPLAY_FRAME,
+ IVD_CMD_REL_DISPLAY_FRAME,
+ IVD_CMD_SET_DISPLAY_FRAME
+}IVD_API_COMMAND_TYPE_T;
+
+/* IVD_CONTROL_API_COMMAND_TYPE_T: Video Control API command type */
+
+typedef enum {
+ IVD_CMD_NA = 0x7FFFFFFF,
+ IVD_CMD_CTL_GETPARAMS = 0x0,
+ IVD_CMD_CTL_SETPARAMS = 0x1,
+ IVD_CMD_CTL_RESET = 0x2,
+ IVD_CMD_CTL_SETDEFAULT = 0x3,
+ IVD_CMD_CTL_FLUSH = 0x4,
+ IVD_CMD_CTL_GETBUFINFO = 0x5,
+ IVD_CMD_CTL_GETVERSION = 0x6,
+ IVD_CMD_CTL_CODEC_SUBCMD_START = 0x7
+}IVD_CONTROL_API_COMMAND_TYPE_T;
+
+
+/* IVD_ERROR_BITS_T: A UWORD32 container will be used for reporting the error*/
+/* code to the application. The first 8 bits starting from LSB have been */
+/* reserved for the codec to report internal error details. The rest of the */
+/* bits will be generic for all video decoders and each bit has an associated*/
+/* meaning as mentioned below. The unused bit fields are reserved for future */
+/* extensions and will be zero in the current implementation */
+
+typedef enum {
+ /* Bit 8 - Applied concealment. */
+ IVD_APPLIEDCONCEALMENT = 0x8,
+ /* Bit 9 - Insufficient input data. */
+ IVD_INSUFFICIENTDATA = 0x9,
+ /* Bit 10 - Data problem/corruption. */
+ IVD_CORRUPTEDDATA = 0xa,
+ /* Bit 11 - Header problem/corruption. */
+ IVD_CORRUPTEDHEADER = 0xb,
+ /* Bit 12 - Unsupported feature/parameter in input. */
+ IVD_UNSUPPORTEDINPUT = 0xc,
+    /* Bit 13 - Unsupported input parameter or configuration. */
+ IVD_UNSUPPORTEDPARAM = 0xd,
+    /* Bit 14 - Fatal error (stop the codec). If there is an */
+ /* error and this bit is not set, the error is a recoverable one. */
+ IVD_FATALERROR = 0xe,
+ /* Bit 15 - Invalid bitstream. Applies when Bitstream/YUV frame */
+ /* buffer for encode/decode call is made with non-valid or zero u4_size */
+ /* data */
+ IVD_INVALID_BITSTREAM = 0xf,
+ /* Bit 16 */
+ IVD_INCOMPLETE_BITSTREAM = 0x10,
+ IVD_ERROR_BITS_T_DUMMY_ELEMENT = 0x7FFFFFFF
+}IVD_ERROR_BITS_T;
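+
+/* Note that the values above are bit positions rather than masks, so a
+   returned u4_error_code is tested by shifting; for example (ps_dec_op is
+   a hypothetical pointer to a decode output structure):
+
+       if(ps_dec_op->u4_error_code & (1 << IVD_FATALERROR))
+           treat the failure as unrecoverable and stop feeding the decoder
+ */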
+
+
+/* IVD_ERROR_CODES_T: Generic error codes returned to the application */
+typedef enum {
+ IVD_ERROR_NONE = 0x0,
+ IVD_NUM_MEM_REC_FAILED = 0x1,
+ IVD_NUM_REC_NOT_SUFFICIENT = 0x2,
+ IVD_FILL_MEM_REC_FAILED = 0x3,
+ IVD_REQUESTED_WIDTH_NOT_SUPPPORTED = 0x4,
+ IVD_REQUESTED_HEIGHT_NOT_SUPPPORTED = 0x5,
+ IVD_INIT_DEC_FAILED = 0x6,
+ IVD_INIT_DEC_NOT_SUFFICIENT = 0x7,
+ IVD_INIT_DEC_WIDTH_NOT_SUPPPORTED = 0x8,
+ IVD_INIT_DEC_HEIGHT_NOT_SUPPPORTED = 0x9,
+ IVD_INIT_DEC_MEM_NOT_ALIGNED = 0xa,
+ IVD_INIT_DEC_COL_FMT_NOT_SUPPORTED = 0xb,
+ IVD_INIT_DEC_MEM_REC_NOT_SUFFICIENT = 0xc,
+ IVD_GET_VERSION_DATABUFFER_SZ_INSUFFICIENT = 0xd,
+ IVD_BUFFER_SIZE_SET_TO_ZERO = 0xe,
+ IVD_UNEXPECTED_END_OF_STREAM = 0xf,
+ IVD_SEQUENCE_HEADER_NOT_DECODED = 0x10,
+ IVD_STREAM_WIDTH_HEIGHT_NOT_SUPPORTED = 0x11,
+ IVD_MAX_FRAME_LIMIT_REACHED = 0x12,
+ IVD_IP_API_STRUCT_SIZE_INCORRECT = 0x13,
+ IVD_OP_API_STRUCT_SIZE_INCORRECT = 0x14,
+ IVD_HANDLE_NULL = 0x15,
+ IVD_HANDLE_STRUCT_SIZE_INCORRECT = 0x16,
+ IVD_INVALID_HANDLE_NULL = 0x17,
+ IVD_INVALID_API_CMD = 0x18,
+ IVD_UNSUPPORTED_API_CMD = 0x19,
+ IVD_MEM_REC_STRUCT_SIZE_INCORRECT = 0x1a,
+ IVD_DISP_FRM_ZERO_OP_BUFS = 0x1b,
+ IVD_DISP_FRM_OP_BUF_NULL = 0x1c,
+ IVD_DISP_FRM_ZERO_OP_BUF_SIZE = 0x1d,
+ IVD_DEC_FRM_BS_BUF_NULL = 0x1e,
+ IVD_SET_CONFG_INVALID_DEC_MODE = 0x1f,
+ IVD_SET_CONFG_UNSUPPORTED_DISP_WIDTH = 0x20,
+ IVD_RESET_FAILED = 0x21,
+ IVD_INIT_DEC_MEM_REC_OVERLAP_ERR = 0x22,
+ IVD_INIT_DEC_MEM_REC_BASE_NULL = 0x23,
+ IVD_INIT_DEC_MEM_REC_ALIGNMENT_ERR = 0x24,
+ IVD_INIT_DEC_MEM_REC_INSUFFICIENT_SIZE = 0x25,
+ IVD_INIT_DEC_MEM_REC_INCORRECT_TYPE = 0x26,
+ IVD_DEC_NUMBYTES_INV = 0x27,
+ IVD_DEC_REF_BUF_NULL = 0x28,
+ IVD_DEC_FRM_SKIPPED = 0x29,
+ IVD_RES_CHANGED = 0x2a,
+ IVD_DUMMY_ELEMENT_FOR_CODEC_EXTENSIONS = 0x300,
+}IVD_ERROR_CODES_T;
+
+
+/*****************************************************************************/
+/* Structure */
+/*****************************************************************************/
+/* structure for passing output buffers to codec during get display buffer */
+/* call */
+typedef struct {
+
+ /**
+ * number of output buffers
+ */
+ UWORD32 u4_num_bufs;
+
+ /**
+     * list of pointers to output buffers
+ */
+ UWORD8 *pu1_bufs[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+ /**
+ * sizes of each output buffer
+ */
+ UWORD32 u4_min_out_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+}ivd_out_bufdesc_t;
+
+/*****************************************************************************/
+/* Initialize decoder */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_INIT */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * e_cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+     * number of memory records allocated at the codec's request through the fill-mem-records call
+ */
+ UWORD32 u4_num_mem_rec;
+ /**
+     * maximum width for which codec should be initialized
+ */
+ UWORD32 u4_frm_max_wd;
+ /**
+     * maximum height for which codec should be initialized
+ */
+ UWORD32 u4_frm_max_ht;
+ /**
+ * format in which codec has to give out frame data for display
+ */
+ IV_COLOR_FORMAT_T e_output_format;
+ /**
+ * pointer to memrecord array, which contains allocated resources
+ */
+ iv_mem_rec_t *pv_mem_rec_location;
+}ivd_init_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * u4_error_code
+ */
+ UWORD32 u4_error_code;
+}ivd_init_op_t;
+
+
+/*****************************************************************************/
+/* Video Decode */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_DECODE */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * e_cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * u4_ts
+ */
+ UWORD32 u4_ts;
+
+ /**
+ * u4_num_Bytes
+ */
+ UWORD32 u4_num_Bytes;
+
+ /**
+ * pv_stream_buffer
+ */
+ void *pv_stream_buffer;
+
+ /**
+ * output buffer desc
+ */
+ ivd_out_bufdesc_t s_out_buffer;
+
+}ivd_video_decode_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * u4_error_code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * num_bytes_consumed
+ */
+ UWORD32 u4_num_bytes_consumed;
+
+ /**
+ * pic_wd
+ */
+ UWORD32 u4_pic_wd;
+
+ /**
+ * pic_ht
+ */
+ UWORD32 u4_pic_ht;
+
+ /**
+ * pic_type
+ */
+ IV_PICTURE_CODING_TYPE_T e_pic_type;
+
+ /**
+ * frame_decoded_flag
+ */
+ UWORD32 u4_frame_decoded_flag;
+
+ /**
+ * new_seq
+ */
+ UWORD32 u4_new_seq;
+
+ /**
+ * output_present
+ */
+ UWORD32 u4_output_present;
+
+ /**
+ * progressive_frame_flag
+ */
+ UWORD32 u4_progressive_frame_flag;
+
+ /**
+ * is_ref_flag
+ */
+ UWORD32 u4_is_ref_flag;
+
+ /**
+ * output_format
+ */
+ IV_COLOR_FORMAT_T e_output_format;
+
+ /**
+ * disp_frm_buf
+ */
+ iv_yuv_buf_t s_disp_frm_buf;
+
+ /**
+ * fld_type
+ */
+ IV_FLD_TYPE_T e4_fld_type;
+
+ /**
+ * ts
+ */
+ UWORD32 u4_ts;
+
+ /**
+ * disp_buf_id
+ */
+ UWORD32 u4_disp_buf_id;
+}ivd_video_decode_op_t;
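+
+/*
+ * Illustrative decode call (a sketch: ps_codec_obj, pu1_bitstream,
+ * u4_bytes_left and render are hypothetical, iv_api_function stands in for
+ * the codec-specific API entry point, and output buffer setup plus error
+ * handling are elided):
+ *
+ *     ivd_video_decode_ip_t s_ip;
+ *     ivd_video_decode_op_t s_op;
+ *     memset(&s_ip, 0, sizeof(s_ip));
+ *     memset(&s_op, 0, sizeof(s_op));
+ *     s_ip.u4_size          = sizeof(s_ip);
+ *     s_ip.e_cmd            = IVD_CMD_VIDEO_DECODE;
+ *     s_ip.pv_stream_buffer = pu1_bitstream;
+ *     s_ip.u4_num_Bytes     = u4_bytes_left;
+ *     s_op.u4_size          = sizeof(s_op);
+ *     iv_api_function(ps_codec_obj, &s_ip, &s_op);
+ *     pu1_bitstream += s_op.u4_num_bytes_consumed;
+ *     if(s_op.u4_output_present)
+ *         render(&s_op.s_disp_frm_buf);   // hypothetical sink
+ */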
+
+
+/*****************************************************************************/
+/* Get Display Frame */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_GET_DISPLAY_FRAME */
+
+typedef struct
+{
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * e_cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * output buffer desc
+ */
+ ivd_out_bufdesc_t s_out_buffer;
+
+}ivd_get_display_frame_ip_t;
+
+
+typedef struct
+{
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * progressive_frame_flag
+ */
+ UWORD32 u4_progressive_frame_flag;
+
+ /**
+ * pic_type
+ */
+ IV_PICTURE_CODING_TYPE_T e_pic_type;
+
+ /**
+ * is_ref_flag
+ */
+ UWORD32 u4_is_ref_flag;
+
+ /**
+ * output_format
+ */
+ IV_COLOR_FORMAT_T e_output_format;
+
+ /**
+ * disp_frm_buf
+ */
+ iv_yuv_buf_t s_disp_frm_buf;
+
+ /**
+ * fld_type
+ */
+ IV_FLD_TYPE_T e4_fld_type;
+
+ /**
+ * ts
+ */
+ UWORD32 u4_ts;
+
+ /**
+ * disp_buf_id
+ */
+ UWORD32 u4_disp_buf_id;
+}ivd_get_display_frame_op_t;
+
+/*****************************************************************************/
+/* Set Display Frame */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_SET_DISPLAY_FRAME */
+
+typedef struct
+{
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * num_disp_bufs
+ */
+ UWORD32 num_disp_bufs;
+
+ /**
+ * output buffer desc
+ */
+ ivd_out_bufdesc_t s_disp_buffer[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+}ivd_set_display_frame_ip_t;
+
+
+typedef struct
+{
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+}ivd_set_display_frame_op_t;
+
+
+/*****************************************************************************/
+/* Release Display Frame */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_REL_DISPLAY_FRAME */
+
+typedef struct
+{
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * e_cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * disp_buf_id
+ */
+ UWORD32 u4_disp_buf_id;
+}ivd_rel_display_frame_ip_t;
+
+
+typedef struct
+{
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+}ivd_rel_display_frame_op_t;
+
+/*****************************************************************************/
+/* Video control Flush */
+/*****************************************************************************/
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_CTL */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd = IVD_CMD_CTL_FLUSH */
+
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+}ivd_ctl_flush_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+}ivd_ctl_flush_op_t;
+
+/*****************************************************************************/
+/* Video control reset */
+/*****************************************************************************/
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_CTL */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd = IVD_CMD_CTL_RESET */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+}ivd_ctl_reset_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+}ivd_ctl_reset_op_t;
+
+
+/*****************************************************************************/
+/* Video control Set Params */
+/*****************************************************************************/
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_CTL */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_SETPARAMS */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_CTL_SETDEFAULT */
+
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+
+ /**
+ * vid_dec_mode
+ */
+ IVD_VIDEO_DECODE_MODE_T e_vid_dec_mode;
+
+ /**
+ * disp_wd
+ */
+ UWORD32 u4_disp_wd;
+
+ /**
+ * frm_skip_mode
+ */
+ IVD_FRAME_SKIP_MODE_T e_frm_skip_mode;
+
+ /**
+ * frm_out_mode
+ */
+ IVD_DISPLAY_FRAME_OUT_MODE_T e_frm_out_mode;
+}ivd_ctl_set_config_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * u4_error_code
+ */
+ UWORD32 u4_error_code;
+}ivd_ctl_set_config_op_t;
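+
+/*
+ * Illustrative control call that switches the decoder to header-only
+ * parsing (a sketch: ps_codec_obj and iv_api_function are assumed stand-ins,
+ * and error handling is elided):
+ *
+ *     ivd_ctl_set_config_ip_t s_ip;
+ *     ivd_ctl_set_config_op_t s_op;
+ *     s_ip.u4_size         = sizeof(s_ip);
+ *     s_ip.e_cmd           = IVD_CMD_VIDEO_CTL;
+ *     s_ip.e_sub_cmd       = IVD_CMD_CTL_SETPARAMS;
+ *     s_ip.e_vid_dec_mode  = IVD_DECODE_HEADER;
+ *     s_ip.u4_disp_wd      = 0;
+ *     s_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+ *     s_ip.e_frm_out_mode  = IVD_DISPLAY_FRAME_OUT;
+ *     s_op.u4_size         = sizeof(s_op);
+ *     iv_api_function(ps_codec_obj, &s_ip, &s_op);
+ */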
+
+/*****************************************************************************/
+/* Video control:Get Buf Info */
+/*****************************************************************************/
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_CTL */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_ctl_GETBUFINFO */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * e_cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+}ivd_ctl_getbufinfo_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * no of display buffer sets required by codec
+ */
+ UWORD32 u4_num_disp_bufs;
+
+ /**
+ * no of input buffers required for codec
+ */
+ UWORD32 u4_min_num_in_bufs;
+
+ /**
+ * no of output buffers required for codec
+ */
+ UWORD32 u4_min_num_out_bufs;
+
+ /**
+ * sizes of each input buffer required
+ */
+ UWORD32 u4_min_in_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+ /**
+ * sizes of each output buffer required
+ */
+ UWORD32 u4_min_out_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+}ivd_ctl_getbufinfo_op_t;
+
+
+/*****************************************************************************/
+/* Video control:Getstatus Call */
+/*****************************************************************************/
+
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_CTL */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_ctl_GETPARAMS */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+}ivd_ctl_getstatus_ip_t;
+
+
+typedef struct {
+
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * no of display buffer sets required by codec
+ */
+ UWORD32 u4_num_disp_bufs;
+
+ /**
+ * u4_pic_ht
+ */
+ UWORD32 u4_pic_ht;
+
+ /**
+ * u4_pic_wd
+ */
+ UWORD32 u4_pic_wd;
+
+ /**
+ * frame_rate
+ */
+ UWORD32 u4_frame_rate;
+
+ /**
+ * u4_bit_rate
+ */
+ UWORD32 u4_bit_rate;
+
+ /**
+ * content_type
+ */
+ IV_CONTENT_TYPE_T e_content_type;
+
+ /**
+ * output_chroma_format
+ */
+ IV_COLOR_FORMAT_T e_output_chroma_format;
+
+ /**
+ * no of input buffers required for codec
+ */
+ UWORD32 u4_min_num_in_bufs;
+
+ /**
+ * no of output buffers required for codec
+ */
+ UWORD32 u4_min_num_out_bufs;
+
+ /**
+ * sizes of each input buffer required
+ */
+ UWORD32 u4_min_in_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+
+ /**
+ * sizes of each output buffer required
+ */
+ UWORD32 u4_min_out_buf_size[IVD_VIDDEC_MAX_IO_BUFFERS];
+}ivd_ctl_getstatus_op_t;
+
+
+/*****************************************************************************/
+/* Video control:Get Version Info */
+/*****************************************************************************/
+
+/* IVD_API_COMMAND_TYPE_T::e_cmd = IVD_CMD_VIDEO_CTL */
+/* IVD_CONTROL_API_COMMAND_TYPE_T::e_sub_cmd=IVD_CMD_ctl_GETVERSION */
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+
+ /**
+ * pv_version_buffer
+ */
+ void *pv_version_buffer;
+
+ /**
+ * version_buffer_size
+ */
+ UWORD32 u4_version_buffer_size;
+}ivd_ctl_getversioninfo_ip_t;
+
+
+typedef struct {
+ /**
+ * u4_size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error code
+ */
+ UWORD32 u4_error_code;
+}ivd_ctl_getversioninfo_op_t;
+
+#endif /* _IVD_H */
+
diff --git a/common/mips/ihevc_func_selector.h b/common/mips/ihevc_func_selector.h
new file mode 100644
index 0000000..8188178
--- /dev/null
+++ b/common/mips/ihevc_func_selector.h
@@ -0,0 +1,227 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_func_selector.h
+*
+* @brief
+* For each function, decides whether to use the C implementation, Neon
+* intrinsics, Cortex-A8 intrinsics, Neon assembly or Cortex-A8 assembly
+*
+* @author
+* Harish
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef __IHEVC_FUNC_SELECTOR_H__
+#define __IHEVC_FUNC_SELECTOR_H__
+
+#include "ihevc_func_types.h"
+
+#define INTER_PRED_LUMA_COPY C
+#define INTER_PRED_LUMA_HORZ C
+#define INTER_PRED_LUMA_VERT C
+#define INTER_PRED_LUMA_COPY_W16OUT C
+#define INTER_PRED_LUMA_HORZ_W16OUT C
+
+#define INTER_PRED_LUMA_VERT_W16OUT C
+#define INTER_PRED_LUMA_VERT_W16INP C
+#define INTER_PRED_LUMA_VERT_W16INP_W16OUT C
+
+#define INTER_PRED_CHROMA_COPY C
+#define INTER_PRED_CHROMA_HORZ C
+#define INTER_PRED_CHROMA_VERT C
+#define INTER_PRED_CHROMA_COPY_W16OUT C
+#define INTER_PRED_CHROMA_HORZ_W16OUT C
+#define INTER_PRED_CHROMA_VERT_W16OUT C
+#define INTER_PRED_CHROMA_VERT_W16INP C
+#define INTER_PRED_CHROMA_VERT_W16INP_W16OUT C
+
+#define WEIGHTED_PRED_UNI C
+#define WEIGHTED_PRED_BI C
+#define WEIGHTED_PRED_BI_DEFAULT C
+#define WEIGHTED_PRED_CHROMA_UNI C
+#define WEIGHTED_PRED_CHROMA_BI C
+#define WEIGHTED_PRED_CHROMA_BI_DEFAULT C
+
+#define PAD_VERT C
+#define PAD_HORZ C
+#define PAD_LEFT_LUMA C
+#define PAD_LEFT_CHROMA C
+#define PAD_RIGHT_LUMA C
+#define PAD_RIGHT_CHROMA C
+
+#define DEBLOCKING_ASM C
+#define DEBLK_LUMA_HORZ C
+#define DEBLK_LUMA_VERT C
+#define DEBLK_CHROMA_HORZ C
+#define DEBLK_CHROMA_VERT C
+
+#define SAO_BAND_OFFSET_LUMA C
+#define SAO_BAND_OFFSET_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS0_LUMA C
+#define SAO_EDGE_OFFSET_CLASS1_LUMA C
+#define SAO_EDGE_OFFSET_CLASS2_LUMA C
+#define SAO_EDGE_OFFSET_CLASS3_LUMA C
+#define SAO_EDGE_OFFSET_CLASS0_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS1_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS2_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS3_CHROMA C
+
+#define INTRA_PRED_LUMA_REF_SUBSTITUTION C
+#define INTRA_PRED_REF_FILTERING C
+#define INTRA_PRED_LUMA_PLANAR C
+#define INTRA_PRED_LUMA_DC C
+#define INTRA_PRED_LUMA_HORZ C
+#define INTRA_PRED_LUMA_VER C
+#define INTRA_PRED_LUMA_MODE_2 C
+#define INTRA_PRED_LUMA_MODE_18_34 C
+#define INTRA_PRED_LUMA_MODE_3_T0_9 C
+#define INTRA_PRED_LUMA_MODE_11_T0_17 C
+#define INTRA_PRED_LUMA_MODE_19_T0_25 C
+#define INTRA_PRED_LUMA_MODE_27_T0_33 C
+
+#define INTRA_PRED_CHROMA_PLANAR C
+#define INTRA_PRED_CHROMA_DC C
+#define INTRA_PRED_CHROMA_HOR C
+#define INTRA_PRED_CHROMA_VER C
+#define INTRA_PRED_CHROMA_MODE_2 C
+#define INTRA_PRED_CHROMA_18_34 C
+#define INTRA_PRED_CHROMA_3_T0_9 C
+#define INTRA_PRED_CHROMA_11_T0_17 C
+#define INTRA_PRED_CHROMA_19_T0_25 C
+#define INTRA_PRED_CHROMA_27_T0_33 C
+#define INTRA_PRED_CHROMA_REF_SUBSTITUTION C
+
+/* Forward transform functions */
+/* Luma */
+#define RESI_TRANS_QUANT_4X4_TTYPE1 C
+#define RESI_TRANS_QUANT_4X4 C
+#define RESI_TRANS_QUANT_8X8 C
+#define RESI_TRANS_QUANT_16X16 C
+#define RESI_TRANS_QUANT_32X32 C
+
+#define RESI_QUANT_4X4_TTYPE1 C
+#define RESI_QUANT_4X4 C
+#define RESI_QUANT_8X8 C
+#define RESI_QUANT_16X16 C
+#define RESI_QUANT_32X32 C
+
+#define RESI_TRANS_4X4_TTYPE1 C
+#define RESI_TRANS_4X4 C
+#define RESI_TRANS_8X8 C
+#define RESI_TRANS_16X16 C
+#define RESI_TRANS_32X32 C
+
+#define RESI_4X4_TTYPE1 C
+#define RESI_4X4 C
+#define RESI_8X8 C
+#define RESI_16X16 C
+#define RESI_32X32 C
+
+#define TRANS_4X4_TTYPE1 C
+#define TRANS_4X4 C
+#define TRANS_8X8 C
+#define TRANS_16X16 C
+#define TRANS_32X32 C
+
+#define QUANT_4X4_TTYPE1 C
+#define QUANT_4X4 C
+#define QUANT_8X8 C
+#define QUANT_16X16 C
+#define QUANT_32X32 C
+
+/* Chroma interleaved*/
+#define CHROMA_RESI_TRANS_QUANT_4X4 C
+#define CHROMA_RESI_TRANS_QUANT_8X8 C
+#define CHROMA_RESI_TRANS_QUANT_16X16 C
+
+#define CHROMA_RESI_QUANT_4X4 C
+#define CHROMA_RESI_QUANT_8X8 C
+#define CHROMA_RESI_QUANT_16X16 C
+
+#define CHROMA_RESI_TRANS_4X4 C
+#define CHROMA_RESI_TRANS_8X8 C
+#define CHROMA_RESI_TRANS_16X16 C
+
+#define CHROMA_RESI_4X4 C
+#define CHROMA_RESI_8X8 C
+#define CHROMA_RESI_16X16 C
+
+/* Inverse transform functions */
+/* Luma */
+#define IQUANT_ITRANS_RECON_4X4_TTYPE1 C
+#define IQUANT_ITRANS_RECON_4X4 C
+#define IQUANT_ITRANS_RECON_8X8 C
+#define IQUANT_ITRANS_RECON_16X16 C
+#define IQUANT_ITRANS_RECON_32X32 C
+
+#define IQUANT_RECON_4X4_TTYPE1 C
+#define IQUANT_RECON_4X4 C
+#define IQUANT_RECON_8X8 C
+#define IQUANT_RECON_16X16 C
+#define IQUANT_RECON_32X32 C
+
+#define ITRANS_RECON_4X4_TTYPE1 C
+#define ITRANS_RECON_4X4 C
+#define ITRANS_RECON_8X8 C
+#define ITRANS_RECON_16X16 C
+#define ITRANS_RECON_32X32 C
+
+#define RECON_4X4_TTYPE1 C
+#define RECON_4X4 C
+#define RECON_8X8 C
+#define RECON_16X16 C
+#define RECON_32X32 C
+
+#define ITRANS_4X4_TTYPE1 C
+#define ITRANS_4X4 C
+#define ITRANS_8X8 C
+#define ITRANS_16X16 C
+#define ITRANS_32X32 C
+
+/* Chroma interleaved */
+#define CHROMA_IQUANT_ITRANS_RECON_4X4 C
+#define CHROMA_IQUANT_ITRANS_RECON_8X8 C
+#define CHROMA_IQUANT_ITRANS_RECON_16X16 C
+
+#define CHROMA_IQUANT_RECON_4X4 C
+#define CHROMA_IQUANT_RECON_8X8 C
+#define CHROMA_IQUANT_RECON_16X16 C
+
+#define CHROMA_ITRANS_RECON_4X4 C
+#define CHROMA_ITRANS_RECON_8X8 C
+#define CHROMA_ITRANS_RECON_16X16 C
+
+#define CHROMA_RECON_4X4 C
+#define CHROMA_RECON_8X8 C
+#define CHROMA_RECON_16X16 C
+
+#define IHEVC_MEMCPY C
+#define IHEVC_MEMSET C
+#define IHEVC_MEMSET_16BIT C
+#define IHEVC_MEMCPY_MUL_8 C
+#define IHEVC_MEMSET_MUL_8 C
+#define IHEVC_MEMSET_16BIT_MUL_8 C
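+
+/*
+ * A minimal sketch of how per-function selector tokens like the ones above
+ * can be consumed via token pasting (illustrative only; the actual dispatch
+ * in this tree is governed by ihevc_func_types.h and the codec's
+ * function-pointer initialization, and the helper names below are
+ * hypothetical):
+ *
+ *     #define FN_SUFFIX_EXPAND(name, impl) name##_##impl
+ *     #define FN_SUFFIX(name, impl)        FN_SUFFIX_EXPAND(name, impl)
+ *
+ * With IHEVC_MEMCPY defined as the bare token C, FN_SUFFIX(ihevc_memcpy,
+ * IHEVC_MEMCPY) would expand to ihevc_memcpy_C.
+ */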
+
+#endif /* __IHEVC_FUNC_SELECTOR_H__ */
diff --git a/common/mips/ihevc_platform_macros.h b/common/mips/ihevc_platform_macros.h
new file mode 100644
index 0000000..4973239
--- /dev/null
+++ b/common/mips/ihevc_platform_macros.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+
+#define CLIP_U8(x) CLIP3((x), 0, 255)
+#define CLIP_S8(x) CLIP3((x), -128, 127)
+
+#define CLIP_U10(x) CLIP3((x), 0, 1023)
+#define CLIP_S10(x) CLIP3((x), -512, 511)
+
+#define CLIP_U12(x) CLIP3((x), 0, 4095)
+#define CLIP_S12(x) CLIP3((x), -2048, 2047)
+
+#define CLIP_U16(x) CLIP3((x), 0, 65535)
+#define CLIP_S16(x) CLIP3((x), -32768, 32767)
+
+#define ITT_BIG_ENDIAN(x)   (((((UWORD32)(x)) & 0x000000ff) << 24) | \
+                             ((((UWORD32)(x)) & 0x0000ff00) <<  8) | \
+                             ((((UWORD32)(x)) & 0x00ff0000) >>  8) | \
+                             (((UWORD32)(x)) >> 24))
+
+#define SHL(x,y) ((x) << (y))
+#define SHR(x,y) ((x) >> (y))
+
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
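+
+/* Worked examples for the sign-aware shifts above: SHR_NEG(x, 2) is x >> 2,
+   while SHR_NEG(x, -2) becomes x << 2; symmetrically, SHL_NEG(x, -3)
+   is x >> 3 */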
+
+
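+/* CLZ relies on the MIPS32 clz instruction, which returns 32 for a zero
+   input; the CTZ wrapper below instead maps a zero input to 31 by this
+   codebase's convention, so callers should not assume symmetric behavior */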
+static inline UWORD32 CLZ(UWORD32 x)
+{
+ asm("clz %0, %1" : "=r"(x) : "r"(x));
+ return x;
+}
+
+static inline UWORD32 CTZ(UWORD32 u4_word)
+{
+ if(0 == u4_word)
+ return 31;
+ else
+ {
+ unsigned int index;
+ index = __builtin_ctz(u4_word);
+ return (UWORD32)index;
+ }
+}
+
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < (nop_cnt); nop_i++);}
+
+#define INLINE
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
diff --git a/common/x86/ihevc_16x16_itrans_recon_sse42_intr.c b/common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
new file mode 100644
index 0000000..afdca10
--- /dev/null
+++ b/common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
@@ -0,0 +1,3337 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_16x16_itrans_recon_sse42_intr.c
+ *
+ * @brief
+ * Contains function definitions for inverse
+ * transform and reconstruction for 16x16.
+ *
+ * @author
+ * 100470
+ * 100592 (edited by)
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_16x16_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform and
+ * reconstruction for 16x16 input block
+ *
+ * @par Description:
+ * Performs inverse quantization , inverse transform and adds the
+ * prediction data and clips output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 16x16 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_26;
+ __m128i m_temp_reg_27;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+ __m128i m_rdng_factor;
+ __m128i m_count;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+ __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+ WORD32 i;
+
+ WORD32 zero_last8_cols_stg1;
+ WORD32 zero_last8_rows_stg1;
+ WORD32 zero_last12_rows_stg1;
+ WORD32 zero_last12_rows_stg2;
+ WORD32 zero_last8_rows_stg2;
+
+ WORD32 loop = 0;
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ WORD32 trans_size = TRANS_SIZE_16;
+
+    /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
+
+ zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
+ zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
+ zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
+
+ zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
+ zero_last8_rows_stg2 = zero_last8_cols_stg1;
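+
+    /* In zero_cols/zero_rows, bit k set means column/row k of the 16x16
+       coefficient block is entirely zero; the 0xFF00 tests above therefore
+       detect that positions 8..15 carry no data, and 0xFFF0 that positions
+       4..15 carry none, letting the transform stages skip that work */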
+
+    if(zero_last8_cols_stg1)
+    {
+        loop = 1;
+    }
+    else
+    {
+        loop = 2;
+    }
+
+ /* i = 0 => lower 8 samples */
+ /* i = 1 => higher 8 samples */
+ for(i = 0; i < loop; i++)
+ {
+ {
+ WORD32 sample_half_index = i << 3;
+ WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
+ WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+
+
+
+
+ /* If last 12 rows are zero : Rishab */
+ if(zero_last12_rows_stg1)
+ {
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+
+ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+ m_temp_reg_26 = m_temp_reg_24;
+ m_temp_reg_27 = m_temp_reg_25;
+ }
+
+ /* eo */
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+ }
+ /* If last 8 rows are zero : Rishab */
+ else if(zero_last8_rows_stg1)
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+ //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_temp_reg_26 = m_temp_reg_24;
+ m_temp_reg_27 = m_temp_reg_25;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+ }
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+ } /* If all the rows are non-zero : Rishab */
+ else
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+ }
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ }
+ }
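+    /*
+     * At this point the even half e[0..7] of stage 1 is complete for this
+     * 8-column group, held at 32-bit precision in the scratch buffer.  Each
+     * stored pair follows the HEVC partial butterfly, per 4-sample lane:
+     *     e[k]     = ee[k] + eo[k];
+     *     e[7 - k] = ee[k] - eo[k];
+     * with ee[] built the same way from eee[] and eeo[].  The odd half o[]
+     * is folded in below.
+     */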
+
+ {
+ WORD32 sample_half_index = i << 3;
+ WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ }
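+    /*
+     * The loads above fetch the odd input rows of this 8-column group:
+     * starting at pi2_src + src_strd and stepping 2 * src_strd picks up
+     * rows 1, 3, 5, ..., 15 into m_temp_reg_70..77.
+     */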
+
+ /* o & stage 1 out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+ WORD32 out_stride = (trans_size << 1);
+        WORD32 in_stride = (trans_size << 1);
+
+ if(zero_last12_rows_stg1)
+ {
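+            /*
+             * Rows 4..15 are zero in this branch, so only odd rows 1 and 3
+             * contribute: each o[k] below needs a single madd against one
+             * coefficient pair.
+             */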
+ for(j = 0; j < 2; j++)
+ {
+                if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
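+                /*
+                 * Scalar view of the block above (and of o1..o7 below),
+                 * per output column:
+                 *     tmp[k]      = sat16((e + o + (1 << (shift - 1))) >> shift);
+                 *     tmp[15 - k] = sat16((e - o + (1 << (shift - 1))) >> shift);
+                 * implemented by the add/sub, _mm_sra_epi32 and
+                 * _mm_packs_epi32 sequence.
+                 */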
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+ }
+ }
+ else if(zero_last8_rows_stg1)
+ {
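+            /*
+             * Rows 8..15 are zero in this branch, so odd rows 1, 3, 5 and 7
+             * contribute: each o[k] below is built from two madds.
+             */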
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+ }
+
+ }
+ else
+ {
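+            /*
+             * General case: all eight odd rows 1, 3, ..., 15 contribute, so
+             * each o[k] below takes four madds, one per interleaved row pair.
+             */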
+
+ for(j = 0; j < 2; j++)
+ {
+                if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+ }
+ }
+ }
+
+ /* Transpose */
+ {
+ WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+        WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = (trans_size << 1);
+ WORD32 j;
+
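+        /*
+         * Each iteration transposes one 8x8 block of 16-bit results with the
+         * usual three-level unpack (epi16 -> epi32 -> epi64).  The zig-zag
+         * pointer walk (+in_stride, +8, -in_stride) matches the order in
+         * which the o-stage above interleaved its outputs.
+         */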
+ for(j = 0; j < 2; j++)
+ {
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+ pi2_src_scratch += in_stride;
+ m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+ pi2_src_scratch += in_stride;
+ m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+ pi2_src_scratch += 8;
+ m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+ pi2_src_scratch += 8;
+
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+ m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+ m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+ m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+ m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+ m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+ m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+ m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+ m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+ m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+ m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+
+ m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+ m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+
+ m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+ m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+
+ m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+ m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
+ pi2_dst_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
+ pi2_dst_scratch += 8;
+ }
+ }
+ }
+
+ if(zero_last8_cols_stg1)
+ {
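+        /*
+         * The last 8 columns of the input are zero, so the second half of
+         * the stage-1 scratch (columns 8..15) is cleared here instead of
+         * being computed; stage 2 below can then read both halves uniformly.
+         */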
+ WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 j;
+
+ m_temp_reg_40 = _mm_setzero_si128();
+ for(j = 0; j < 2; j++)
+ {
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += 8;
+ }
+ }
+
+ /* Stage 2 */
+ for(i = 0; i < 2; i++)
+ {
+ WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
+ WORD32 stride = (trans_size);
+ WORD16 temp_array[256];
+
+ i4_shift = IT_SHIFT_STAGE_2;
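+        /*
+         * Stage-2 rounding follows the same pattern as stage 1:
+         *     out = sat16((sum + (1 << (i4_shift - 1))) >> i4_shift)
+         * with i4_shift = IT_SHIFT_STAGE_2, the larger second-stage shift of
+         * the HEVC inverse transform (7 and 12 for 8-bit content).
+         */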
+
+ if(zero_last12_rows_stg2)
+ {
+            /* eeo */
+            /* rows 4 and 12 are zero in this branch, so eeo[0] and eeo[1] */
+            /* (m_temp_reg_20-23) are simply cleared below */
+ {
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+
+                if(!i)
+                {
+                    pi2_src_temp += (stride * 6 + 8);
+                }
+                else
+                {
+                    pi2_src_temp += (stride * 2 + 8);
+                }
+
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+ m_temp_reg_20 = _mm_setzero_si128();
+ m_temp_reg_22 = _mm_setzero_si128();
+
+ m_temp_reg_21 = _mm_setzero_si128();
+ m_temp_reg_23 = _mm_setzero_si128();
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+                /* Loading coeff and src for use in next block */
+ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
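+                /*
+                 * _mm_cmpgt_epi16(0, x) yields each lane's sign mask, so the
+                 * unpacks below sign-extend row 0 to 32 bits; the shift left
+                 * by 6 then applies the 64x DC weight.
+                 */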
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+                m_temp_reg_26 = m_temp_reg_24; /* row 8 is zero, so eee[1] == eee[0] */
+                m_temp_reg_27 = m_temp_reg_25;
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
+ }
+
+ /* eo */
+ {
+ WORD16 *pi2_scratch = temp_array;
+ WORD32 out_stride = 8;
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] == ee[3] == eee[0] here (eeo = 0), in m_temp_reg_24-25 */
+
+ /* e[0][0-3] stored in pu1_dst[0] */
+ /* e[7][0-3] stored in pu1_dst[1] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* ee[0] == ee[3] == eee[0] here (eeo = 0), in m_temp_reg_24-25 */
+
+ /* e[0][4-7] stored in pu1_dst[2] */
+ /* e[7][4-7] stored in pu1_dst[3] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] == ee[2] == eee[1] here (eeo = 0), in m_temp_reg_26-27 */
+
+ /* e[1][0-3] stored in pu1_dst[4] */
+ /* e[6][0-3] stored in pu1_dst[5] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* ee[1] == ee[2] == eee[1] here (eeo = 0), in m_temp_reg_26-27 */
+
+ /* e[1][4-7] stored in pu1_dst[6]*/
+ /* e[6][4-7] stored in pu1_dst[7] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pu1_dst[8]*/
+ /* e[5][0-3] stored in pu1_dst[9] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ /* e[2][4-7] stored in pu1_dst[10]*/
+ /* e[5][4-7] stored in pu1_dst[11] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pu1_dst[12]*/
+ /* e[4][0-3] stored in pu1_dst[13] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ /* e[3][4-7] stored in pu1_dst[14]*/
+ /* e[4][4-7] stored in pu1_dst[15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ }
+ }
+ else if(zero_last8_rows_stg2)
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+ pi2_src_temp += (stride * 8);
+
+ if(!i)
+ {
+ pi2_src_temp += (stride * 6 + 8);
+ }
+ else
+ {
+ pi2_src_temp += (stride * 2 + 8);
+ }
+
+ pi2_src_temp -= (stride * 8);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+
+ m_temp_reg_76 = _mm_setzero_si128();
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+
+
+ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
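+                /*
+                 * Same sign-extension trick as above: compare against zero
+                 * for the sign mask, unpack to 32 bits, then shift left by 6
+                 * for the 64x weight.
+                 */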
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+                m_temp_reg_26 = m_temp_reg_24; /* row 8 is zero here too, so eee[1] == eee[0] */
+ m_temp_reg_27 = m_temp_reg_25;
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ }
+
+ /* eo */
+ {
+ WORD16 *pi2_scratch = temp_array;
+ WORD32 out_stride = 8;
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+ /* e[0][0-3] stored in pu1_dst[0] */
+ /* e[7][0-3] stored in pu1_dst[1] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pu1_dst[2] */
+ /* e[7][4-7] stored in pu1_dst[3] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pu1_dst[4] */
+ /* e[6][0-3] stored in pu1_dst[5] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pu1_dst[6]*/
+ /* e[6][4-7] stored in pu1_dst[7] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pu1_dst[8]*/
+ /* e[5][0-3] stored in pu1_dst[9] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ /* e[2][4-7] stored in pu1_dst[10]*/
+ /* e[5][4-7] stored in pu1_dst[11] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pu1_dst[12]*/
+ /* e[4][0-3] stored in pu1_dst[13] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ /* e[3][4-7] stored in pu1_dst[14]*/
+ /* e[4][4-7] stored in pu1_dst[15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+ }
+ }
+ else
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+ pi2_src_temp += (stride * 7);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
+ pi2_src_temp += (stride);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
+ if(!i)
+ {
+ pi2_src_temp += (stride * 6 + 8);
+ }
+ else
+ {
+ pi2_src_temp += (stride * 2 + 8);
+ }
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
+ pi2_src_temp -= (stride);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
+ pi2_src_temp -= (stride * 7);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+ }
+
+ /* eo */
+ {
+ WORD16 *pi2_scratch = temp_array;
+ WORD32 out_stride = 8;
+
+
+
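+            /* Each even output pair is ee[k] +/- the eo contribution; the
+             * add and sub results are stored to the scratch buffer in
+             * consecutive 8-short groups, hence out_stride = 8. */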
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+
+ }
+
+ /* eo0[4-7] */
+ {
+
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+ }
+ }
+
+ if(zero_last12_rows_stg2)
+ {
+ /* o & stage 2 pre-transposed out */
+ {
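+            /* Only stage-1 rows 0..3 are significant, so the odd part
+             * reduces to rows 1 and 3: a single madd per output term. */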
+ WORD32 j;
+ WORD16 *pi2_src_scratch = temp_array;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+ WORD32 out_stride = (trans_size);
+ WORD32 in_stride = (8) * 4;
+
+ pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+
+ pi2_src_temp += (stride * 9);
+
+ if(0 == i)
+ {
+ pi2_src_temp -= (stride * 2 - 8);
+ }
+ else
+ {
+ pi2_src_temp -= (stride * 6 - 8);
+ }
+ pi2_src_temp -= (stride * 9);
+
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
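+                    /* Symmetric rounding before the down-shift:
+                     * out = (x + (1 << (i4_shift - 1))) >> i4_shift, with the
+                     * rounding constant broadcast to all four 32-bit lanes. */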
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+
+ }
+ }
+ }
+ else if(zero_last8_rows_stg2)
+ {
+ /* o & stage 2 pre-transposed out */
+ {
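+            /* Stage-1 rows 0..7 are significant: the odd part uses rows
+             * 1, 3, 5 and 7, i.e. two madds accumulated per output term. */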
+ WORD32 j;
+ WORD16 *pi2_src_scratch = temp_array;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+ WORD32 out_stride = (trans_size);
+ WORD32 in_stride = (8) * 4;
+
+ pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+ pi2_src_temp += (stride * 8);
+
+ if(0 == i)
+ {
+ pi2_src_temp -= (stride * 2 - 8);
+ }
+ else
+ {
+ pi2_src_temp -= (stride * 6 - 8);
+ }
+
+ pi2_src_temp -= (stride * 8);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+ }
+ }
+ }
+ else
+ {
+ /* o & stage 2 pre-transposed out */
+ {
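+            /* Full odd kernel: all eight odd rows (1, 3, ..., 15)
+             * contribute, i.e. four madds accumulated per output term. */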
+ WORD32 j;
+ WORD16 *pi2_src_scratch = temp_array;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+ WORD32 out_stride = (trans_size);
+ WORD32 in_stride = (8) * 4;
+
+ pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+ pi2_src_temp += (stride * 7);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
+ pi2_src_temp += (stride);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
+ if(0 == i)
+ {
+ pi2_src_temp -= (stride * 2 - 8);
+ }
+ else
+ {
+ pi2_src_temp -= (stride * 6 - 8);
+ }
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
+ pi2_src_temp -= (stride);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
+ pi2_src_temp -= (stride * 7);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+ for(j = 0; j < 2; j++)
+ {
+
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ }
+ }
+ }
+ }
+
+ /* Transpose */
+ {
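+        /* Interleave the 16-bit stage-2 results back into row order, add the
+         * prediction (widened from 8 bits against the zero register) and
+         * pack with unsigned saturation, which is the final 8-bit clip. */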
+ WORD16 *pi2_src_scratch;
+ UWORD8 *pu1_pred_temp = pu1_pred;
+ WORD32 out_stride = dst_strd;
+ WORD32 in_stride = trans_size;
+ WORD32 j;
+ m_temp_reg_1 = _mm_setzero_si128();
+ for(i = 0; i < 2; i++)
+ {
+ pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
+
+ for(j = 0; j < 2; j++)
+ {
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+ pi2_src_scratch += ((!i) * in_stride + 8);
+ m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+ pi2_src_scratch += (in_stride);
+ m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+ pi2_src_scratch += (i * in_stride + 8);
+ m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+ pi2_src_scratch += in_stride;
+ m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+ pi2_src_scratch += ((!i) * in_stride + 8);
+ m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+ pi2_src_scratch += in_stride;
+ m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+ pi2_src_scratch += (i * in_stride + 8);
+
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+ m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+ m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+ m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+ m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+ m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+ m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+ m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+ m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+ m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+ m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+
+ m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+ m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
+ m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+
+ m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+ m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
+ m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+
+ m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+ m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
+ m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+ }
+ }
+ }
+}
diff --git a/common/x86/ihevc_32x32_itrans_recon_sse42_intr.c b/common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
new file mode 100644
index 0000000..ec8c5c1
--- /dev/null
+++ b/common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
@@ -0,0 +1,6636 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_32x32_itrans_recon_sse42_intr.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_32x32_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform and
+ * reconstruction for 32x32 input block
+ *
+ * @par Description:
+ * Performs inverse quantization and inverse transform, adds the
+ * prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 32x32 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
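+/* Illustrative scalar sketch of the even/odd ("partial butterfly")
+ * decomposition that the SIMD code below vectorizes; the names e, o and
+ * rnd are explanatory only and not part of this file:
+ *
+ *     o[k]       = sum over odd rows r of trans[r][k] * src[r]
+ *     e[k]       = half-size inverse transform of the even rows
+ *     dst[k]     = (e[k] + o[k] + rnd) >> shift
+ *     dst[N-1-k] = (e[k] - o[k] + rnd) >> shift,   rnd = 1 << (shift - 1)
+ *
+ * Stage 1 runs down the columns with shift = IT_SHIFT_STAGE_1; stage 2 runs
+ * across the rows, after which the result is added to pu1_pred and
+ * saturated to 8 bits.
+ */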
+
+void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ /* Inverse Transform */
+
+ WORD32 j;
+
+
+ WORD16 *pi2_tmp_orig;
+
+
+ WORD16 *o_temp_ptr;
+ WORD16 *temp_ptr;
+
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_16;
+ __m128i m_temp_reg_17;
+ __m128i m_temp_reg_18;
+ __m128i m_temp_reg_19;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+
+ __m128i m_temp_reg_80;
+ __m128i m_temp_reg_81;
+ __m128i m_temp_reg_82;
+ __m128i m_temp_reg_83;
+ __m128i m_temp_reg_84;
+ __m128i m_temp_reg_85;
+ __m128i m_temp_reg_86;
+ __m128i m_temp_reg_87;
+
+ __m128i m_temp_reg_90;
+ __m128i m_temp_reg_91;
+ __m128i m_temp_reg_92;
+ __m128i m_temp_reg_93;
+ __m128i m_temp_reg_94;
+ __m128i m_temp_reg_95;
+ __m128i m_temp_reg_96;
+ __m128i m_temp_reg_97;
+
+ __m128i m_rdng_factor;
+ __m128i m_count;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+ __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+ __m128i temp1, temp2, temp3, temp4;
+ __m128i temp5, temp6, temp7, temp8;
+
+ __m128i all_zero_reg;
+ WORD32 i;
+
+ WORD32 zero_last24_cols_stg1;
+ WORD32 zero_last24_rows_stg1;
+ WORD32 zero_last28_rows_stg1;
+
+ WORD32 zero_last28_rows_stg2;
+ WORD32 zero_last24_rows_stg2;
+
+ WORD32 trans_size_stg1;
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ WORD32 trans_size = TRANS_SIZE_32;
+
+
+    /* Last 24 (or 28) rows/cols of the 32x32 block are skipped based on the below flags */
+ zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+ zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+ zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+
+ zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+ zero_last24_rows_stg2 = zero_last24_cols_stg1;
+
+ if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
+ {
+ trans_size_stg1 = 8;
+
+ }
+ else
+ {
+ trans_size_stg1 = 32;
+ }
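+    /* When at most the first 8 rows/columns carry non-zero coefficients,
+     * only 8 of the 32 stage-1 column passes are needed. */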
+
+ all_zero_reg = _mm_setzero_si128();
+
+ o_temp_ptr = pi2_tmp;
+ temp_ptr = (pi2_tmp + 1024);
+
+ pi2_tmp += 2048;
+ pi2_tmp_orig = pi2_tmp;
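+    /* pi2_tmp is split into three regions: 1024 shorts of even-part scratch
+     * (o_temp_ptr), 1024 shorts of odd-part scratch (temp_ptr), and the
+     * remainder (pi2_tmp_orig) for the stage-1 output itself. */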
+
+ for(i = 0; i < trans_size_stg1; i += 8)
+ {
+
+ {
+ WORD16 *pi2_tmp_src = pi2_src;
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+
+ m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ }
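+        /* m_temp_reg_70..77 and m_temp_reg_80..87 now hold every second
+         * input row (the pointer advances by src_strd << 1 per load), i.e.
+         * the 16 even rows of the current 8-column slice. */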
+
+ if(zero_last28_rows_stg1)
+ {
+ /* eeo */
+ /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
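+                /* Rows 4..31 are zero, so the even-even part sees only row 0:
+                 * eee collapses to 64 * src[0], replicated into all four ee
+                 * accumulators; row 2 feeds the eo terms below. */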
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eeeo[0]= m_temp_reg_20 */
+                /* eeeo[1]= m_temp_reg_21 */
+                /* eeee[0]= m_temp_reg_22 */
+                /* eeee[1]= m_temp_reg_23 */
+
+ /* eee[0] = eeee[0] + eeeo[0]; */
+ m_temp_reg_40 = m_temp_reg_14;
+
+ /* eee[3] = eeee[0] - eeeo[0]; */
+ m_temp_reg_43 = m_temp_reg_14;
+
+ /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14;
+
+ /* eee[1] = eeee[1] + eeeo[1];*/
+                m_temp_reg_41 = m_temp_reg_14;
+
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* eeeo[0]= m_temp_reg_20 */
+                /* eeeo[1]= m_temp_reg_21 */
+                /* eeee[0]= m_temp_reg_22 */
+                /* eeee[1]= m_temp_reg_23 */
+
+ /* eee[0] = eeee[0] + eeeo[0]; */
+ m_temp_reg_44 = m_temp_reg_14;
+
+ /* eee[3] = eeee[0] - eeeo[0]; */
+ m_temp_reg_47 = m_temp_reg_14;
+
+ /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14;
+
+ /* eee[1] = eeee[1] + eeeo[1];*/
+                m_temp_reg_45 = m_temp_reg_14;
+
+
+ }
+ /* eo */
+ {
+ WORD16 *pi2_scratch = o_temp_ptr;
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /**************************************************************************/
+
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo4[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /***********************************************************************/
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo5[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo6[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ }
+ }
+ else if(zero_last24_rows_stg1)
+ {
+ {
+                /* eeo */
+                /* Only rows 0-7 of the stage-1 input are non-zero in this path:  */
+                /* eee takes only row 0, eeo only row 4, eo only rows 2 and 6.    */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* Rows 8, 16 and 24 are zero here, so eeeo[] = 0 and             */
+                /* eeee[0] = eeee[1] = 64 * row0; all four eee[] values           */
+                /* equal m_temp_reg_14.                                           */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = m_temp_reg_14;
+
+ /* for row 4 to 7 */
+
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* As above: eee[] for columns 4-7 all equal m_temp_reg_14. */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = m_temp_reg_14;
+
+
+ // eeo[]
+ /* for(k = 0; k < 4; k++) */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+ m_temp_reg_33 = _mm_setzero_si128();
+
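+                /* ee[k] = eee[k] + eeo[k] and ee[7 - k] = eee[k] - eeo[k] for    */
+                /* k = 0..3; eeo[k] is a single madd since only row 4 is live.    */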
+ /* eeo */
+ {
+ /* eeo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_90 = m_temp_reg_34;
+ m_temp_reg_97 = m_temp_reg_35;
+ }
+ /* eeo0[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ m_temp_reg_91 = m_temp_reg_34;
+ m_temp_reg_96 = m_temp_reg_35;
+
+ }
+
+ /* eeo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+                        /* ee[1][0-3] held in m_temp_reg_92 */
+                        /* ee[6][0-3] held in m_temp_reg_95 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ m_temp_reg_92 = m_temp_reg_34;
+ m_temp_reg_95 = m_temp_reg_35;
+
+ }
+
+                    /* eeo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+                        /* ee[1][4-7] held in m_temp_reg_93 */
+                        /* ee[6][4-7] held in m_temp_reg_94 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ m_temp_reg_93 = m_temp_reg_34;
+ m_temp_reg_94 = m_temp_reg_35;
+
+
+ }
+
+                    /* eeo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+                        /* ee[2][0-3] held in temp1 */
+                        /* ee[5][0-3] held in temp7 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ temp1 = m_temp_reg_34;
+ temp7 = m_temp_reg_35;
+
+ }
+
+                    /* eeo2[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+                        /* ee[2][4-7] held in temp2 */
+                        /* ee[5][4-7] held in temp6 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ temp2 = m_temp_reg_34;
+ temp6 = m_temp_reg_35;
+
+ }
+
+                    /* eeo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                        /* ee[3][0-3] held in temp3 */
+                        /* ee[4][0-3] held in temp5 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ temp3 = m_temp_reg_34;
+ temp5 = m_temp_reg_35;
+
+ }
+
+
+                    /* eeo3[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                        /* ee[3][4-7] held in temp4 */
+                        /* ee[4][4-7] held in temp8 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+ temp4 = m_temp_reg_34;
+ temp8 = m_temp_reg_35;
+
+
+ }
+                    /* All values of the ee[] array are now held in registers     */
+                    /* (m_temp_reg_90-97 and temp1-temp8).                        */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+
+ }
+ }
+ /* eo */
+ {
+
+ WORD16 *pi2_scratch = o_temp_ptr;
+
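+                /* e[k] = ee[k] + eo[k] and e[15 - k] = ee[k] - eo[k]; each pair  */
+                /* is streamed to the scratch buffer as two 4x32-bit stores.      */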
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+
+ /* eo2[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /**************************************************************************/
+
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+
+ /* eo3[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo3[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo4[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /***********************************************************************/
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+
+ /* eo5[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo5[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo6[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+
+ /* eo7[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ }
+
+ }
+ else
+ {
+
+ {
+ /* eeo */
+                /* eeeo[0]/eeeo[1] computed into m_temp_reg_20/m_temp_reg_22, */
+                /* eeee[0]/eeee[1] into m_temp_reg_21/m_temp_reg_23           */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
+
+
+                /* eeeo[0] = m_temp_reg_20 */
+                /* eeeo[1] = m_temp_reg_22 */
+                /* eeee[0] = m_temp_reg_21 */
+                /* eeee[1] = m_temp_reg_23 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+ /* for row 4 to 7 */
+
+ m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+ m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+
+                /* Interleaving row 8 and row 24 */
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
+
+
+                /* eeeo[0] = m_temp_reg_20 */
+                /* eeeo[1] = m_temp_reg_22 */
+                /* eeee[0] = m_temp_reg_21 */
+                /* eeee[1] = m_temp_reg_23 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+
+ // eeo[]
+ /* for(k = 0; k < 4; k++) */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
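+                /* eeo[k] takes two madds here: rows 4/12 against coeff1 and      */
+                /* rows 20/28 against coeff2.                                     */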
+ /* eeo */
+ {
+ /* eeo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ }
+
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+ m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+ m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+ m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+
+ /* eeo0[4-7] */
+ {
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ }
+
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
+
+ /* eeo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+ m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+ /* eeo1[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+ }
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
+
+ /* eeo2[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        /* ee[2][0-3] held in temp1 */
+                        /* ee[5][0-3] held in temp7 */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+ /* eeo2[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        /* ee[2][4-7] held in temp2 */
+                        /* ee[5][4-7] held in temp6 */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
+
+ /* eeo3[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+                        /* ee[3][0-3] held in temp3 */
+                        /* ee[4][0-3] held in temp5 */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+ }
+
+ /* eeo3[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+                        /* ee[3][4-7] held in temp4 */
+                        /* ee[4][4-7] held in temp8 */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+ temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+
+                    /* All values of the ee[] array are now held in registers     */
+                    /* (m_temp_reg_90-97 and temp1-temp8).                        */
+
+ /* for(k = 0; k < 8; k++) */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+ }
+ }
+ /* eo */
+ {
+
+ WORD16 *pi2_scratch = o_temp_ptr;
+
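+                /* Full path: each eo[k] accumulates four madd pairs over the     */
+                /* eight even rows 2, 6, ..., 30, then enters the e[] butterfly.  */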
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+ m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+ m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+ m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+ m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+ m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+ m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo0[4-7] */
+ {
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+ m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 -43
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
+
+ /* eo1[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo2[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /**************************************************************************/
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
+
+ /* eo4[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo4[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /***********************************************************************/
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo5[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
+
+ /* eo6[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo6[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
+
+ /* eo7[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ }
+
+ }
+ /* All e[] are done */
+ /****************************/
+
+ {
+
+ WORD16 *pi2_tmp_src = pi2_src + src_strd;
+
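+            /* Load the odd input rows at a two-row stride:                   */
+            /* m_temp_reg_70-77 get rows 1-15, m_temp_reg_80-87 rows 17-31.   */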
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+
+ m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
+ }
+
+ if(zero_last28_rows_stg1)
+ {
+ /* o & stage 1 out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = o_temp_ptr;
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = trans_size;
+
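+            /* o-path butterfly: (e + o) and (e - o) are rounded with         */
+            /* 1 << (i4_shift - 1), shifted right by i4_shift, then packed    */
+            /* to 16 bits with saturation before the store.                   */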
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ }
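+                /* Second pass (j == 1): shift each row register right by 8    */
+                /* bytes so its upper four 16-bit coefficients are processed.  */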
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+ /* o1[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+ /* o2[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+ /* o5[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+ /* o7[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
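+                /* o8-o15 read the mirrored halves of the e[] butterfly: from  */
+                /* here the scratch pointers step back down by in_stride and   */
+                /* out_stride.                                                 */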
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ }
+ else if(zero_last24_rows_stg1)
+ {
+ /* o & stage 1 out */
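+ /* Only the first 8 stage-1 rows are non-zero on this path, so each odd */
+ /* output needs just two madd terms: rows 1/3 (m_temp_reg_10) and rows  */
+ /* 5/7 (m_temp_reg_11).                                                 */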
+ {
+ WORD32 j;
+
+ WORD16 *pi2_src_scratch = o_temp_ptr;
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = (trans_size << 1);
+
+ WORD32 in_stride = trans_size;
+
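+ /* Two passes over the interleaved rows: pass j == 1 shifts the upper */
+ /* 64 bits of each row register down so that columns 4-7 are processed */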
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+ }
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+ /* o0[0-3] */
+ {
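+ /* odd dot product: madd multiplies the interleaved 16-bit rows by the */
+ /* packed coefficient pairs and sums adjacent products into 32-bit lanes */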
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
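+ /* e -/+ o butterfly: combine the even-part partial sums saved in */
+ /* scratch with the odd dot product                               */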
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
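+ /* broadcast the loop-invariant rounding constant into all four lanes */
+ /* (equivalent to _mm_set1_epi32(1 << (i4_shift - 1)))                */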
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
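+ /* aligned store: pi2_dst_scratch is assumed 16-byte aligned */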
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+ /* o8[0-3] */
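+ /* from o8 onwards the scratch pointers walk backwards through the */
+ /* buffers, mirroring the forward pass                             */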
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ }
+ else
+ {
+ /* o & stage 1 out */
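+ /* General case: all 16 odd rows may be non-zero, so each output       */
+ /* accumulates eight madd terms (rows 1..15 in m_temp_reg_10..13 and   */
+ /* rows 17..31 in temp1..temp4).                                       */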
+ {
+ WORD32 j;
+
+ WORD16 *pi2_src_scratch = o_temp_ptr;
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = (trans_size << 1);
+
+ WORD32 in_stride = trans_size;
+
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+ m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+ m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+ m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+ m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+ m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+ m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+ m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+ m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+ m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+ m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+ m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+ m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+ temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+ temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+ temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+ temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
+
+ /* o0[0-3] */
+ {
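+ /* sum the eight partial dot products pairwise into m_temp_reg_20 */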
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ }
+ /* Transpose */
+ {
+ WORD16 *pi2_src_scratch = temp_ptr;
+ WORD16 *pi2_dst_scratch = pi2_tmp;
+ WORD32 in_stride = (trans_size << 1);
+
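+ /* Transpose the stage-1 outputs from the scratch buffer into 32-wide
+ rows of pi2_tmp: each iteration loads 16 rows of 8 coefficients (the
+ first 8 walking forward by in_stride, the next 8 walking back), then
+ interleaves at 16-, 32- and 64-bit granularity. The swapped operand
+ order in the unpackhi steps restores ascending column order for the
+ (e - o) halves of the butterfly outputs, which were produced in
+ descending order. */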
+ for(j = 0; j < 2; j++)
+ {
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+ m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+ m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+ m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+ m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+ m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+ m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+ m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+ m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+ /****************/
+
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+ m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+ m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+ m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+ m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+ m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+ m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+ m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+ m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+ /******************/
+
+ m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
+ m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
+
+ m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
+ m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
+
+ m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
+ m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
+
+ m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
+ m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
+
+ m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
+ m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
+
+ m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
+ m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
+
+ m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
+ m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
+
+ m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
+ m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
+
+ pi2_dst_scratch += 4 * trans_size;
+ }
+ }
+ pi2_src += 8;
+// pi2_dequant_coeff +=8;
+ pi2_tmp += 8 * trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
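+ /* If fewer than 32 rows survived stage 1 (trans_size_stg1 < 32), clear
+ the remaining rows of the intermediate buffer so stage 2 can process
+ all 32 rows uniformly. */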
+ if(trans_size_stg1 != TRANS_SIZE_32)
+ {
+ m_temp_reg_10 = _mm_setzero_si128();
+
+ for(i = trans_size_stg1; i < 32; i += 8)
+ {
+ WORD16 *pi2_dst_scratch = pi2_tmp;
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
+ _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
+
+ pi2_tmp += 8 * trans_size;
+ }
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+
+
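+ /* Stage 2 processes the transposed intermediate data four columns at a
+ time with the stage-2 down-shift. Three paths follow, chosen by how
+ many trailing rows are known to be zero: only rows 0-3 non-zero, only
+ rows 0-7 non-zero, or the full 32-row computation. */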
+ for(j = 0; j < trans_size; j += 4)
+ {
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ /* Exploit the symmetry of the transform matrix to minimize the number of multiplications */
+ if(zero_last28_rows_stg2)
+ {
+ {
+
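+ /* Rows 4..31 are zero, so the even part collapses: every ee[k] reduces
+ to 64 * row0, and each eo[k] comes from row 2 alone (a single madd
+ against the packed coefficient pair). */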
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+ m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+ /* eo1[0-3] */
+ {
+ m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ }
+ /* eo2[0-3] */
+ {
+ m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+ }
+ /* eo4[0-3] */
+ {
+ m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ }
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+ }
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+ }
+ /* eo7[0-3] */
+ {
+ m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+ }
+ }
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ /* e[]*/
+
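+ /* Butterfly: e[k] = ee[k] + eo[k], e[15 - k] = ee[k] - eo[k] for
+ k = 0..7. In this path m_temp_reg_14 and m_temp_reg_16 hold the same
+ value (64 * row0); both names are kept to mirror the general path. */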
+ temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* e[0] */
+ temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* e[15] */
+
+ temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* e[1] */
+ temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* e[14] */
+
+ temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* e[2] */
+ temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* e[13] */
+
+ temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* e[3] */
+ temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* e[12] */
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* e[4] */
+ m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* e[11] */
+
+ m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* e[5] */
+ m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* e[10] */
+
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* e[6] */
+ m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* e[9] */
+
+ m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* e[7] */
+ m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* e[8] */
+
+ /*o[k]*/
+ {
+
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = 8;
+
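+ /* With rows 4..31 zero, each odd output o[k] needs only rows 1 and 3,
+ interleaved into m_temp_reg_10: one madd per output. The final
+ outputs e[k] +/- o[k] are rounded, shifted, packed to 16 bits and
+ ping-ponged through the scratch buffer. */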
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+
+ }
+ else if(zero_last24_rows_stg2)
+ {
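+ /* Rows 8..31 are zero: eo[k] needs only rows 2 and 6, eeo[k] only
+ row 4, eee only row 0 (times 64), and the odd part only rows
+ 1, 3, 5 and 7. */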
+ /* eo */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+
+ m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+ m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+
+ /* eo3[0-3] */
+ {
+
+ m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+ /* eo6[0-3] */
+ {
+ m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+ /* eo7[0-3] */
+ {
+ m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ }
+
+ /* eeo */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
+
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+ /* eeo0[0-3] */
+ {
+ temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ /* eeo1[0-3] */
+ {
+ temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ }
+
+ /* eeo2[0-3] */
+ {
+ temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+ }
+
+
+ /* eeo3[0-3] */
+ {
+ temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ }
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+ //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */
+ m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */
+
+ m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */
+ m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */
+
+ m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */
+ m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */
+
+ m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */
+ m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4); /* ee[4] */
+
+ /* e[]*/
+
+ temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* e[0] */
+ temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* e[15] */
+
+ temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* e[1] */
+ temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* e[14] */
+
+ temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* e[2] */
+ temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* e[13] */
+
+ temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* e[3] */
+ temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* e[12] */
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* e[4] */
+ m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* e[11] */
+
+ m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* e[5] */
+ m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* e[10] */
+
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* e[6] */
+ m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* e[9] */
+
+ m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* e[7] */
+ m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* e[8] */
+
+ /*o[k] */
+ {
+
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = 8;
+
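+ /* o[k] from rows 1, 3, 5, 7: two madds per output over the two
+ interleaved row pairs. For some outputs the second madd is
+ subtracted rather than added, matching the sign with which the
+ packed coefficient table stores those entries. */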
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ else
+ {
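+ /* This branch (presumably the path taken when more coefficient rows are */
+ /* non-zero) computes the 32-point inverse transform by successive even/odd */
+ /* decomposition: eo[0..7] from rows 2,6,...,30, eeo[0..3] from rows */
+ /* 4,12,20,28, eee[0..3] from rows 0,8,16,24, then ee[0..7], e[0..15], and */
+ /* o[0..15] from the odd rows 1,3,...,31. */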
+ /* eo */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+
+ m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+ m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+ m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
+ m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
+ m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
+ m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
+ m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
+
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
+
+ /* eo7[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+
+ }
+
+ }
+
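+ /* m_temp_reg_90..97 now hold eo[0]..eo[7], four columns per register. */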
+ /* eeo */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
+ m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
+ m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
+
+ /* eeo0[0-3] */
+ {
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
+
+ /* eeo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
+
+ /* eeo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
+
+ /* eeo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+
+ }
+
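+ /* temp1..temp4 now hold eeo[0]..eeo[3]; the 4-point eeee/eeeo butterfly */
+ /* on rows 0, 8, 16 and 24 follows. */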
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
+ m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+ m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
+
+/* eeeo[0] = m_temp_reg_20 */
+/* eeeo[1] = m_temp_reg_22 */
+/* eeee[0] = m_temp_reg_21 */
+/* eeee[1] = m_temp_reg_23 */
+
+ /* eee[0] = eeee[0] + eeeo[0]; */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eee[0] */
+
+ /* eee[3] = eeee[0] - eeeo[0]; */
+ m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eee[3] */
+
+ /* eee[2] = eeee[1] - eeeo[1]; */
+ m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eee[2] */
+
+ /* eee[1] = eeee[1] + eeeo[1]; */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eee[1] */
+
+ m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */
+ m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */
+
+ m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */
+ m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */
+
+ m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */
+ m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */
+
+ m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */
+ m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */
+
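+ /* m_temp_reg_70..77 now hold ee[]: the even-numbered registers carry */
+ /* ee[0]..ee[3] and the odd-numbered registers the mirrored ee[7]..ee[4]. */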
+/* e[] */
+
+ temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* e[0] */
+ temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* e[15] */
+
+ temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* e[1] */
+ temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* e[14] */
+
+ temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* e[2] */
+ temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* e[13] */
+
+ temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* e[3] */
+ temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* e[12] */
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* e[4] */
+ m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* e[11] */
+
+ m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* e[5] */
+ m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* e[10] */
+
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* e[6] */
+ m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* e[9] */
+
+ m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* e[7] */
+ m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* e[8] */
+
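+ /* Each o-stage k below combines the odd part o[k] with the even part e[k] */
+ /* (held in temp1..temp8 and m_temp_reg_90..97) to form the output pair */
+ /* (k, 31-k), which is rounded, shifted and packed to 16 bits. */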
+/* o[k] */
+ {
+
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = 8;
+
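+ /* Each stage stores four (e+o) and four (e-o) results packed to 16 bits */
+ /* in one 128-bit scratch row; out_stride advances one such row. */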
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
+
+ m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
+ m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
+ m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
+ m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
+ m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
+ m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
+ m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
+ m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+ m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+ m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+ m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
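+ /* With adjacent odd rows interleaved, _mm_madd_epi16 against a packed */
+ /* coefficient pair yields coef_a*row_a + coef_b*row_b in each 32-bit lane. */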
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
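+ /* Broadcast the rounding constant (1 << (i4_shift - 1)) to all four */
+ /* 32-bit lanes, then round and arithmetic-shift both halves by i4_shift. */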
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
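+ /* o1..o15 repeat the o0 pattern with the next eight packed coefficient */
+ /* rows and the corresponding e[k] term. */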
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+ m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+ m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+ m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+ m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+
+ /* Transpose */
+ {
+
+ WORD16 *pi2_src_scratch = temp_ptr;
+ WORD32 out_stride = dst_strd;
+ WORD32 in_stride = 8;
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+
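+ /* Transpose: rebuild four 32-wide output rows from the sixteen 8-wide */
+ /* scratch rows via 16-, 32- and 64-bit unpacks; the swapped operands on */
+ /* the unpackhi steps put the mirrored (31-k) column halves back into */
+ /* ascending order. */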
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+ m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+ m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+ m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+ m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+ m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+ m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+ m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+ m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+ m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+ m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+ m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+ m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+ m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+ m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+ m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+ m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+
+ m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7
+ m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7
+
+ m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31
+ m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31
+
+ m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15
+ m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15
+
+ m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23
+ m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23
+
+ m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7
+ m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7
+
+ m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31
+ m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31
+
+ m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15
+ m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15
+
+ m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23
+ m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+ }
+ pi2_tmp += 4;
+ }
+}
+
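+/* Editor's note: a minimal scalar sketch of the rounding/narrowing done in
+ * the store stage above -- add the rounding factor (1 << (shift - 1)),
+ * arithmetic-shift right by shift (_mm_sra_epi32), then saturate to the
+ * WORD16 range (_mm_packs_epi32). Illustrative only, not part of the
+ * original sources. */
+static WORD16 round_shift_sat16_sketch(WORD32 i4_x, WORD32 i4_shift)
+{
+    WORD32 i4_y = (i4_x + (1 << (i4_shift - 1))) >> i4_shift;
+    if(i4_y > 32767)
+        i4_y = 32767;
+    if(i4_y < -32768)
+        i4_y = -32768;
+    return (WORD16)i4_y;
+}
+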
diff --git a/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c b/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..1de4253
--- /dev/null
+++ b/common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
@@ -0,0 +1,486 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_intra_pred_filters_sse42_intr.c
+*
+* @brief
+* Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* ihevc_intra_pred_chroma_planar_sse42()
+*
+* ihevc_intra_pred_chroma_dc_sse42()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+
+#define GET_BIT(y,x) (((y) & (1 << (x))) != 0)
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Planar Intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.4 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+
+ __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ switch(nt)
+ {
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+
+ /* Planar filtering */
+
+/* setting values in registers */
+
+// pu1_ref[2*(two_nt - 1 - row)]
+// pu1_ref[2 * (three_nt + 1)]
+// pu1_ref[2 * (two_nt + 1) + col]
+// pu1_ref[2 * (nt - 1)]
+
+ const_temp_4x32b = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
+ pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
+ pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
+
+ const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
+ pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
+
+ const_temp4_4x32b = _mm_set1_epi16(nt - 1);
+ const_temp6_4x32b = _mm_set1_epi16(nt);
+ const_temp7_4x32b = _mm_set1_epi16(4);
+
+ zero_8x16b = _mm_set1_epi32(0);
+
+ if(nt % 4 == 0)
+ {
+ const_temp7_4x32b = _mm_set1_epi16(4);
+
+ for(row = 0; row < nt; row++)
+ {
+ __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
+ __m128i res_temp3_8x16b;
+
+ const_temp2_4x32b = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
+ pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
+ pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
+
+ const_temp3_4x32b = _mm_set1_epi16((row + 1));
+ row_8x16b = _mm_set1_epi16((nt - 1 - row));
+
+ const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
+ col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
+
+ const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
+
+ /*(row + 1) * pu1_ref[nt - 1]*/
+ res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b);
+
+ /*(row + 1) * pu1_ref[nt - 1] + nt)*/
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+
+ for(col = 0; col < 2 * nt; col += 8)
+ {
+ __m128i src_temp_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
+
+ src_temp_8x16b = _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/
+
+ /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
+ res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b);
+
+ /*(col + 1) * pu1_ref[three_nt + 1]*/
+ res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b);
+
+ /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
+ res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b);
+
+ res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+ res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
+ res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
+
+ const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
+ col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
+ } /* inner loop ends here */
+ }
+ }
+}
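+
+/* Editor's note: a scalar sketch of the planar equation the SIMD loop above
+ * vectorizes, for one U or V sample (uv = 0 or 1), assuming the interleaved
+ * reference layout used by this file (bottom-left pair at 2*(nt-1), left
+ * column pair at 2*(two_nt-1-row), top row pairs from 2*(two_nt+1),
+ * top-right pair at 2*(three_nt+1)). Illustrative only. */
+static UWORD8 chroma_planar_sample_sketch(UWORD8 *pu1_ref, WORD32 nt,
+                                          WORD32 log2nt, WORD32 row,
+                                          WORD32 col, WORD32 uv)
+{
+    WORD32 two_nt = 2 * nt;
+    WORD32 three_nt = 3 * nt;
+    WORD32 sum = (nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + 2 * col + uv]
+               + (row + 1)      * pu1_ref[2 * (nt - 1) + uv]
+               + (nt - 1 - col) * pu1_ref[2 * (two_nt - 1 - row) + uv]
+               + (col + 1)      * pu1_ref[2 * (three_nt + 1) + uv]
+               + nt;
+    return (UWORD8)(sum >> (log2nt + 1));
+}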
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for DC mode with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.5 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size (Chroma)
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 acc_dc_u, acc_dc_v;
+ WORD32 dc_val_u, dc_val_v;
+ WORD32 row;
+ WORD32 log2nt = 5;
+ __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
+ __m128i src_temp7, src_temp8, src_temp9, src_temp10;
+ __m128i m_zero = _mm_set1_epi32(0);
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+
+ acc_dc_u = 0;
+ acc_dc_v = 0;
+
+ /* Calculate DC value for the transform block */
+
+ m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
+
+ if(nt == 16)
+ {
+ __m128i temp_sad;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
+
+ src_temp5 = _mm_cvtepu8_epi16(src_temp3);
+ src_temp6 = _mm_cvtepu8_epi16(src_temp4);
+ src_temp9 = _mm_cvtepu8_epi16(src_temp7);
+ src_temp10 = _mm_cvtepu8_epi16(src_temp8);
+
+ src_temp3 = _mm_srli_si128(src_temp3, 8);
+ src_temp4 = _mm_srli_si128(src_temp4, 8);
+ src_temp7 = _mm_srli_si128(src_temp7, 8);
+ src_temp8 = _mm_srli_si128(src_temp8, 8);
+
+ src_temp3 = _mm_cvtepu8_epi16(src_temp3);
+ src_temp4 = _mm_cvtepu8_epi16(src_temp4);
+ src_temp7 = _mm_cvtepu8_epi16(src_temp7);
+ src_temp8 = _mm_cvtepu8_epi16(src_temp8);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+ src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+ src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ src_temp4 = _mm_cvtepi16_epi32(src_temp4);
+ temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+ acc_dc_u = _mm_cvtsi128_si32(src_temp4);
+ acc_dc_v = _mm_cvtsi128_si32(temp_sad);
+ }
+
+ else if(nt == 8)
+ {
+ __m128i temp_sad;
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+
+ src_temp5 = _mm_cvtepu8_epi16(src_temp3);
+ src_temp6 = _mm_cvtepu8_epi16(src_temp4);
+
+ src_temp3 = _mm_srli_si128(src_temp3, 8);
+ src_temp4 = _mm_srli_si128(src_temp4, 8);
+
+ src_temp3 = _mm_cvtepu8_epi16(src_temp3);
+ src_temp4 = _mm_cvtepu8_epi16(src_temp4);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ src_temp4 = _mm_cvtepi16_epi32(src_temp4);
+ temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+ acc_dc_u = _mm_cvtsi128_si32(src_temp4);
+ acc_dc_v = _mm_cvtsi128_si32(temp_sad);
+ }
+
+ else if(nt == 4)
+ {
+ __m128i temp_sad;
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+
+ src_temp5 = _mm_cvtepu8_epi16(src_temp3);
+ src_temp4 = _mm_srli_si128(src_temp3, 8);
+ src_temp4 = _mm_cvtepu8_epi16(src_temp4);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+
+ src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ src_temp4 = _mm_cvtepi16_epi32(src_temp4);
+ temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+ acc_dc_u = _mm_cvtsi128_si32(src_temp4);
+ acc_dc_v = _mm_cvtsi128_si32(temp_sad);
+ }
+
+
+ acc_dc_u += pu1_ref[6 * nt];
+ acc_dc_v += pu1_ref[6 * nt + 1];
+
+ acc_dc_u -= pu1_ref[4 * nt];
+ acc_dc_v -= pu1_ref[4 * nt + 1];
+
+ dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
+ dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
+
+ dc_val_u = dc_val_u | (dc_val_v << 8);
+
+ /* Fill the remaining rows with DC value*/
+
+ if(nt == 4)
+ {
+ src_temp1 = _mm_set1_epi16(dc_val_u);
+
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+ }
+ else if(nt == 8)
+ {
+ src_temp1 = _mm_set1_epi16(dc_val_u);
+
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+ }
+
+ else /* nt == 16 */
+ {
+
+ src_temp1 = _mm_set1_epi16(dc_val_u);
+
+ for(row = 0; row < nt; row += 8)
+ {
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
+
+ pu1_dst += 8 * dst_strd;
+ }
+
+
+ }
+
+}
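+
+/* Editor's note: scalar sketch of the DC accumulation above, assuming the
+ * interleaved reference layout (nt left pairs from byte 2*nt, nt top pairs
+ * from byte 4*nt + 2, matching the +pu1_ref[6*nt] / -pu1_ref[4*nt]
+ * adjustment after the vector sums). Illustrative only. */
+static void chroma_dc_accumulate_sketch(UWORD8 *pu1_ref, WORD32 nt,
+                                        WORD32 log2nt,
+                                        WORD32 *pi4_dc_u, WORD32 *pi4_dc_v)
+{
+    WORD32 i, acc_u = 0, acc_v = 0;
+    for(i = 0; i < nt; i++)
+    {
+        acc_u += pu1_ref[2 * nt + 2 * i]     + pu1_ref[4 * nt + 2 + 2 * i];
+        acc_v += pu1_ref[2 * nt + 2 * i + 1] + pu1_ref[4 * nt + 2 + 2 * i + 1];
+    }
+    *pi4_dc_u = (acc_u + nt) >> (log2nt + 1);   /* average of 2*nt samples */
+    *pi4_dc_v = (acc_v + nt) >> (log2nt + 1);
+}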
diff --git a/common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c b/common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
new file mode 100644
index 0000000..6a3883e
--- /dev/null
+++ b/common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
@@ -0,0 +1,2633 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_chroma_intra_pred_filters_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* ihevc_intra_pred_chroma_planar_ssse3()
+*
+* ihevc_intra_pred_chroma_dc_ssse3()
+*
+* ihevc_intra_pred_chroma_horz_ssse3()
+*
+* ihevc_intra_pred_chroma_ver_ssse3()
+*
+* ihevc_intra_pred_chroma_mode2_ssse3()
+*
+* ihevc_intra_pred_chroma_mode_18_34_ssse3()
+*
+* ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
+*
+* ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
+*
+* ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
+*
+* ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
+*
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_intra_pred.h"
+
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include <immintrin.h>
+
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+#define T16C_4NT 64
+#define T8C_4NT 32
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+
+#define GET_BIT(y,x) (((y) & (1 << (x))) != 0)
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Planar Intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.4 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, col;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+
+ __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
+ UNUSED(src_strd);
+ UNUSED(mode);
+ switch(nt)
+ {
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+
+ /* Planar filtering */
+
+/* setting values in registers */
+
+// pu1_ref[2*(two_nt - 1 - row)]
+// pu1_ref[2 * (three_nt + 1)]
+// pu1_ref[2 * (two_nt + 1) + col]
+// pu1_ref[2 * (nt - 1)]
+
+ const_temp_4x32b = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
+ pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
+ pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
+
+ const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
+ pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
+
+ const_temp4_4x32b = _mm_set1_epi16(nt - 1);
+ const_temp6_4x32b = _mm_set1_epi16(nt);
+ const_temp7_4x32b = _mm_set1_epi16(4);
+
+ zero_8x16b = _mm_set1_epi32(0);
+
+
+ if(nt % 4 == 0)
+ {
+ const_temp7_4x32b = _mm_set1_epi16(4);
+
+ for(row = 0; row < nt; row++)
+ {
+ __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
+ __m128i res_temp3_8x16b;
+
+ const_temp2_4x32b = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
+ pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
+ pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
+
+ const_temp3_4x32b = _mm_set1_epi16((row + 1));
+ row_8x16b = _mm_set1_epi16((nt - 1 - row));
+
+ const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
+ col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
+
+ const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
+
+ /*(row + 1) * pu1_ref[nt - 1]*/
+ res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b);
+
+ /*(row + 1) * pu1_ref[nt - 1] + nt)*/
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+
+ for(col = 0; col < 2 * nt; col += 8)
+ {
+ __m128i src_temp_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
+
+ //src_temp_8x16b = _mm_cvtepu8_epi16 (src_temp_8x16b); /* row=0*/
+ src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b);
+
+ /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
+ res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b);
+
+ /*(col + 1) * pu1_ref[three_nt + 1]*/
+ res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b);
+
+ /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
+ res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b);
+
+ res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+ res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
+ res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
+
+ const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
+ col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
+ } /* inner loop ends here */
+ }
+ }
+}
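+
+/* Editor's note: the only functional difference from the SSE4.2 planar
+ * kernel is the 8-to-16 bit widening: _mm_unpacklo_epi8(x, zero) here
+ * zero-extends exactly like the SSE4.1 _mm_cvtepu8_epi16(x) used there,
+ * as in this scalar sketch (illustrative only). */
+static void widen_u8_to_s16_sketch(const UWORD8 *pu1_src, WORD16 *pi2_dst)
+{
+    WORD32 i;
+    for(i = 0; i < 8; i++)
+        pi2_dst[i] = (WORD16)pu1_src[i];   /* zero-extend the low 8 bytes */
+}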
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for DC mode with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.5 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size (Chroma)
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 acc_dc_u, acc_dc_v;
+ WORD32 dc_val_u, dc_val_v;
+ WORD32 row;
+ WORD32 log2nt = 5;
+ __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
+ __m128i src_temp7, src_temp8, src_temp9, src_temp10;
+ __m128i m_zero = _mm_set1_epi32(0);
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+
+ acc_dc_u = 0;
+ acc_dc_v = 0;
+
+ /* Calculate DC value for the transform block */
+
+ m_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
+
+ if(nt == 16)
+ {
+ __m128i temp_sad, sign_8x16b;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
+
+ src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
+ src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
+ src_temp9 = _mm_unpacklo_epi8(src_temp7, m_zero);
+ src_temp10 = _mm_unpacklo_epi8(src_temp8, m_zero);
+
+ src_temp3 = _mm_srli_si128(src_temp3, 8);
+ src_temp4 = _mm_srli_si128(src_temp4, 8);
+ src_temp7 = _mm_srli_si128(src_temp7, 8);
+ src_temp8 = _mm_srli_si128(src_temp8, 8);
+
+ src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
+ src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
+ src_temp7 = _mm_unpacklo_epi8(src_temp7, m_zero);
+ src_temp8 = _mm_unpacklo_epi8(src_temp8, m_zero);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+ src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+ src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
+ src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
+
+ temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+ acc_dc_u = _mm_cvtsi128_si32(src_temp4);
+ acc_dc_v = _mm_cvtsi128_si32(temp_sad);
+ }
+
+ else if(nt == 8)
+ {
+ __m128i temp_sad, sign_8x16b;
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
+
+ src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
+ src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
+
+ src_temp3 = _mm_srli_si128(src_temp3, 8);
+ src_temp4 = _mm_srli_si128(src_temp4, 8);
+
+ src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
+ src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
+ src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
+
+ temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+ acc_dc_u = _mm_cvtsi128_si32(src_temp4);
+ acc_dc_v = _mm_cvtsi128_si32(temp_sad);
+ }
+
+ else if(nt == 4)
+ {
+ __m128i temp_sad, sign_8x16b;
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
+
+ src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
+ src_temp4 = _mm_srli_si128(src_temp3, 8);
+
+ src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+
+ src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
+ src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
+
+ temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
+ acc_dc_u = _mm_cvtsi128_si32(src_temp4);
+ acc_dc_v = _mm_cvtsi128_si32(temp_sad);
+ }
+
+
+ acc_dc_u += pu1_ref[6 * nt];
+ acc_dc_v += pu1_ref[6 * nt + 1];
+
+ acc_dc_u -= pu1_ref[4 * nt];
+ acc_dc_v -= pu1_ref[4 * nt + 1];
+
+ dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
+ dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
+
+ dc_val_u = dc_val_u | (dc_val_v << 8);
+
+ /* Fill the remaining rows with DC value*/
+
+ if(nt == 4)
+ {
+ src_temp1 = _mm_set1_epi16(dc_val_u);
+
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+ }
+ else if(nt == 8)
+ {
+ src_temp1 = _mm_set1_epi16(dc_val_u);
+
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+ }
+
+ else /* nt == 16 */
+ {
+ src_temp1 = _mm_set1_epi16(dc_val_u);
+
+ for(row = 0; row < nt; row += 8)
+ {
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
+
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+
+}
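+
+/* Editor's note: without SSE4.1's _mm_cvtepi16_epi32, the 16-to-32 bit sign
+ * extension above is built from _mm_cmpgt_epi16 (sign mask) followed by
+ * _mm_unpacklo_epi16 (interleave with the mask). Scalar sketch of the same
+ * idea, illustrative only. */
+static WORD32 sign_extend_s16_sketch(WORD16 i2_x)
+{
+    WORD32 i4_hi = (i2_x < 0) ? -1 : 0;   /* the compare result */
+    return (WORD32)(((UWORD32)(UWORD16)i2_x) | ((UWORD32)i4_hi << 16));
+}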
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Horizontal intraprediction (mode 10) with reference samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
+* Refer to section 8.4.4.2.6 in the standard (Special case)
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row;
+ __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ /* Replication to next rows*/
+
+ if(nt == 8)
+ {
+ for(row = 0; row < nt; row += 4)
+ {
+ temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
+ temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
+ temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
+ temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
+ temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
+ temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
+ temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
+ temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
+
+ temp2 = _mm_unpacklo_epi8(temp1, temp2);
+ temp4 = _mm_unpacklo_epi8(temp3, temp4);
+ temp6 = _mm_unpacklo_epi8(temp5, temp6);
+ temp8 = _mm_unpacklo_epi8(temp7, temp8);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), temp8);
+
+ }
+ }
+ else if(nt == 16)
+ {
+ for(row = 0; row < nt; row += 4)
+ {
+ temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
+ temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
+
+ temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
+ temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
+
+ temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
+ temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
+
+ temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
+ temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
+
+ temp2 = _mm_unpacklo_epi8(temp1, temp2);
+ temp4 = _mm_unpacklo_epi8(temp3, temp4);
+ temp6 = _mm_unpacklo_epi8(temp5, temp6);
+ temp8 = _mm_unpacklo_epi8(temp7, temp8);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 0), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 16), temp2);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 0), temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), temp4);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 0), temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), temp6);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 0), temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), temp8);
+
+
+ }
+ }
+ else
+ {
+ temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 0]);
+ temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 0]);
+
+ temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 1]);
+ temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 1]);
+
+ temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 2]);
+ temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 2]);
+
+ temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 3]);
+ temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 3]);
+
+ temp2 = _mm_unpacklo_epi8(temp1, temp2);
+ temp4 = _mm_unpacklo_epi8(temp3, temp4);
+ temp6 = _mm_unpacklo_epi8(temp5, temp6);
+ temp8 = _mm_unpacklo_epi8(temp7, temp8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), temp8);
+ }
+}
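+
+/* Editor's note: scalar equivalent of the horizontal replication above --
+ * each output row repeats one left-neighbour U/V pair, read downwards from
+ * byte offset 4*nt - 2. Illustrative only. */
+static void chroma_horz_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                               WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+    {
+        UWORD8 u1_u = pu1_ref[(4 * nt) - 2 - 2 * row];
+        UWORD8 u1_v = pu1_ref[(4 * nt) - 1 - 2 * row];
+        for(col = 0; col < 2 * nt; col += 2)
+        {
+            pu1_dst[row * dst_strd + col]     = u1_u;
+            pu1_dst[row * dst_strd + col + 1] = u1_v;
+        }
+    }
+}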
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Vertical intraprediction (mode 26) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard (Special case)
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ __m128i src_temp1;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ /* Replication to next columns*/
+ if(nt == 8)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp1);
+
+ }
+    else if(nt == 16) /* else-if, so the nt == 8 case does not also fall into the trailing else */
+ {
+ __m128i temp1, temp2;
+
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
+ temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));
+
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
+
+ }
+ else
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
+
+
+ }
+
+}
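+
+/* Editor's note: scalar equivalent of the vertical replication above --
+ * every output row is a copy of the 2*nt top-neighbour bytes starting at
+ * byte offset 4*nt + 2. Illustrative only. */
+static void chroma_ver_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                              WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < 2 * nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[(4 * nt) + 2 + col];
+}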
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 2 (sw angle) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+
+
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8, sm2, sm3;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);
+
+ /* For the angle 45, replication is done from the corresponding angle */
+ /* intra_pred_ang = tan(angle) in q5 format */
+
+ if(nt == 4)
+ {
+ /*pu1_ref[two_nt - row - (col+1) - 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 8 - 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 8 - 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 8 - 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 8 - 2));
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm2));
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm2));
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm2));
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm2));
+
+ }
+ else if(nt == 8)
+ {
+ /*pu1_ref[two_nt - row - (col+1) - 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 16 - 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 16 - 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 16 - 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 16 - 2));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 4 - 16 - 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 5 - 16 - 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 6 - 16 - 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 7 - 16 - 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
+
+
+ }
+ else
+ {
+ for(row = 0; row < nt; row += 8)
+ {
+ for(col = 0; col < 2 * nt; col += 16)
+ { /*pu1_ref[two_nt - row - (col+1) - 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 0) - (col + 16) - 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 1) - (col + 16) - 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 2) - (col + 16) - 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 3) - (col + 16) - 2));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 4) - (col + 16) - 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 5) - (col + 16) - 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 6) - (col + 16) - 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 7) - (col + 16) - 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
+ }
+ }
+ }
+}
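+
+/* Editor's note: scalar sketch of mode 2 -- per the inline comment, output
+ * pair (row, col) is reference pair two_nt - row - (col + 1) - 1, which the
+ * SIMD path realizes with the pair-reversing pshufb masks. Illustrative
+ * only. */
+static void chroma_mode2_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col, two_nt = 2 * nt;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+        {
+            WORD32 ref_pair = two_nt - row - (col + 1) - 1;
+            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * ref_pair];
+            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * ref_pair + 1];
+        }
+}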
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with
+* reference neighboring samples location pointed by 'pu1_ref' to the TU
+* block location pointed by 'pu1_dst'.
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
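+/* Editor's note: for the pure 45-degree angles the fractional interpolation
+ * vanishes and each row is a shifted copy of the reference -- pair index
+ * two_nt + 1 + col + (row + 1) for mode 34 and two_nt + 1 + col - (row + 1)
+ * for mode 18, matching the +/- 2*(row + 1) byte offsets in the loads
+ * below. Scalar sketch, illustrative only. */
+static void chroma_mode_18_34_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                     WORD32 dst_strd, WORD32 nt, WORD32 mode)
+{
+    WORD32 row, col;
+    WORD32 step = (mode == 34) ? 1 : -1;
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+        {
+            WORD32 ref_pair = 2 * nt + 1 + col + step * (row + 1);
+            pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * ref_pair];
+            pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * ref_pair + 1];
+        }
+}
+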
+void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD32 idx = 0;
+
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+ UNUSED(src_strd);
+
+ if(mode == 34)
+ {
+ if(nt == 4)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+
+ }
+ else if(nt == 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
+
+
+ }
+ else
+ {
+ __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+ for(row = 0; row < nt; row += 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+ pu1_ref += 2 * 8;
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+ }
+ else
+ {
+ if(nt == 4)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+
+
+ }
+ else if(nt == 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
+
+
+ }
+ else
+ {
+ __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+ for(row = 0; row < nt; row += 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 0 + (4 * nt) + 2 * idx + 2));
+ src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+ pu1_ref -= 2 * 8;
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for modes 3 to 9 (positive angle, horizontal modes).
+* Predicts the TU block pointed to by 'pu1_dst' from the neighboring
+* reference samples pointed to by 'pu1_ref'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+
+ WORD32 intra_pred_ang;
+
+ __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
+ UNUSED(src_strd);
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain each destination sample    */
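+    /* A scalar sketch of the two-tap filter that the SIMD code below
+     * vectorizes (illustrative only). For each predicted sample:
+     *
+     *     pos   = (k + 1) * intra_pred_ang;      (k = row or column index)
+     *     idx   = pos >> 5;                      (integer sample step)
+     *     fract = pos & 31;                      (1/32-pel fraction)
+     *     pred  = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5;
+     */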
+
+ sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
+ const_temp_4x32b = _mm_set1_epi16(16);
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp4_4x32b = _mm_set1_epi32(4);
+
+ two_nt_4x32b = _mm_set1_epi32(1);
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+ if(nt == 4)
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(4);
+ two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
+
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+ __m128i src_values10;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);
+
+ ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
+
+ ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
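+            /* The step is doubled because Cb and Cr are interleaved (2 bytes
+             * per chroma sample), and it is subtracted from the (4*nt - 2)
+             * anchor because these horizontal modes walk the reference array
+             * backwards. */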
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+ _mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);
+
+ fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
+ fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
+ fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
+ fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
+ temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
+ temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
+ temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+ temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+ temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+ temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
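+                /* _mm_maddubs_epi16 multiplies each interleaved unsigned
+                 * sample pair by the interleaved ((32 - fract), fract) weight
+                 * pair and adds horizontally, so every 16-bit lane now holds
+                 * the complete two-tap weighted sum for one output pixel. */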
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
+
+ src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+ src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
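+                /* The unpack network above transposes the four per-column
+                 * result vectors into destination-row order, so each 8-byte
+                 * store below writes one complete row of the 4x4 chroma
+                 * block. */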
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b); /* row=0*/
+
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b); /* row=2*/
+
+ src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
+
+ }
+ }
+ }
+ else
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
+
+ for(col = 0; col < 2 * nt; col += 16)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+ __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);
+
+ ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
+
+ ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+ _mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);
+
+ fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
+ fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
+ fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
+ fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
+ temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
+ temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
+ temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+ temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+ temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+ temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
+            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=5*/
+            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=6*/
+            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=7*/
+
+            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
+            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=5*/
+            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=6*/
+            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=7*/
+
+ temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
+ temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
+ temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
+ temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+ for(row = 0; row < nt; row += 4)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row))); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row))); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row))); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row))); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8)); /* col=7*/
+
+                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
+                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
+                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
+                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
+
+                src_temp11_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b); /* col=5*/
+                src_temp13_8x16b = _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b); /* col=6*/
+                src_temp14_8x16b = _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5);   /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
+
+ /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
+                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
+                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
+
+ src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
+ src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
+ src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
+ src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);
+
+ src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+ src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
+
+ src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
+
+ src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b); /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);   /* row=3*/
+
+ }
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for modes 11 to 17 (negative angle, horizontal modes).
+* Predicts the TU block pointed to by 'pu1_dst' from the neighboring
+* reference samples pointed to by 'pu1_ref'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+    /* This function and ihevc_intra_pred_chroma_mode_19_to_25 are the same */
+    /* except for the ref main & side sample assignment; they can be        */
+    /* combined for optimization                                            */
+
+ WORD32 row, col, k;
+ WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+ WORD32 ref_idx;
+
+
+ __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;
+
+ UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
+ UWORD8 *ref_main;
+ UNUSED(src_strd);
+
+ inv_ang_sum = 128;
+
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+    /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+
+ /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+
+
+ ref_main = ref_temp + 2 * nt;
+ for(k = 0; k < (2 * (nt + 1)); k += 2)
+ {
+ ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
+ ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
+ }
+
+ ref_main = ref_temp + (2 * (nt - 1));
+ ref_idx = (nt * intra_pred_ang) >> 5;
+
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the  */
+    /* side reference samples; refer to section 8.4.4.2.6               */
+
+ for(k = -2; k > (2 * ref_idx); k -= 2)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
+ ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
+ }
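+    /* The loop above extends the main (left) reference below index 0 by
+     * projecting along the inverse angle: inv_ang is a fixed-point value
+     * with 8 fractional bits, so (inv_ang_sum >> 8) is the projected
+     * distance in whole samples on the side (top) reference, and the << 1
+     * converts it to a byte offset in the interleaved Cb/Cr array. */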
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain each destination sample    */
+
+ const_temp_4x32b = _mm_set1_epi16(16);
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp4_4x32b = _mm_set1_epi32(4);
+
+ two_nt_4x32b = _mm_set1_epi32(1);
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+ if(nt == 4)
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(4);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+ __m128i src_values10;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+ ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+ _mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);
+
+ fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
+ fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
+ fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
+ fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
+ temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
+ temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
+ temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+ temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+ temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+ temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+ src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b); /* row=0*/
+
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b); /* row=2*/
+
+ src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
+                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
+
+ }
+ }
+ }
+ else
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ for(col = 0; col < 2 * nt; col += 16)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
+ __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+ ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
+ _mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);
+
+ fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
+ fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
+ fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
+ fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
+ temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
+ temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
+ temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/
+
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+ temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+ temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+ temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
+            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=5*/
+            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=6*/
+            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=7*/
+
+            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
+            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=5*/
+            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=6*/
+            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=7*/
+
+ temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
+ temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
+ temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
+ temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
+
+ for(row = 0; row < nt; row += 4)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+ /* loding 8-bit 16 pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
+
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=7*/
+
+                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
+                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
+                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
+                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
+
+                src_temp11_8x16b = _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=5*/
+                src_temp13_8x16b = _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=6*/
+                src_temp14_8x16b = _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5);   /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5);   /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5);   /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5);   /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
+ src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
+                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
+                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+ src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
+
+ src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
+
+ src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b); /* row=2*/
+                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);   /* row=3*/
+
+ }
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intraprediction for modes 19 to 25 (negative angle, vertical modes).
+* Predicts the TU block pointed to by 'pu1_dst' from the neighboring
+* reference samples pointed to by 'pu1_ref'
+*
+* @par Description:
+*
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, k;
+ WORD32 intra_pred_ang, idx;
+ WORD32 inv_ang, inv_ang_sum, pos, fract;
+ WORD32 ref_main_idx, ref_idx;
+ UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
+ UWORD8 *ref_main;
+
+ __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
+ UNUSED(src_strd);
+
+ intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+ inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
+
+    /* Intermediate reference samples for negative angle modes */
+    /* These have to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_main = ref_temp + 2 * nt;
+ for(k = 0; k < (2 * (nt + 1)); k += 2)
+ {
+ ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
+ ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
+ }
+
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ inv_ang_sum = 128;
+ ref_main = ref_temp + (2 * (nt - 1));
+    /* SIMD optimization can be done using a look-up table for the loop */
+    /* For negative angles, derive the main reference samples from the  */
+    /* side reference samples; refer to section 8.4.4.2.6               */
+ for(k = -2; k > (2 * ref_idx); k -= 2)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
+ ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
+ }
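+    /* Same inverse-angle projection as in the horizontal-mode functions,
+     * but mirrored: the main reference is now the row above, so the
+     * extension samples are fetched from the left column by stepping back
+     * (inv_ang_sum >> 8) sample pairs from the top-left corner. */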
+
+ const_temp_8x16b = _mm_set1_epi16(16);
+
+    if(nt == 4) /* if nt = 4 */
+ {
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+
+ two_nt_4x32b = _mm_set1_epi32(2);
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+ {
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
+ __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+ sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
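+        /* The 16-bit row*angle products are negative for these modes, so
+         * widen them to 32 bits with sign extension (the cmpgt mask supplies
+         * the sign bits) before the arithmetic shift extracts the sample
+         * step. */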
+
+ src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+ src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+ _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
+ _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
+
+ fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]); /* row=0*/
+ fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]); /* row=1*/
+ fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]); /* row=2*/
+ fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/
+
+ temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]); /* row=0*/
+ temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]); /* row=1*/
+ temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]); /* row=2*/
+ temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/
+
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+ temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+ temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+ temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+            /* inner loop starts from here */
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col = 24-31 */
+
+ src_values10 = _mm_srli_si128(src_values0, 2);
+ src_values11 = _mm_srli_si128(src_values1, 2);
+ src_values12 = _mm_srli_si128(src_values2, 2);
+ src_values13 = _mm_srli_si128(src_values3, 2);
+
+ src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+ src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+ src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+ src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+ src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+ src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+ src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/
+
+ }
+ }
+    else if(nt == 8) /* for nt = 8 case */
+ {
+ WORD32 ref_main_idx1, fract1, temp, temp1;
+ __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+ for(row = 0; row < nt; row += 2)
+ {
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values10, src_values11, src_values12, src_values13;
+
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+ temp = 32 - fract;
+ ref_main_idx = 2 * idx + 2; /* col from 0-15 */
+
+ pos = ((row + 2) * intra_pred_ang);
+ idx = pos >> 5;
+ fract1 = pos & (31);
+ temp1 = 32 - fract1;
+ ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */
+
+ fract_8x16b = _mm_set1_epi8(fract);
+ fract1_8x16b = _mm_set1_epi8(fract1);
+ temp_8x16b = _mm_set1_epi8(temp);
+ temp1_8x16b = _mm_set1_epi8(temp1);
+
+ temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
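+            /* temp_8x16b and temp1_8x16b now hold interleaved
+             * ((32 - fract), fract) byte pairs; a single _mm_maddubs_epi16
+             * against the interleaved sample pairs below then computes the
+             * whole two-tap sum in each 16-bit lane. */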
+
+ /* row=0 */
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* col = 8-15 */
+
+ /* row=1 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8)); /* col = 8-15 */
+
+ src_values10 = _mm_srli_si128(src_values0, 2);
+ src_values11 = _mm_srli_si128(src_values1, 2);
+ src_values12 = _mm_srli_si128(src_values2, 2);
+ src_values13 = _mm_srli_si128(src_values3, 2);
+
+ src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+ src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+ src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+ src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+
+ src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+ src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+
+ src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+ src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8 8-bit pixel values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
+
+ pu1_dst += 2 * dst_strd;
+ }
+ }
+ else if(nt == 16)
+ {
+ WORD32 temp;
+ /* unroll the col loop (inner) */
+ zero_8x16b = _mm_set1_epi16(0);
+
+ for(row = 0; row < nt; row += 1)
+ {
+ __m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
+ __m128i src_values10, src_values11, src_values12, src_values13;
+
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+ temp = 32 - fract;
+ ref_main_idx = 2 * idx + 2; /* col from 0-31 */
+
+ fract_8x16b = _mm_set1_epi8(fract);
+ temp_8x16b = _mm_set1_epi8(temp);
+
+ temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16)); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24)); /* col = 24-31 */
+
+ src_values10 = _mm_srli_si128(src_values0, 2);
+ src_values11 = _mm_srli_si128(src_values1, 2);
+ src_values12 = _mm_srli_si128(src_values2, 2);
+ src_values13 = _mm_srli_si128(src_values3, 2);
+
+ src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+ src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+ src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+ src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+ src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+ src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+ src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
+
+ pu1_dst += dst_strd;
+
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction for modes 27 to 33 (positive angle, vertical modes),
+* interpolating the neighboring reference samples pointed to by 'pu1_ref'
+* into the TU block pointed to by 'pu1_dst'
+*
+* @par Description:
+*  Interpolates between the two reference samples selected by the
+*  prediction angle for every sample of the TU block
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[in] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD32 pos, fract;
+ WORD32 intra_pred_ang;
+ WORD32 idx, ref_main_idx;
+
+ __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
+ UNUSED(src_strd);
+
+ intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
+ const_temp_8x16b = _mm_set1_epi16(16);
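+    /* An illustrative scalar sketch of what the SIMD below computes,
+     * assuming the interleaved Cb/Cr reference layout used here:
+     *   for(row = 0; row < nt; row++) {
+     *       pos   = (row + 1) * intra_pred_ang;
+     *       idx   = pos >> 5;
+     *       fract = pos & 31;
+     *       for(col = 0; col < 2 * nt; col++)
+     *           pu1_dst[row * dst_strd + col] =
+     *               ((32 - fract) * pu1_ref[(4 * nt) + 2 + 2 * idx + col]
+     *                + fract * pu1_ref[(4 * nt) + 4 + 2 * idx + col]
+     *                + 16) >> 5;
+     *   }
+     */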
+
+    if(nt == 4) /* nt = 4 */
+ {
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+
+ two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2);
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
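+        /* row_4x32b holds (row + 1) for rows 0-3 (duplicated), so one
+         * multiply below yields all four pos = (row + 1) * intra_pred_ang */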
+
+ {
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
+ __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+ sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+ src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+ src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));
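+            /* (pos >> 5) is added twice to form 2 * idx, since one chroma
+             * sample step spans 2 bytes in the interleaved reference */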
+
+ ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+ _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
+ _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
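+            /* fract and (32 - fract) are spilled to scalar arrays so that
+             * each row's weight pair can be broadcast to a register below */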
+
+ fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]); /* row=0*/
+ fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]); /* row=1*/
+ fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]); /* row=2*/
+ fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/
+
+ temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]); /* row=0*/
+ temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]); /* row=1*/
+ temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]); /* row=2*/
+ temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/
+
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+ temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
+ temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
+ temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
+
+            /* 4x4 block processing starts here (fully unrolled, no inner loop) */
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */
+
+ src_values10 = _mm_srli_si128(src_values0, 2);
+ src_values11 = _mm_srli_si128(src_values1, 2);
+ src_values12 = _mm_srli_si128(src_values2, 2);
+ src_values13 = _mm_srli_si128(src_values3, 2);
+
+ src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+ src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+ src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+ src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+ src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+ src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+ src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/
+
+ }
+ }
+
+    else if(nt == 8) /* nt = 8 case */
+ {
+ WORD32 ref_main_idx1, fract1, temp, temp1;
+ __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+ for(row = 0; row < nt; row += 2)
+ {
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values10, src_values11, src_values12, src_values13;
+
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+ temp = 32 - fract;
+ ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
+
+ pos = ((row + 2) * intra_pred_ang);
+ idx = pos >> 5;
+ fract1 = pos & (31);
+ temp1 = 32 - fract1;
+ ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
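+            /* (4 * nt) + 2 is the byte offset of the first top reference
+             * sample in the interleaved chroma reference array */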
+
+ fract_8x16b = _mm_set1_epi8(fract);
+ fract1_8x16b = _mm_set1_epi8(fract1);
+ temp_8x16b = _mm_set1_epi8(temp);
+ temp1_8x16b = _mm_set1_epi8(temp1);
+
+ temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+ temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
+
+ /* row=0 */
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8)); /* col = 8-15 */
+
+ /* row=1 */
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8)); /* col = 8-15 */
+
+ src_values10 = _mm_srli_si128(src_values0, 2);
+ src_values11 = _mm_srli_si128(src_values1, 2);
+ src_values12 = _mm_srli_si128(src_values2, 2);
+ src_values13 = _mm_srli_si128(src_values3, 2);
+
+ src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+ src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+ src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+ src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+
+ src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+ src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+
+ src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+ src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
+
+ pu1_dst += 2 * dst_strd;
+ }
+ }
+ else if(nt == 16)
+ {
+ WORD32 temp;
+ /* unroll the col loop (inner) */
+ zero_8x16b = _mm_set1_epi16(0);
+
+ for(row = 0; row < nt; row += 1)
+ {
+ __m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
+ __m128i src_values10, src_values11, src_values12, src_values13;
+
+ pos = ((row + 1) * intra_pred_ang);
+ idx = pos >> 5;
+ fract = pos & (31);
+ temp = 32 - fract;
+ ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */
+
+ fract_8x16b = _mm_set1_epi8(fract);
+ temp_8x16b = _mm_set1_epi8(temp);
+
+ temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8)); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16)); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24)); /* col = 24-31 */
+
+ src_values10 = _mm_srli_si128(src_values0, 2);
+ src_values11 = _mm_srli_si128(src_values1, 2);
+ src_values12 = _mm_srli_si128(src_values2, 2);
+ src_values13 = _mm_srli_si128(src_values3, 2);
+
+ src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
+ src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
+ src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
+ src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
+ src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
+ src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
+ src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
+
+            /* storing 8-bit pixel values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
+
+ pu1_dst += dst_strd;
+
+ }
+ }
+}
diff --git a/common/x86/ihevc_deblk_ssse3_intr.c b/common/x86/ihevc_deblk_ssse3_intr.c
new file mode 100644
index 0000000..34ea090
--- /dev/null
+++ b/common/x86/ihevc_deblk_ssse3_intr.c
@@ -0,0 +1,1263 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_deblk_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for deblocking filters
+*
+* @author
+* Rishab
+*
+* @par List of Functions:
+* - ihevc_deblk_luma_vert_ssse3()
+* - ihevc_deblk_luma_horz_ssse3()
+* - ihevc_deblk_chroma_vert_ssse3()
+* - ihevc_deblk_chroma_horz_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevc_debug.h"
+
+#include "ihevc_tables_x86_intr.h"
+
+#include <immintrin.h>
+/**
+*******************************************************************************
+*
+* @brief
+* Decision process and filtering for the luma block vertical edge.
+*
+* @par Description:
+* The decision process for the luma block vertical edge is carried out and
+* an appropriate filter is applied. The boundary filter strength, bs should
+* be greater than 0. The pcm flags and the transquant bypass flags should
+* be taken care of by the calling function.
+*
+* @param[in] pu1_src
+* Pointer to the src sample q(0,0)
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] bs
+* Boundary filter strength of q(0,0)
+*
+* @param[in] quant_param_p
+* quantization parameter of p block
+*
+* @param[in] quant_param_q
+*  quantization parameter of q block
+*
+* @param[in] beta_offset_div2
+*  one half of the beta offset; (beta_offset_div2 << 1) is added to the QP
+*  when deriving beta
+*
+* @param[in] tc_offset_div2
+*  one half of the tc offset; (tc_offset_div2 << 1) is added to the QP
+*  when deriving tc
+*
+* @param[in] filter_flag_p
+* flag whether to filter the p block
+*
+* @param[in] filter_flag_q
+* flag whether to filter the q block
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_luma, beta_indx, tc_indx;
+ WORD32 beta, tc;
+ WORD32 d, dp, dq, d_sam0, d_sam3;
+
+ WORD32 d3, d0, de_0, de_1, de_2, de_3;
+ WORD32 de, dep, deq;
+ __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;
+
+
+ {
+ __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
+ __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
+
+
+
+ ASSERT((bs > 0) && (bs <= 3));
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+ beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+        /* BS based on implementation can take value 3 if it is an intra/inter edge */
+        /* based on BS, tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+        /* for BS = 1 the adding factor is (0*2), for BS = 2 or 3 it is (1*2) */
+        /* the desired functionality is achieved by doing (2*(bs>>1)) */
+
+ tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
+
+ beta = gai4_ihevc_beta_table[beta_indx];
+ tc = gai4_ihevc_tc_table[tc_indx];
+ if(0 == tc)
+ {
+ return;
+ }
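+        /* The shuffles and multiply-adds below evaluate the HEVC edge
+         * decision variables for lines 0 and 3 in parallel; a scalar
+         * sketch of the quantities produced is:
+         *   dpX = abs(p2 - 2 * p1 + p0), dqX = abs(q2 - 2 * q1 + q0)
+         *   d0 = dp0 + dq0, d3 = dp3 + dq3, d = d0 + d3
+         * along with abs(p3 - p0) + abs(q0 - q3) and abs(p0 - q0)
+         * used for the strong/normal filter decision.
+         */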
+ src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
+ src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));
+
+ coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
+ mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));
+
+ src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
+ mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);
+
+ mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);
+
+
+        //build all-ones, then the constant 1 in each 16-bit lane
+        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
+        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
+        //accumulating values for dp3, dq3 and dp0, dq0
+ mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
+
+ temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
+        // to get (-1, 1) signed byte pairs in each 16-bit lane
+ temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
+ //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
+ mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+ //to get 16 bit 1's
+ temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
+
+
+ // dq3 dp3 dq0 dp0
+ mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
+ mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
+ mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
+ // dq dp d3 d0
+ mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
+ //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
+ mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
+ //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
+ mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+
+        //extract the four 32-bit results into scalars
+ temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
+ temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
+ mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
+
+ d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
+ d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+ dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
+ dq = _mm_cvtsi128_si32(mask_16x8b);
+ //getting d
+ d = d0 + d3;
+
+        //extract the four 32-bit results into scalars
+ temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
+ temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
+ mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
+
+ de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
+ de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+ de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
+ de_3 = _mm_cvtsi128_si32(mask_16x8b);
+
+ de = 0;
+ dep = 0;
+ deq = 0;
+ if(d < beta)
+ {
+ d_sam0 = 0;
+ if((2 * d0 < (beta >> 2))
+ && (de_2 < (beta >> 3))
+ && (de_0 < ((5 * tc + 1) >> 1)))
+ {
+ d_sam0 = 1;
+ }
+
+ d_sam3 = 0;
+ if((2 * d3 < (beta >> 2))
+ && (de_3 < (beta >> 3))
+ && de_1 < ((5 * tc + 1) >> 1))
+ {
+ d_sam3 = 1;
+ }
+
+ de = (d_sam0 & d_sam3) + 1;
+ dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+ deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+ if(tc <= 1)
+ {
+ dep = 0;
+ deq = 0;
+ }
+ }
+
+ }
+
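+    /* de == 2 selects the strong filter for all four lines, de == 1 the
+     * normal filter; dep/deq control whether p1/q1 are also modified. */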
+ if(de != 0)
+ {
+
+
+ src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
+ src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));
+
+ if(de == 2)
+ {
+ __m128i temp_pq_str0_16x8b;
+ __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
+ __m128i temp_pq2_str0_16x8b;
+ __m128i temp_pq_str1_16x8b;
+ __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
+ __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
+ __m128i const2_8x16b, const2tc_8x16b;
+ LWORD64 mask, tc2;
+ tc = tc << 1;
+ mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
+ tc2 = ((LWORD64)tc);
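+            /* the strong filter clips each output sample to within
+             * +/- 2 * tc of its input, hence tc is doubled here and
+             * broadcast to all byte lanes below */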
+
+ const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
+ //q'0-q'1-2 ,p'0-p'1-2
+ src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
+ src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);
+
+ const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
+ temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
+ temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
+            //arranged x x x x x x x x q31 q30 q11 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
+ temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
+ temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
+
+ const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
+            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
+ temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);
+
+ temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);
+
+ //q'1-2, p'1-2
+ temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
+ temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);
+
+ temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+ temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+
+ temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
+ temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
+ // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
+ temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
+ // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
+ temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);
+
+ temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+ temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
+
+ //clipping mask design
+ temp_str1_16x8b = _mm_setzero_si128();
+ temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
+ const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
+ temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
+ const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
+
+ //clipping mask design
+ temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
+ const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
+ //calculating Clipping MAX for all pixel values.
+ temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
+ temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);
+
+
+ //q'2-q'0-2,p'2-p'0-2
+ temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
+ temp_str3_16x8b = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);
+
+ temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
+ temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);
+
+ const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
+ //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
+ temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);
+
+ temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);
+
+ //calculating Clipping MIN for all pixel values.
+ temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
+ temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
+ //q'0-q'1-2 ,p'0-p'1-2
+ temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
+ temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
+ //q'1-2 p'1-2
+ temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+ //to get 2 in 16 bit
+ const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
+ //to get q33 q23 q13 q03, p33 p23 p13 p03
+ temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
+ temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
+ temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);
+
+ //q'1, p'1 (adding 2)
+ temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+ //q'0-q'1,p'0-p'1
+ temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
+ //q'2-q'1,p'2-p'1
+ temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
+ //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
+ temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
+ //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
+ temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
+
+ //normalisation of all modified pixels
+ temp_pq_str0_16x8b = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
+ temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
+ temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
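+            /* scalar equivalents (HEVC strong filter, a sketch):
+             *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+             *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+             *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+             * and symmetrically for q0', q1', q2'
+             */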
+
+ //getting p0 p1 together and p2 p3 together
+ temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
+ temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
+ //getting q1 q0 together and q3 q2 together
+ temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
+ temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
+ //getting p's of row0 row1 together and of row2 row3 together
+ temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
+ temp_str2_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
+ //getting q's of row0 row1 together and of row2 row3 together
+ temp_str0_16x8b = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
+ temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
+ //getting values for respective rows in 16 bit
+ src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
+ src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
+ src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
+ src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
+ //packing values to 8 bit
+ src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
+ src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
+ //Clipping MAX
+ src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
+ src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
+ //Clipping MIN
+ src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
+ src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
+ //separating row 2 and row 3
+ src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
+ src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);
+
+ }
+
+ else
+ {
+
+ __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
+ __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
+ __m128i coefdelta_0_8x16b, mask_pq_8x16b;
+ __m128i const2_8x16b, consttc_8x16b;
+
+ LWORD64 mask1;
+ mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);
+
+ consttc_8x16b = _mm_set1_epi32(tc);
+
+
+ src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
+ src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);
+
+ tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
+ tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);
+
+ tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
+ tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
+            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
+ tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);
+
+ coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
+ // (-3q1+9q0),(-9p0+3p1)
+ tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
+ //converting to 16 bit
+ consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
+ //getting -tc store
+ tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
+ //calc 10 *tc = 2*tc +8*tc ; 2*tc
+ tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
+ //calc 10 *tc = 2*tc +8*tc ; 8*tc
+ tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
+ //getting -tc store
+ tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
+ //calc 10 *tc
+ tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
+ //const 1
+ const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
+ tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
+ const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
+ //getting the mask values
+ mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
+ //loaded coef for delta1 calculation
+ coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
+ //(-2q1+q0),(p0-2p1)
+ tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
+ //const 8
+ const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
+ //rearranging the mask values
+ mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
+ //normalisation of the filter
+ tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
+ tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
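+            /* scalar equivalent: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4,
+             * clipped to +/- tc and applied only where |delta0| < 10 * tc */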
+
+ //getting deltaq0
+ tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
+ //packing d3q d2q d1q d0q d3p d2p d1p d0p
+ tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
+ //absolute delta
+ tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
+ //Clipping of delta0
+ tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
+ //mask for |delta| < 10*tc
+ tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
+ //Clipping of delta0
+ tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);
+
+
+ //delta 1 calc starts
+
+ //getting q32 q22 q12 q02 p32 p12 p22 p02
+ tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
+ tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
+ tmp_delta1_8x16b = _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
+ tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
+ //constant 1
+ const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
+ //tc>>1 16 bit
+ consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
+
+ //getting -tc>>1 store 16 bit
+ tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
+ //2*delta0
+ tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
+
+ //getting all respective q's and p's together
+ tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
+ tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
+ //final adds for deltap1 and deltaq1
+ tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
+ tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
+ tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
+ tmp2_const_8x16b = _mm_setzero_si128();
+ tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
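+            /* scalar equivalent: dp1 = ((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1),
+             * clipped below to +/- (tc >> 1); likewise dq1 on the q side */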
+
+ // clipping delta1
+ tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
+ // clipping delta1
+ tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
+
+ //getting the mask ready
+ mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
+ //masking of the delta values |delta|<10*tc
+ tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
+ tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
+ //packing dq1 dq0 dp0 dp1
+ tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
+ tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
+ tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
+ tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
+
+ //masking of the delta values dep, deq , filter_p ,filter_q
+ tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
+ tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
+ //converting 8bit to 16 bit
+ src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
+ src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
+ src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
+ src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
+ //shuffle values loaded
+ tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
+ tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
+ //arranging each row delta in different registers
+ tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
+ tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
+ tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
+ tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);
+
+ //adding the respective delta
+ src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
+ src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
+ src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
+ src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
+ //saturating to 8 bit
+ src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
+ src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
+ //separating different rows
+ src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
+ src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
+ _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
+ _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
+ _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
+ }
+}
+
+void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_luma, beta_indx, tc_indx;
+ WORD32 beta, tc;
+
+ WORD32 d0, d3, dp, dq, d;
+ WORD32 de_0, de_1, de_2, de_3;
+ WORD32 d_sam0, d_sam3;
+ WORD32 de, dep, deq;
+
+ __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
+ __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;
+
+
+
+
+ {
+ __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
+ __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
+ __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
+
+ ASSERT((bs > 0));
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
+ beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
+
+        /* BS based on implementation can take value 3 if it is an intra/inter edge */
+        /* based on BS, tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
+        /* for BS = 1 the adding factor is (0*2), for BS = 2 or 3 it is (1*2) */
+        /* the desired functionality is achieved by doing (2*(bs>>1)) */
+
+ tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
+
+ beta = gai4_ihevc_beta_table[beta_indx];
+ tc = gai4_ihevc_tc_table[tc_indx];
+ if(0 == tc)
+ {
+ return;
+ }
+ src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+ src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
+ src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
+ src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
+ tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
+ src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
+ tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));
+
+
+ src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
+ src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);
+
+ src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
+ src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
+
+ src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
+ src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);
+
+ src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
+ src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);
+
+ coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
+ mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));
+
+ src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
+ //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
+ mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);
+
+ mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);
+
+
+        //build all-ones, then the constant 1 in each 16-bit lane
+        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
+        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
+        //accumulating values for dp3, dq3 and dp0, dq0
+ mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
+
+ temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
+        // to get (-1, 1) signed byte pairs in each 16-bit lane
+ temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
+ //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
+ mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+ //to get 16 bit 1's
+ temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
+
+
+ // dq3 dp3 dq0 dp0
+ mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
+ mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
+ mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
+ // dq dp d3 d0
+ mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
+ //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
+ mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
+ //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
+ mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
+
+        //extract the four 32-bit results into scalars
+ temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
+ temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
+ mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
+
+ d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
+ d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+ dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
+ dq = _mm_cvtsi128_si32(mask_16x8b);
+ //getting d
+ d = d0 + d3;
+
+        //extract the four 32-bit results into scalars
+ temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
+ temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
+ mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
+
+ de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
+ de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
+ de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
+ de_3 = _mm_cvtsi128_si32(mask_16x8b);
+
+ de = 0;
+ dep = 0;
+ deq = 0;
+ if(d < beta)
+ {
+ d_sam0 = 0;
+ if((2 * d0 < (beta >> 2))
+ && (de_2 < (beta >> 3))
+ && (de_0 < ((5 * tc + 1) >> 1)))
+ {
+ d_sam0 = 1;
+ }
+
+ d_sam3 = 0;
+ if((2 * d3 < (beta >> 2))
+ && (de_3 < (beta >> 3))
+ && de_1 < ((5 * tc + 1) >> 1))
+ {
+ d_sam3 = 1;
+ }
+
+ de = (d_sam0 & d_sam3) + 1;
+ dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+ deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
+ if(tc <= 1)
+ {
+ dep = 0;
+ deq = 0;
+ }
+ }
+
+ }
+
+ if(de != 0)
+ {
+
+ if(2 == de)
+ {
+
+ __m128i temp_pq0_str0_16x8b;
+ __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
+ __m128i temp_pq2_str0_16x8b;
+ __m128i temp_str0_16x8b, temp_str1_16x8b;
+ __m128i const2_8x16b, const2tc_8x16b;
+
+ LWORD64 mask, tc2;
+ tc = tc << 1;
+ mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
+ tc2 = ((LWORD64)tc);
+
+ const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
+ //q'0-q'1-2 ,p'0-p'1-2
+ temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
+ temp_str0_16x8b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
+ const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
+            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
+ temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);
+
+ const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
+ temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);
+
+ //q'1-2, p'1-2
+ temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
+ temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
+ temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
+ // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
+ temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
+ // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
+ temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);
+
+ temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+ temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
+
+ //clipping mask design
+ temp_str1_16x8b = _mm_setzero_si128();
+ temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
+ const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
+ temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
+ const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
+
+ //clipping mask design
+ temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
+ const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
+ //calculating Clipping MAX for all pixel values.
+ src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
+ src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
+ //for clipping calc
+ src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
+ //saving the unmodified data of q1 p1 q0 p0
+ src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
+            //Clipping MAX and MIN for q1 p1 q0 p0
+ src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
+ src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);
+
+
+ //q'2-q'0-2,p'2-p'0-2
+ tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
+ temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
+ const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
+ //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
+ temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
+ src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
+ temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
+
+ //calculating Clipping MAX and MIN for p2 and q2 .
+ tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
+ tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
+ //q'0-q'1-2 ,p'0-p'1-2
+ temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
+ temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
+ //q'1-2 p'1-2
+ temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
+ //to get 2 in 16 bit
+ const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
+
+
+ //q'1, p'1 (adding 2)
+ temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
+ //q'0-q'1,p'0-p'1
+ temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
+ //q'2-q'1,p'2-p'1
+ temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
+ //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
+ temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
+ //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
+ temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
+
+ //normalisation of all modified pixels
+ temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
+ temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
+ temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
+ //q'1 p'1 q'0 p'0
+ temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
+ temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
+ //pack with the unmodified data of q2 and p2
+ src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
+ //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2 p'2
+ temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
+ src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
+ temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
+ src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
+ //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
+ src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
+ src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
+ src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
+ src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
+ src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
+ src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);
+
+
+ }
+
+ else
+ {
+
+ __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
+ __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
+ __m128i coefdelta_0_8x16b;
+ __m128i const2_8x16b, consttc_8x16b;
+
+ LWORD64 maskp0, maskp1, maskq0, maskq1;
+ maskp0 = (LWORD64)filter_flag_p;
+ maskq0 = (LWORD64)filter_flag_q;
+ maskp1 = (LWORD64)dep;
+ maskq1 = (LWORD64)deq;
+ consttc_8x16b = _mm_set1_epi32(tc);
+
+ tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
+ tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
+            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
+ tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
+
+ coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
+ // (-3q1+9q0),(-9p0+3p1)
+ tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
+
+ //getting -tc store
+ tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
+
+ //getting tc in 16 bit
+ consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
+ //calc 10 *tc = 2*tc +8*tc ; 2*tc
+ tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
+ //calc 10 *tc = 2*tc +8*tc ; 8*tc
+ tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
+
+ //const 1
+ const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
+ //calc 10 *tc
+ tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
+ //delta0 without normalisation and clipping
+ tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);
+
+ const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);
+
+ //loaded coef for delta1 calculation
+ coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
+ //(-2q1+q0),(p0-2p1)
+ tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
+ //const 8
+ const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
+
+ //normalisation of the filter
+ tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
+ tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
+
+ //getting deltaq0
+ tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
+ //getting -tc
+ tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
+            //packing d03q d02q d01q d00q d03p d02p d01p d00p
+ tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
+ //absolute delta
+ tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
+
+ //Clipping of delta0
+ tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
+ //tc>>1 16 bit
+ consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
+ //Clipping of delta0
+ tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);
+
+ //(-tc)>>1 16 bit
+ tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
+ //mask for |delta| < 10*tc
+ tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
+ //delta 1 calc starts
+
+ //getting q32 q22 q12 q02 p32 p12 p22 p02
+ tmp0_const_8x16b = _mm_setzero_si128();
+ src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
+ src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
+ src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
+ //constant 1
+ const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
+ //2*delta0
+ tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
+ //getting all respective q's and p's together
+ coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
+ tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
+ //final adds for deltap1 and deltaq1
+ tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
+ src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
+ tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
+ tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
+
+ //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
+ tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
+ src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));
+
+ // src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
+ //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
+ src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
+ coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));
+
+ src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
+ src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
+ //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
+ src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);
+
+ //rearranging the mask values
+ src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
+ src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);
+
+ src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
+ src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
+ src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
+ src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);
+
+ //combining mask delta1
+ tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
+ // clipping delta1
+ tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
+            //combining mask delta0
+ tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
+ // clipping delta1
+ tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
+
+
+ //masking of the delta values |delta|<10*tc
+ tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
+ tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
+            //separating p and q delta 0 and adding p0 and q0
+ tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
+ tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
+ src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
+ src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
+ src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
+ src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
+            //separating p and q delta 0 and adding p0 and q0
+ tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
+ tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
+ src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
+ src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
+ src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
+ src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
+ //packing p1 q1 and p0 q0 to 8 bit
+ src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
+ src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);
+
+ src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
+ src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
+
+
+ }
+
+
+
+ }
+
+}
+
+void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_indx_u, qp_chroma_u;
+ WORD32 qp_indx_v, qp_chroma_v;
+ WORD32 tc_indx_u, tc_u;
+ WORD32 tc_indx_v, tc_v;
+
+ __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ /* chroma processing is done only if BS is 2 */
+ /* this function is assumed to be called only if BS is 2 */
+ qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+ qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
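+    /* gai4_ihevc_qp_table maps the averaged luma QP to the chroma QP;
+     * indices above 57 map to qp - 6, negative indices pass through */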
+
+ tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+ tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+ if(0 == tc_u && 0 == tc_v)
+ {
+ return;
+ }
+ src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
+ tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
+ src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
+ tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));
+
+ {
+ LWORD64 mask_tc, mask_flag, mask;
+ __m128i delta_vu0_16x8b, delta_vu1_16x8b;
+ __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
+ __m128i min_0_16x8b;
+ __m128i const_16x8b;
+ mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
+ mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
+ mask = 0xffff00000000ffffLL;
+
+ src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
+ src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);
+
+ mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
+ // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
+ // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
+ delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
+ delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);
+
+ tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
+ tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
+ // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
+ // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
+ delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
+ delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);
+
+ delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
+ delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
+
+        //all-ones constant, used below to derive -tc and the rounding offset 4
+ const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
+ // filter flag mask and tc mask
+ mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
+ mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));
+
+ mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
+ mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
+ //-tc
+ min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
+ //converting const 1
+ const_16x8b = _mm_srli_epi16(const_16x8b, 15);
+
+ //filterp and filterq flag
+ mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
+ mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
+
+ //modified delta with a filter (1 -4 4 -1) available in 16 bit
+ delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
+ //converting const 4
+ const_16x8b = _mm_slli_epi16(const_16x8b, 2);
+
+ mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
+ //offset addition
+ delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
+ //eliminating q1
+ tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);
+
+ const_16x8b = _mm_setzero_si128();
+ //filter after normalisation
+ delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
+ mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);
+
+ //clipping MAX
+ delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
+ //getting p0 and eliminating p1
+ tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
+ //clipping MIN
+ delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
+ //getting q0
+ tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
+ //masking filter flag
+ delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
+ delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);
+
+ // q-delta ,p+delta
+ tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
+ tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
+ //merging q0 and p0 of respective rows
+ delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
+ delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
+ // row 0 and row 1 packed , row2 and row3 packed
+ delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
+ delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
+ //removing older pixel values
+ src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
+ src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
+ //arranging modified pixels
+ delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
+ delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
+ delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
+ delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
+ //plugging the modified values
+ src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
+ src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);
+
+
+        //getting values for row 1 and row 3
+ tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
+ tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
+ _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
+ _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
+ _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
+ }
+
+
+
+}
+
+void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ WORD32 qp_indx_u, qp_chroma_u;
+ WORD32 qp_indx_v, qp_chroma_v;
+ WORD32 tc_indx_u, tc_u;
+ WORD32 tc_indx_v, tc_v;
+
+
+ __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;
+
+ ASSERT(filter_flag_p || filter_flag_q);
+
+ /* chroma processing is done only if BS is 2 */
+ /* this function is assumed to be called only if BS is 2 */
+ qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
+
+ qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
+ qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
+
+ tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_u = gai4_ihevc_tc_table[tc_indx_u];
+
+ tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
+ tc_v = gai4_ihevc_tc_table[tc_indx_v];
+
+ if(0 == tc_u && 0 == tc_v)
+ {
+ return;
+ }
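+    /* Same scalar filter as the vertical variant above, applied across    */
+    /* the horizontal edge (p rows above pu1_src, q rows at and below);    */
+    /* sketch: delta = CLIP3((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc), */
+    /* then p0 += delta / q0 -= delta under the respective filter flags    */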
+ tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
+ src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
+ src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ {
+ LWORD64 mask_tc, mask_flag;
+ __m128i delta_vu0_16x8b, delta_vu1_16x8b;
+ __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
+ __m128i min_0_16x8b;
+ __m128i const_16x8b;
+ mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
+ mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
+
+ tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
+ tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);
+
+        // tmp_p0 holds bytes interleaved as (p1, p0) per UV sample and
+        // tmp_q0 as (q0, q1), ready for maddubs with the delta tables
+ delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
+ delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);
+
+ delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
+ delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);
+
+
+ // filter flag mask and tc mask
+ mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
+ mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));
+
+        //all-ones constant, used below to derive -tc and the rounding offset 4
+ const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
+        // broadcasting tc and sign-extending the filter flags
+ mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
+ mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
+ //-tc
+ min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
+ //converting const 1
+ const_16x8b = _mm_srli_epi16(const_16x8b, 15);
+
+ //filterp
+ mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
+
+
+ //converting const 4
+ const_16x8b = _mm_slli_epi16(const_16x8b, 2);
+ //modified delta with a filter (1 -4 4 -1) available in 16 bit
+ delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
+
+ //filterq flag
+ mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
+ //offset addition
+ delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
+ mask_16x8b = _mm_setzero_si128();
+ //filter after normalisation
+ delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
+
+ //converting p0 to 16bit
+ src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
+ //clipping MAX
+ delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
+ //converting q0 to 16bit
+ src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
+ //clipping MIN
+ delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
+
+ //masking filter flag
+ delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
+ delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);
+
+ // q-delta ,p+delta
+ src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
+ src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);
+
+ // p0 and q0 packed
+ src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
+ src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);
+
+
+
+ _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
+ _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);
+
+ }
+
+
+}
diff --git a/common/x86/ihevc_func_selector.h b/common/x86/ihevc_func_selector.h
new file mode 100644
index 0000000..52023c2
--- /dev/null
+++ b/common/x86/ihevc_func_selector.h
@@ -0,0 +1,224 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_func_selector.h
+*
+* @brief
+* For each function, decide whether to use the C implementation or an
+* x86 SIMD (SSSE3 / SSE4.2) intrinsics implementation
+*
+* @author
+* Harish
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef __IHEVC_FUNC_SELECTOR_H__
+#define __IHEVC_FUNC_SELECTOR_H__
+
+
+#include "ihevc_func_types.h"
+
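+/* Each macro below names the implementation class used for one kernel.   */
+/* A sketch of the intended selection pattern (the function-pointer name  */
+/* here is hypothetical):                                                  */
+/*                                                                         */
+/*     #if (DEBLK_CHROMA_VERT == C)                                        */
+/*         pf_deblk_chroma_vert = &ihevc_deblk_chroma_vert;                */
+/*     #endif                                                              */
+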
+#define INTER_PRED_LUMA_COPY C
+#define INTER_PRED_LUMA_HORZ C
+#define INTER_PRED_LUMA_VERT C
+#define INTER_PRED_LUMA_COPY_W16OUT C
+#define INTER_PRED_LUMA_HORZ_W16OUT C
+#define INTER_PRED_LUMA_VERT_W16OUT C
+#define INTER_PRED_LUMA_VERT_W16INP C
+#define INTER_PRED_LUMA_VERT_W16INP_W16OUT C
+
+#define INTER_PRED_CHROMA_COPY C
+#define INTER_PRED_CHROMA_HORZ C
+#define INTER_PRED_CHROMA_VERT C
+#define INTER_PRED_CHROMA_COPY_W16OUT C
+#define INTER_PRED_CHROMA_HORZ_W16OUT C
+#define INTER_PRED_CHROMA_VERT_W16OUT C
+#define INTER_PRED_CHROMA_VERT_W16INP C
+#define INTER_PRED_CHROMA_VERT_W16INP_W16OUT C
+
+#define WEIGHTED_PRED_UNI C
+#define WEIGHTED_PRED_BI C
+#define WEIGHTED_PRED_BI_DEFAULT C
+#define WEIGHTED_PRED_CHROMA_UNI C
+#define WEIGHTED_PRED_CHROMA_BI C
+#define WEIGHTED_PRED_CHROMA_BI_DEFAULT C
+
+#define INTRA_PRED_LUMA_REF_SUBSTITUTION C
+#define INTRA_PRED_REF_FILTERING C
+#define INTRA_PRED_LUMA_PLANAR C
+#define INTRA_PRED_LUMA_DC C
+#define INTRA_PRED_LUMA_HORZ C
+#define INTRA_PRED_LUMA_VER C
+#define INTRA_PRED_LUMA_MODE_2 C
+#define INTRA_PRED_LUMA_MODE_18_34 C
+#define INTRA_PRED_LUMA_MODE_3_TO_9 C
+#define INTRA_PRED_LUMA_MODE_11_TO_17 C
+#define INTRA_PRED_LUMA_MODE_19_TO_25 C
+#define INTRA_PRED_LUMA_MODE_27_TO_33 C
+
+
+#define INTRA_PRED_CHROMA_PLANAR C
+#define INTRA_PRED_CHROMA_DC C
+#define INTRA_PRED_CHROMA_HOR C
+#define INTRA_PRED_CHROMA_VER C
+#define INTRA_PRED_CHROMA_MODE_2 C
+#define INTRA_PRED_CHROMA_18_34 C
+#define INTRA_PRED_CHROMA_3_T0_9 C
+#define INTRA_PRED_CHROMA_11_T0_17 C
+#define INTRA_PRED_CHROMA_19_T0_25 C
+#define INTRA_PRED_CHROMA_27_T0_33 C
+#define INTRA_PRED_CHROMA_REF_SUBSTITUTION C
+
+#define PAD_VERT C
+#define PAD_HORZ C
+
+#define DEBLK_LUMA_HORZ C
+#define DEBLK_LUMA_VERT C
+#define DEBLK_CHROMA_HORZ C
+#define DEBLK_CHROMA_VERT C
+
+#define SAO_BAND_OFFSET_LUMA C
+#define SAO_BAND_OFFSET_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS0_LUMA C
+#define SAO_EDGE_OFFSET_CLASS1_LUMA C
+#define SAO_EDGE_OFFSET_CLASS2_LUMA C
+#define SAO_EDGE_OFFSET_CLASS3_LUMA C
+#define SAO_EDGE_OFFSET_CLASS0_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS1_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS2_CHROMA C
+#define SAO_EDGE_OFFSET_CLASS3_CHROMA C
+
+/* Forward transform functions */
+/* Luma */
+#define RESI_TRANS_QUANT_4X4_TTYPE1 C
+#define RESI_TRANS_QUANT_4X4 C
+#define RESI_TRANS_QUANT_8X8 C
+#define RESI_TRANS_QUANT_16X16 C
+#define RESI_TRANS_QUANT_32X32 C
+
+#define RESI_QUANT_4X4_TTYPE1 C
+#define RESI_QUANT_4X4 C
+#define RESI_QUANT_8X8 C
+#define RESI_QUANT_16X16 C
+#define RESI_QUANT_32X32 C
+
+#define RESI_TRANS_4X4_TTYPE1 C
+#define RESI_TRANS_4X4 C
+#define RESI_TRANS_8X8 C
+#define RESI_TRANS_16X16 C
+#define RESI_TRANS_32X32 C
+
+#define RESI_4X4_TTYPE1 C
+#define RESI_4X4 C
+#define RESI_8X8 C
+#define RESI_16X16 C
+#define RESI_32X32 C
+
+#define TRANS_4X4_TTYPE1 C
+#define TRANS_4X4 C
+#define TRANS_8X8 C
+#define TRANS_16X16 C
+#define TRANS_32X32 C
+
+#define QUANT_4X4_TTYPE1 C
+#define QUANT_4X4 C
+#define QUANT_8X8 C
+#define QUANT_16X16 C
+#define QUANT_32X32 C
+
+/* Chroma interleaved*/
+#define CHROMA_RESI_TRANS_QUANT_4X4 C
+#define CHROMA_RESI_TRANS_QUANT_8X8 C
+#define CHROMA_RESI_TRANS_QUANT_16X16 C
+
+#define CHROMA_RESI_QUANT_4X4 C
+#define CHROMA_RESI_QUANT_8X8 C
+#define CHROMA_RESI_QUANT_16X16 C
+
+#define CHROMA_RESI_TRANS_4X4 C
+#define CHROMA_RESI_TRANS_8X8 C
+#define CHROMA_RESI_TRANS_16X16 C
+
+#define CHROMA_RESI_4X4 C
+#define CHROMA_RESI_8X8 C
+#define CHROMA_RESI_16X16 C
+
+/* Inverse transform functions */
+/* Luma */
+#define IQUANT_ITRANS_RECON_4X4_TTYPE1 C
+#define IQUANT_ITRANS_RECON_4X4 C
+#define IQUANT_ITRANS_RECON_8X8 C
+#define IQUANT_ITRANS_RECON_16X16 C
+#define IQUANT_ITRANS_RECON_32X32 C
+
+#define IQUANT_RECON_4X4_TTYPE1 C
+#define IQUANT_RECON_4X4 C
+#define IQUANT_RECON_8X8 C
+#define IQUANT_RECON_16X16 C
+#define IQUANT_RECON_32X32 C
+
+#define ITRANS_RECON_4X4_TTYPE1 C
+#define ITRANS_RECON_4X4 C
+#define ITRANS_RECON_8X8 C
+#define ITRANS_RECON_16X16 C
+#define ITRANS_RECON_32X32 C
+
+#define RECON_4X4_TTYPE1 C
+#define RECON_4X4 C
+#define RECON_8X8 C
+#define RECON_16X16 C
+#define RECON_32X32 C
+
+#define ITRANS_4X4_TTYPE1 C
+#define ITRANS_4X4 C
+#define ITRANS_8X8 C
+#define ITRANS_16X16 C
+#define ITRANS_32X32 C
+
+/* Chroma interleaved */
+#define CHROMA_IQUANT_ITRANS_RECON_4X4 C
+#define CHROMA_IQUANT_ITRANS_RECON_8X8 C
+#define CHROMA_IQUANT_ITRANS_RECON_16X16 C
+
+#define CHROMA_IQUANT_RECON_4X4 C
+#define CHROMA_IQUANT_RECON_8X8 C
+#define CHROMA_IQUANT_RECON_16X16 C
+
+#define CHROMA_ITRANS_RECON_4X4 C
+#define CHROMA_ITRANS_RECON_8X8 C
+#define CHROMA_ITRANS_RECON_16X16 C
+
+#define CHROMA_RECON_4X4 C
+#define CHROMA_RECON_8X8 C
+#define CHROMA_RECON_16X16 C
+
+#define IHEVC_MEMCPY C
+#define IHEVC_MEMSET C
+#define IHEVC_MEMSET_16BIT C
+#define IHEVC_MEMCPY_MUL_8 C
+#define IHEVC_MEMSET_MUL_8 C
+#define IHEVC_MEMSET_16BIT_MUL_8 C
+
+#endif /* __IHEVC_FUNC_SELECTOR_H__ */
+
diff --git a/common/x86/ihevc_inter_pred_filters_sse42_intr.c b/common/x86/ihevc_inter_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..154b613
--- /dev/null
+++ b/common/x86/ihevc_inter_pred_filters_sse42_intr.c
@@ -0,0 +1,607 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+
+/**
+*******************************************************************************
+* @file
+* ihevc_inter_pred_filters_sse42_intr.c
+*
+* @brief
+* Contains function definitions for inter prediction interpolation filters
+* coded in x86 intrinsics
+*
+*
+* @author
+*
+*
+* @par List of Functions:
+* - ihevc_inter_pred_luma_copy_w16out_sse42()
+* - ihevc_inter_pred_chroma_copy_sse42()
+* - ihevc_inter_pred_chroma_copy_w16out_sse42()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_inter_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for copy with 16-bit output
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst'. The output is upshifted by 6
+* bits and is used as input for vertical filtering or weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
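+/* A scalar sketch of the operation below (reference only):                */
+/*                                                                         */
+/*     for(row = 0; row < ht; row++)                                       */
+/*         for(col = 0; col < wd; col++)                                   */
+/*             pi2_dst[row * dst_strd + col] = (WORD16)                    */
+/*                 (pu1_src[row * src_strd + col] << SHIFT_14_MINUS_BIT_DEPTH); */
+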
+void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+ UNUSED(pi1_coeff);
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 4 == 0); /* checking assumption*/
+
+ if(0 == (wd & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+ src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+ src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+ src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+ src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* storing 8 16-bit output values */
+ _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 8; /* pointer update */
+ pi2_dst += 8; /* pointer update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wd; /* pointer update */
+ pi2_dst += 4 * dst_strd - wd; /* pointer update */
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+ src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+ src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+ src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+ src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+
+                /* storing 4 16-bit output values */
+ _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 4; /* pointer update */
+ pi2_dst += 4; /* pointer update */
+ } /* inner for loop ends here(4-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wd; /* pointer update */
+ pi2_dst += 4 * dst_strd - wd; /* pointer update */
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for copy
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst'.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
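+/* A scalar sketch (reference only); chroma is UV interleaved, hence       */
+/* 2 * wd bytes per row:                                                   */
+/*                                                                         */
+/*     for(row = 0; row < ht; row++)                                       */
+/*         memcpy(pu1_dst + row * dst_strd, pu1_src + row * src_strd, 2 * wd); */
+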
+void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, wdx2;
+ __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+ UNUSED(pi1_coeff);
+ wdx2 = wd * 2;
+
+ if(0 == (ht & 3)) /* ht multiple of 4 */
+ {
+ if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 16)
+ {
+ /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ /* storing 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 16; /* pointer update */
+ pu1_dst += 16; /* pointer update */
+ } /* inner for loop ends here(16-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wdx2; /* pointer update */
+ pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
+ }
+
+ }
+ else if(0 == (wdx2 & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                    /* storing 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 8; /* pointer update */
+ pu1_dst += 8; /* pointer update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wdx2; /* pointer update */
+ pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ else /* wdx2 = multiple of 4 case */
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+                    /*load 8 pixel values from 7:0 pos.; only the low 4 are used*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ dst0 = _mm_cvtsi128_si32(src0_16x8b);
+ dst1 = _mm_cvtsi128_si32(src1_16x8b);
+ dst2 = _mm_cvtsi128_si32(src2_16x8b);
+ dst3 = _mm_cvtsi128_si32(src3_16x8b);
+
+ /* storing 4 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
+
+ pu1_src += 4; /* pointer update */
+ pu1_dst += 4; /* pointer update */
+ } /* inner for loop ends here(4- output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wdx2; /* pointer update */
+ pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ }
+ else /* ht multiple of 2 */
+ {
+ if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 16)
+ {
+ /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+ /* storing 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+
+ pu1_src += 16; /* pointer update */
+ pu1_dst += 16; /* pointer update */
+ } /* inner for loop ends here(16-output values in single iteration) */
+
+ pu1_src += 2 * src_strd - wdx2; /* pointer update */
+ pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
+ }
+
+ }
+ else if(0 == (wdx2 & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+                    /* storing 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+
+ pu1_src += 8; /* pointer update */
+ pu1_dst += 8; /* pointer update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 2 * src_strd - wdx2; /* pointer update */
+ pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ else /* wdx2 = multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+                    /*load 8 pixel values from 7:0 pos.; only the low 4 are used*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+ dst0 = _mm_cvtsi128_si32(src0_16x8b);
+ dst1 = _mm_cvtsi128_si32(src1_16x8b);
+
+
+ /* storing 4 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
+
+ pu1_src += 4; /* pointer update */
+ pu1_dst += 4; /* pointer update */
+ } /* inner for loop ends here(4- output values in single iteration) */
+
+ pu1_src += 2 * src_strd - wdx2; /* pointer update */
+ pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for copy with 16-bit output
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst'. The output is upshifted by 6
+* bits and is used as input for vertical filtering or weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
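+/* A scalar sketch (reference only); as the luma w16out copy, but over     */
+/* 2 * wd interleaved UV bytes per row:                                    */
+/*                                                                         */
+/*     for(row = 0; row < ht; row++)                                       */
+/*         for(col = 0; col < 2 * wd; col++)                               */
+/*             pi2_dst[row * dst_strd + col] = (WORD16)                    */
+/*                 (pu1_src[row * src_strd + col] << SHIFT_14_MINUS_BIT_DEPTH); */
+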
+void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, wdx2;
+ __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+ UNUSED(pi1_coeff);
+ wdx2 = wd * 2;
+
+ if(0 == (ht & 3)) /* multiple of 4 case */
+ {
+ if(0 == (wdx2 & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+ src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+ src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+ src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+ src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 8 16-bit output values */
+ _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 8; /* pointer update */
+ pi2_dst += 8; /* pointer update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wdx2; /* pointer update */
+ pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ else /* wdx2 = multiple of 4 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+ src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+ src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
+ src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+ src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+ src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 4 16-bit output values */
+ _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 4; /* pointer update */
+ pi2_dst += 4; /* pointer update */
+ } /* inner for loop ends here(4-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wdx2; /* pointer update */
+ pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ }
+ else /* ht multiple of 2 case */
+ {
+ if(0 == (wdx2 & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+ src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+ src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+ src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 8 16-bit output values */
+ _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
+ _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+
+ pu1_src += 8; /* pointer update */
+ pi2_dst += 8; /* pointer update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 2 * src_strd - wdx2; /* pointer update */
+ pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ else /* wdx2 = multiple of 4 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+
+ src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
+ src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
+
+                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+ src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
+
+                    /* storing 4 16-bit output values */
+ _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+
+ pu1_src += 4; /* pointer update */
+ pi2_dst += 4; /* pointer update */
+ } /* inner for loop ends here(4-output values in single iteration) */
+
+ pu1_src += 2 * src_strd - wdx2; /* pointer update */
+ pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
+ }
+ }
+ }
+}
diff --git a/common/x86/ihevc_inter_pred_filters_ssse3_intr.c b/common/x86/ihevc_inter_pred_filters_ssse3_intr.c
new file mode 100644
index 0000000..ffdab4c
--- /dev/null
+++ b/common/x86/ihevc_inter_pred_filters_ssse3_intr.c
@@ -0,0 +1,5608 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+*******************************************************************************
+* @file
+* ihevc_inter_pred_filters_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for inter prediction interpolation filters
+* coded in x86 intrinsics
+*
+*
+* @author
+*
+*
+* @par List of Functions:
+* - ihevc_inter_pred_luma_copy_ssse3()
+* - ihevc_inter_pred_luma_horz_ssse3()
+* - ihevc_inter_pred_luma_vert_ssse3()
+* - ihevc_inter_pred_luma_copy_w16out_ssse3()
+* - ihevc_inter_pred_luma_horz_w16out_ssse3()
+* - ihevc_inter_pred_luma_vert_w16out_ssse3()
+* - ihevc_inter_pred_luma_vert_w16inp_ssse3()
+* - ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3()
+* - ihevc_inter_pred_chroma_copy_ssse3()
+* - ihevc_inter_pred_chroma_horz_ssse3()
+* - ihevc_inter_pred_chroma_vert_ssse3()
+* - ihevc_inter_pred_chroma_copy_w16out_ssse3()
+* - ihevc_inter_pred_chroma_horz_w16out_ssse3()
+* - ihevc_inter_pred_chroma_vert_w16out_ssse3()
+* - ihevc_inter_pred_chroma_vert_w16inp_ssse3()
+* - ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_inter_pred.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+
+#include <immintrin.h>
+
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma function for copy
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'src' to the location pointed by 'dst'.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+* Assumption : ht%4 == 0, wd%4 == 0
+*
+*******************************************************************************
+*/
+
+
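+/* A scalar sketch of the copy below (reference only):                     */
+/*                                                                         */
+/*     for(row = 0; row < ht; row++)                                       */
+/*         memcpy(pu1_dst + row * dst_strd, pu1_src + row * src_strd, wd); */
+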
+void ihevc_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+
+ WORD32 row, col;
+ __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
+ UNUSED(pi1_coeff);
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 4 == 0); /* checking assumption*/
+
+/* outer for loop starts from here */
+ if(0 == (wd & 15)) /* wd multiple of 16 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 16)
+ {
+ /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ /* storing 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 16; /* pointer update */
+ pu1_dst += 16; /* pointer update */
+ } /* inner for loop ends here(16-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wd; /* pointer update */
+ pu1_dst += 4 * dst_strd - wd; /* pointer update */
+ }
+
+ }
+ else if(0 == (wd & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 8)
+ {
+                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+                /* storing 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */
+
+ pu1_src += 8; /* pointer update */
+ pu1_dst += 8; /* pointer update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wd; /* pointer update */
+ pu1_dst += 4 * dst_strd - wd; /* pointer update */
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+                /*load 8 pixel values from 7:0 pos.; only the low 4 are used*/
+ src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */
+ src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
+ src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
+ src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
+
+ dst0 = _mm_cvtsi128_si32(src0_16x8b);
+ dst1 = _mm_cvtsi128_si32(src1_16x8b);
+ dst2 = _mm_cvtsi128_si32(src2_16x8b);
+ dst3 = _mm_cvtsi128_si32(src3_16x8b);
+
+ /* storing 4 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
+
+ pu1_src += 4; /* pointer update */
+ pu1_dst += 4; /* pointer update */
+ } /* inner for loop ends here(4- output values in single iteration) */
+
+ pu1_src += 4 * src_strd - wd; /* pointer update */
+ pu1_dst += 4 * dst_strd - wd; /* pointer update */
+ }
+ }
+}
+
+/* INTER_PRED_LUMA_COPY */
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for horizontal input
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed by 'pu1_src' and writes to the location pointed
+* by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
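+/* A scalar sketch of the 8-tap filtering below (reference only; the tap   */
+/* span col-3 .. col+4 matches the loads from pu1_src - 3):                */
+/*                                                                         */
+/*     for(col = 0; col < wd; col++)                                       */
+/*     {                                                                   */
+/*         WORD32 i, i4_sum = 0;                                           */
+/*         for(i = 0; i < NTAPS_LUMA; i++)                                 */
+/*             i4_sum += pi1_coeff[i] * pu1_src[col + i - 3];              */
+/*         pu1_dst[col] = CLIP_U8((i4_sum + OFFSET_14_MINUS_BIT_DEPTH)     */
+/*                                    >> SHIFT_14_MINUS_BIT_DEPTH);        */
+/*     }                                                                   */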
+void ihevc_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+
+    /* all 128 bit registers are named with a suffix mxnb, where m is the */
+    /* number of n-bit elements packed in the register */
+ __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+ __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+ __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+ __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b, res_temp8_8x16b;
+ __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b, res_temp18_8x16b;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+ __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+ /* load 8 8-bit coefficients and convert 8-bit into 16-bit */
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+ zero_8x16b = _mm_set1_epi32(0);
+ offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+
+ mask_low_32b = _mm_cmpeq_epi16(zero_8x16b, zero_8x16b);
+ mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
+ mask_low_32b = _mm_slli_si128(mask_low_32b, 4);
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+ control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+ control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[0], pi1_coeff[1] */
+    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[2], pi1_coeff[3] */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4], pi1_coeff[5] */
+    coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[6], pi1_coeff[7] */
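+    /* each coeff register now holds one signed coefficient byte pair     */
+    /* replicated across the register, ready for maddubs against the      */
+    /* interleaved pixel pairs built in the loops below                   */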
+
+ if(0 == (ht & 1)) /* ht multiple of 2 case */
+ {
+
+ if(0 == (wd & 7)) /* wd = multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+
+                WORD32 offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 16 pixel values from row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+
+ /*load 16 pixel values from row 1*/
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */
+ res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+ res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row =1 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row =1 */
+ src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */
+ /* row = 1 */
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row =1 */
+ src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row =1 */
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row =1 */
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */
+
+ res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+ res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+ res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+ res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b); /* row = 1 */
+ res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+ res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b); /* row = 1 */
+
+                    /* store the 8 output pixels of row 1 */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
+
+ offset += 8; /* To pointer updates*/
+ }
+ pu1_src += 2 * src_strd; /* pointer updates*/
+ pu1_dst += 2 * dst_strd; /* pointer updates*/
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+                WORD32 offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */
+ res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+ res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */
+
+ res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+ res_temp8_8x16b = _mm_and_si128(res_temp7_8x16b, mask_low_32b);
+ res_temp7_8x16b = _mm_and_si128(res_temp5_8x16b, mask_high_96b);
+ res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */
+ src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */
+ /* row = 1 */
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */
+
+ res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+ res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+ res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+ res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b); /* row = 1 */
+ res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+ res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b); /* row = 1 */
+
+ res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
+ res_temp18_8x16b = _mm_and_si128(res_temp17_8x16b, mask_low_32b);
+ res_temp17_8x16b = _mm_and_si128(res_temp15_8x16b, mask_high_96b);
+ res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
+
+ /* to store the 1st 4 pixels res. */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* offset update */
+ }
+ pu1_src += 2 * src_strd; /* Pointer update */
+ pu1_dst += 2 * dst_strd; /* Pointer update */
+ }
+ }
+ }
+ else /* odd ht */
+ {
+ if(0 == (wd & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ int offset = 0;
+
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */
+ res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+ res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */
+
+                    /* store the 8 result pixels */
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    offset += 8; /* offset update */
+ }
+ pu1_src += src_strd; /* pointer updates*/
+ pu1_dst += dst_strd; /* pointer updates*/
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ for(row = 0; row < (ht - 1); row += 2)
+ {
+ int offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */
+ res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+ res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */
+
+ res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+ res_temp8_8x16b = _mm_and_si128(res_temp7_8x16b, mask_low_32b);
+ res_temp7_8x16b = _mm_and_si128(res_temp5_8x16b, mask_high_96b);
+ res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */
+ src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */
+ /* row = 1 */
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 1 */
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 1 */
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */
+
+ res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+ res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+ res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+ res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b); /* row = 1 */
+ res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+ res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b); /* row = 1 */
+
+ res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
+ res_temp18_8x16b = _mm_and_si128(res_temp17_8x16b, mask_low_32b);
+ res_temp17_8x16b = _mm_and_si128(res_temp15_8x16b, mask_high_96b);
+ res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
+
+ /* to store the 1st 4 pixels res. */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* offset update */
+ }
+ pu1_src += 2 * src_strd; /* Pointer update */
+ pu1_dst += 2 * dst_strd; /* Pointer update */
+ }
+            { /* handle the last (odd) row outside the loop */
+ int offset = 0;
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */
+ res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+ res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */
+
+ res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+ res_temp8_8x16b = _mm_and_si128(res_temp7_8x16b, mask_low_32b);
+ res_temp7_8x16b = _mm_and_si128(res_temp5_8x16b, mask_high_96b);
+ res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
+
+ /* to store the 1st 4 pixels res. */
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
+
+                    offset += 4; /* offset update */
+ }
+ }
+ }
+ }
+}
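+
+/* For reference, a minimal scalar model of the arithmetic the SSSE3 kernel
+ * above implements (an illustrative sketch kept compiled out; the helper
+ * name ihevc_inter_pred_luma_horz_ref is illustrative only, not part of the
+ * decoder). Each output pixel is the 8-tap dot product over pu1_src[-3..4],
+ * rounded with OFFSET_14_MINUS_BIT_DEPTH, shifted by SHIFT_14_MINUS_BIT_DEPTH
+ * and clipped to 8 bits. */
+#if 0
+static void ihevc_inter_pred_luma_horz_ref(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD8 *pi1_coeff,
+                                           WORD32 ht,
+                                           WORD32 wd)
+{
+    WORD32 row, col, i;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 i2_tmp = 0;
+            for(i = 0; i < 8; i++) /* 8-tap HEVC luma filter */
+                i2_tmp += pi1_coeff[i] * pu1_src[col + i - 3];
+            i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            pu1_dst[col] = (UWORD8)((i2_tmp < 0) ? 0 : ((i2_tmp > 255) ? 255 : i2_tmp));
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+#endif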
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for vertical input
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pu1_src' and writes to the location pointed by
+* 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_copy;
+ UWORD8 *pu1_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+ __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+ __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+ __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+ __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+ __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s17_8x16b, s18_8x16b, s19_8x16b;
+ __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s27_8x16b, s28_8x16b, s29_8x16b;
+ __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s37_8x16b, s38_8x16b, s39_8x16b;
+
+ __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+ __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+/* load the 8 8-bit filter coefficients and replicate each coefficient pair */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+ control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+ control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[0], pi1_coeff[1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[2], pi1_coeff[3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4], pi1_coeff[5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[6], pi1_coeff[7] replicated */
+
+/* setting values in registers */
+ zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
+ offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+ mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
+ mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
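+
+    /* Each mask is named for the 32- or 96-bit region it zeroes: ANDing the
+     * destination with mask_low_32b keeps its upper 96 bits, while ANDing
+     * the new result with mask_high_96b keeps its low 4 pixels; the OR then
+     * merges them, so the 4-pixel-wide path below can issue an 8-byte store
+     * without clobbering the destination bytes next to it. */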
+
+/* outer for loop starts from here */
+ if(wd % 8 == 0)
+ { /* wd = multiple of 8 case */
+
+ pu1_src_copy = pu1_src;
+ pu1_dst_copy = pu1_dst;
+
+ for(col = 0; col < wd; col += 8)
+ {
+
+ pu1_src = pu1_src_copy + col;
+ pu1_dst = pu1_dst_copy + col;
+
+ PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+ /*load 8 pixel values.*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+ /*load 8 pixel values*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+ s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+ /*load 8 pixel values*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+ /*load 8 pixel values*/
+ s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values*/
+ s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values*/
+ s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*load 8 pixel values*/
+ s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+ s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of row 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+ /* ROW 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values*/
+ s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+ /*load 8 pixel values*/
+ s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+ s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of row 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+ /*ROW 1*/
+ s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+ s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+ s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+ s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of row 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
+
+
+ /* ROW 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values*/
+ s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+ s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+            /* Store the output pixels of row 3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+ pu1_src += (8 * src_strd);
+ pu1_dst += (4 * dst_strd);
+
+ for(row = 4; row < ht; row += 4)
+ {
+            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+
+ s3_0_16x8b = s3_2_16x8b;
+ s3_1_16x8b = s3_3_16x8b;
+ s3_2_16x8b = s3_4_16x8b;
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 4)th row*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_0_16x8b = s4_2_16x8b;
+ s4_1_16x8b = s4_3_16x8b;
+ s4_2_16x8b = s4_4_16x8b;
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+ s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+            /* Store the output pixels of cur_row*/
+ _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+ /* row + 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 5)th row*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ /*load 8 pixel values from (cur_row + 6)th row*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+ s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of (cur_row+2)*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+ /*row + 1*/
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+ s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of (cur_row + 1)*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
+
+
+ /* row + 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 7)th row*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+ s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of (cur_row+3)*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+ s2_10_16x8b = s2_3_16x8b;
+
+ pu1_src += 4 * src_strd; /* pointer update */
+ pu1_dst += 4 * dst_strd; /* pointer update */
+ }
+ }
+ }
+    else /* wd = multiple of 4 case */
+ {
+
+ pu1_src_copy = pu1_src;
+ pu1_dst_copy = pu1_dst;
+
+ for(col = 0; col < wd; col += 4)
+ {
+
+ pu1_src = pu1_src_copy + col;
+ pu1_dst = pu1_dst_copy + col;
+
+            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 8 pixel values */
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+ s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+ /*load 8 pixel values */
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+ /*load 8 pixel values */
+ s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+ s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+            s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+            s6_8x16b = _mm_and_si128(s5_8x16b, mask_low_32b);
+            s7_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b);
+            s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
+ /* store 8 8-bit output values */
+ /* Store the output pixels of row 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
+
+ /* ROW 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+ s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+            s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+            s26_8x16b = _mm_and_si128(s25_8x16b, mask_low_32b);
+            s27_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b);
+            s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
+ /* store 8 8-bit output values */
+ /* Store the output pixels of row 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
+
+
+ /*ROW 1*/
+ s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+ s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+ s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+ s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+            s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+            s16_8x16b = _mm_and_si128(s15_8x16b, mask_low_32b);
+            s17_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b);
+            s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
+ /* store 8 8-bit output values */
+ /* Store the output pixels of row 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s18_8x16b);
+
+
+ /* ROW 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+ s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+            s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+            s36_8x16b = _mm_and_si128(s35_8x16b, mask_low_32b);
+            s37_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b);
+            s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
+
+ /* store 8 8-bit output values */
+            /* Store the output pixels of row 3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
+
+ pu1_src += (8 * src_strd);
+ pu1_dst += (4 * dst_strd);
+
+ for(row = 4; row < ht; row += 4)
+ {
+
+                PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+
+ s3_0_16x8b = s3_2_16x8b;
+ s3_1_16x8b = s3_3_16x8b;
+ s3_2_16x8b = s3_4_16x8b;
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 4)th row*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_0_16x8b = s4_2_16x8b;
+ s4_1_16x8b = s4_3_16x8b;
+ s4_2_16x8b = s4_4_16x8b;
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+ s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+ s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+ s6_8x16b = _mm_and_si128(s5_8x16b, mask_low_32b);
+ s7_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b);
+ s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
+
+ /* store 8 8-bit output values */
+                /* Store the output pixels of cur_row*/
+ _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
+
+ /* row + 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 5)th row*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+                /*load 8 pixel values from (cur_row + 6)th row*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+ s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+ s26_8x16b = _mm_and_si128(s25_8x16b, mask_low_32b);
+ s27_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b);
+ s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of (cur_row+2)*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
+
+
+ /*row + 1*/
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+ s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+ s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ s16_8x16b = _mm_and_si128(s15_8x16b, mask_low_32b);
+ s17_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b);
+ s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of (cur_row + 1)*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s18_8x16b);
+
+
+ /* row + 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+                /*load 8 pixel values from (cur_row + 7)th row*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+ s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+ s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+ s36_8x16b = _mm_and_si128(s35_8x16b, mask_low_32b);
+ s37_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b);
+ s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
+
+ /* store 8 8-bit output values */
+ /* Store the output pixels of (cur_row+3)*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
+
+ s2_10_16x8b = s2_3_16x8b;
+
+ pu1_src += 4 * src_strd; /* pointer update */
+ pu1_dst += 4 * dst_strd; /* pointer update */
+ }
+ }
+ }
+}
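+
+/* For reference, a minimal scalar model of the vertical kernel above
+ * (an illustrative sketch kept compiled out; the helper name
+ * ihevc_inter_pred_luma_vert_ref is illustrative only, not part of the
+ * decoder). The taps run down a column, i.e. over
+ * pu1_src[(i - 3) * src_strd], with the same rounding, shift and clip as
+ * the horizontal kernel. */
+#if 0
+static void ihevc_inter_pred_luma_vert_ref(UWORD8 *pu1_src,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 src_strd,
+                                           WORD32 dst_strd,
+                                           WORD8 *pi1_coeff,
+                                           WORD32 ht,
+                                           WORD32 wd)
+{
+    WORD32 row, col, i;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 i2_tmp = 0;
+            for(i = 0; i < 8; i++) /* 8-tap HEVC luma filter */
+                i2_tmp += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
+            i2_tmp = (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH;
+            pu1_dst[col] = (UWORD8)((i2_tmp < 0) ? 0 : ((i2_tmp > 255) ? 255 : i2_tmp));
+        }
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+#endif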
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for copy 16bit output
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* by 'pu1_src' to the location pointed by 'pi2_dst'. The output is upshifted
+* by 6 bits and is used as input for vertical filtering or weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_inter_pred_luma_copy_w16out_ssse3(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ __m128i s3, zero_8x16b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+ UNUSED(pi1_coeff);
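+    /* pi1_coeff is not used by the copy variant; the parameter only keeps
+     * the signature uniform with the other inter-prediction kernels. */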
+ zero_8x16b = _mm_setzero_si128();
+/* outer for loop starts from here */
+ if(wd % 8 == 0) /* wd = multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ int offset = 0;
+ for(col = 0; col < wd; col += 8)
+ {
+/* row =0 */
+ /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+ _mm_store_si128((__m128i *)(pi2_dst + offset), s3);
+
+/* row =1 */
+                /*load 16 pixel values from 15:0 pos. of the next row (cur. pos. + src_strd)*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+ _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+                offset += 8; /* offset update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pi2_dst += 2 * dst_strd; /* pointer update */
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ int offset = 0;
+ for(col = 0; col < wd; col += 4)
+ {
+/* row =0 */
+ /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
+
+/* row =1 */
+                /*load 16 pixel values from 15:0 pos. of the next row (cur. pos. + src_strd)*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+                s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
+                offset += 4; /* offset update */
+ } /* inner for loop ends here(4-output values in single iteration) */
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pi2_dst += 2 * dst_strd; /* pointer update */
+ }
+ }
+
+}
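+
+/* For reference, a minimal scalar model of the copy kernel above (an
+ * illustrative sketch kept compiled out; the helper name is illustrative
+ * only). SHIFT_14_MINUS_BIT_DEPTH is 6 for 8-bit content, so each pixel is
+ * simply promoted to 16 bits and upshifted. */
+#if 0
+static void ihevc_inter_pred_luma_copy_w16out_ref(UWORD8 *pu1_src,
+                                                  WORD16 *pi2_dst,
+                                                  WORD32 src_strd,
+                                                  WORD32 dst_strd,
+                                                  WORD32 ht,
+                                                  WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+            pi2_dst[col] = (WORD16)(pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}
+#endif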
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for horizontal 16bit output
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed by 'pu1_src' and writes to the location pointed
+* by 'pi2_dst'. No downshifting or clipping is done and the output is used
+* as an input for vertical filtering or weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_luma_horz_w16out_ssse3(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+
+ /* all 128 bit registers are named with a suffix mxnb, where m is the */
+ /* number of n bits packed in the register */
+
+ __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+ __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+ __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b;
+ __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+ __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+    /* load the 8 8-bit filter coefficients and replicate each coefficient pair */
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+ control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+ control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[0], pi1_coeff[1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[2], pi1_coeff[3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4], pi1_coeff[5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[6], pi1_coeff[7] replicated */
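+
+    /* After the shuffles above, coeff0_1_8x16b holds the byte pair (c0,c1)
+     * replicated eight times, coeff2_3_8x16b holds (c2,c3), and so on.
+     * _mm_maddubs_epi16 multiplies interleaved unsigned pixels by these
+     * signed coefficient pairs and sums adjacent products, so each call
+     * below contributes two taps of the 8-tap dot product per output pixel. */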
+
+ if(0 == (ht & 1)) /* ht multiple of 2 case */
+ {
+
+ if(0 == (wd & 7)) /* wd = multiple of 8 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+
+ int offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */
+ src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */
+ /* row = 1 */
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 1 */
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 1 */
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */
+
+ res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+ res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+ res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                    /* store the 8 16-bit results of rows 0 and 1 */
+ _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+ _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 8; /* offset update */
+ }
+ pu1_src += 2 * src_strd; /* pointer updates*/
+ pi2_dst += 2 * dst_strd; /* pointer updates*/
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ int offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */
+ src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */
+ /* row = 1 */
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 1 */
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 1 */
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */
+
+ res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+ res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+ res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+ /* to store the 1st 4 pixels res. */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
+
+                    offset += 4; /* offset update */
+ }
+ pu1_src += 2 * src_strd; /* Pointer update */
+ pi2_dst += 2 * dst_strd; /* Pointer update */
+ }
+ }
+ }
+ else /* odd ht */
+ {
+ if(0 == (wd & 7)) /* multiple of 8 case */
+ {
+ for(row = 0; row < ht; row++)
+ {
+ int offset = 0;
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |9:2|8:1| to do two dot-products at same time*/          /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                    /* pix. |11:4|10:3| to do two dot-products at same time*/        /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                    /* store the 8 16-bit results */
+ _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+
+                    offset += 8; /* offset update */
+ }
+ pu1_src += src_strd; /* pointer updates*/
+ pi2_dst += dst_strd; /* pointer updates*/
+ }
+ }
+ else /* wd = multiple of 4 case */
+ {
+ for(row = 0; row < (ht - 1); row += 2)
+ {
+ int offset = 0;
+
+
+                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                /* pix. |9:2|8:1| to do two dot-products at same time*/                /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                /* pix. |11:4|10:3| to do two dot-products at same time*/              /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */
+ src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */
+ /* row = 1 */
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */
+ src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                /* pix. |9:2|8:1| to do two dot-products at same time*/                /* row = 1 */
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */
+
+ src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */
+ src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */
+                /* pix. |11:4|10:3| to do two dot-products at same time*/              /* row = 1 */
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
+ res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */
+
+ res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+ res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
+ res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
+
+                /* store the 4 filtered results of rows 0 and 1 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
+
+                offset += 4; /* offset update */
+ }
+ pu1_src += 2 * src_strd; /* Pointer update */
+ pi2_dst += 2 * dst_strd; /* Pointer update */
+ }
+        { /* last (odd) row, handled outside the loop */
+            WORD32 offset = 0;
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */
+
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */
+ /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */
+ src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */
+ /* row = 0 */
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+ /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */
+ src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                /* pix. |9:2|8:1| to do two dot-products at same time*/                /* row = 0 */
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */
+
+ src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */
+ src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */
+                /* pix. |11:4|10:3| to do two dot-products at same time*/              /* row = 0 */
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
+ res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */
+
+ res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
+ res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
+
+                /* store the 4 filtered results */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
+
+                offset += 4; /* offset update */
+ }
+ }
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction luma filter for vertical 16bit output
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed to by 'pu1_src' and writes to the location pointed to
+* by 'pi2_dst'. No downshifting or clipping is done; the output is used as
+* an input for weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
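+/* Reference-only scalar sketch (not part of the original source): a minimal
+ * model of what the SSSE3 kernel below computes, assuming the 8 taps apply
+ * to rows -3..+4 around the current row, matching the loads below. Kept
+ * under #if 0 so it is never compiled; it only documents the arithmetic.
+ */
+#if 0
+static void ref_luma_vert_w16out(UWORD8 *pu1_src, WORD16 *pi2_dst,
+                                 WORD32 src_strd, WORD32 dst_strd,
+                                 WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col, i;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 sum = 0;
+            for(i = 0; i < 8; i++)
+                sum += pi1_coeff[i] * (WORD32)pu1_src[col + (i - 3) * src_strd];
+            pi2_dst[col] = (WORD16)sum; /* no downshift, no clipping */
+        }
+        pu1_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}
+#endif
+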
+void ihevc_inter_pred_luma_vert_w16out_ssse3(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_copy;
+ WORD16 *pi2_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+ __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b;
+ __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+ __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+ __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+ __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b;
+ __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b;
+ __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b;
+
+
+ __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
+
+/* load 8 8-bit coefficients; they stay 8-bit, paired for maddubs */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
+ control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
+ control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
+
+    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[0..1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[2..3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4..5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[6..7] replicated */
+
+
+/* outer for loop starts from here */
+ if((wd % 8) == 0)
+ { /* wd = multiple of 8 case */
+
+ pu1_src_copy = pu1_src;
+ pi2_dst_copy = pi2_dst;
+
+ for(col = 0; col < wd; col += 8)
+ {
+
+ pu1_src = pu1_src_copy + col;
+ pi2_dst = pi2_dst_copy + col;
+
+ PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+ /*load 8 pixel values */
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+ s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+ /*load 8 pixel values */
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+ /*load 8 pixel values */
+ s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+            /* store the 8 16-bit output values of row 0 */
+ _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
+
+ /* ROW 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+            /* store the 8 16-bit output values of row 2 */
+ _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+ /*ROW 1*/
+ s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+ s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+ s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+
+            /* store the 8 16-bit output values of row 1 */
+ _mm_store_si128((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
+
+
+ /* ROW 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+
+            /* store the 8 16-bit output values of row 3 */
+ _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+ pu1_src += (8 * src_strd);
+ pi2_dst += (4 * dst_strd);
+
+ for(row = 4; row < ht; row += 4)
+ {
+
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+ s3_0_16x8b = s3_2_16x8b;
+ s3_1_16x8b = s3_3_16x8b;
+ s3_2_16x8b = s3_4_16x8b;
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 4)th row*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_0_16x8b = s4_2_16x8b;
+ s4_1_16x8b = s4_3_16x8b;
+ s4_2_16x8b = s4_4_16x8b;
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+                /* store the 8 16-bit output values of the current row */
+ _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
+
+ /* row + 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 5)th row*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ /*load 8 pixel values from (cur_row + 6)th row*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+                /* store the 8 16-bit output values of (cur_row + 2) */
+ _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+ /*row + 1*/
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+
+                /* store the 8 16-bit output values of (cur_row + 1) */
+ _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
+
+
+ /* row + 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 7)th row*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+                /* store the 8 16-bit output values of (cur_row + 3) */
+ _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+ s2_10_16x8b = s2_3_16x8b;
+
+
+ pu1_src += 4 * src_strd; /* pointer update */
+ pi2_dst += 4 * dst_strd; /* pointer update */
+ }
+ }
+ }
+    else /* wd = multiple of 4 case */
+ {
+
+ pu1_src_copy = pu1_src;
+ pi2_dst_copy = pi2_dst;
+
+ for(col = 0; col < wd; col += 4)
+ {
+
+ pu1_src = pu1_src_copy + col;
+ pi2_dst = pi2_dst_copy + col;
+
+ PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
+
+ /*load 8 pixel values */
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
+
+ s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+ /*load 8 pixel values */
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+ /*load 8 pixel values */
+ s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
+
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
+
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+            /* store the 4 16-bit output values of row 0 */
+ _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
+
+ /* ROW 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
+
+ /*load 8 pixel values */
+ s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
+
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+            /* store the 4 16-bit output values of row 2 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+ /*ROW 1*/
+ s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+ s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
+
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+ s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
+
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
+
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+
+            /* store the 4 16-bit output values of row 1 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
+
+
+ /* ROW 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values */
+ s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
+
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+            /* store the 4 16-bit output values of row 3 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+ pu1_src += (8 * src_strd);
+ pi2_dst += (4 * dst_strd);
+
+ for(row = 4; row < ht; row += 4)
+ {
+
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+ s3_0_16x8b = s3_2_16x8b;
+ s3_1_16x8b = s3_3_16x8b;
+ s3_2_16x8b = s3_4_16x8b;
+
+ s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
+ s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
+ s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 4)th row*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+
+ s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
+ s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_0_16x8b = s4_2_16x8b;
+ s4_1_16x8b = s4_3_16x8b;
+ s4_2_16x8b = s4_4_16x8b;
+
+ s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
+
+                /* store the 4 16-bit output values of the current row */
+ _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
+
+ /* row + 2*/
+ s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 5)th row*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
+
+ /*load 8 pixel values from (cur_row + 6)th row*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+ s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
+
+ s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
+
+                /* store the 4 16-bit output values of (cur_row + 2) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
+
+
+ /*row + 1*/
+ s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
+ s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
+ s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+ s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
+ s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
+
+                /* store the 4 16-bit output values of (cur_row + 1) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
+
+
+ /* row + 3*/
+ s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 8 pixel values from (cur_row + 7)th row*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+ s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
+
+ s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
+
+                /* store the 4 16-bit output values of (cur_row + 3) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
+
+ s2_10_16x8b = s2_3_16x8b;
+
+ pu1_src += 4 * src_strd; /* pointer update */
+ pi2_dst += 4 * dst_strd; /* pointer update */
+ }
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Luma vertical filter for 16bit input.
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pi2_src' and writes to the location pointed
+*    to by 'pu1_dst'. The input is 16 bits; the filter output is downshifted
+*    by 12 and clipped to lie between 0 and 255.
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
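+/* Reference-only scalar sketch (not part of the original source): a model of
+ * the 16-bit-input path below. At 8-bit depth, SHIFT_14_MINUS_BIT_DEPTH is 6
+ * and OFFSET_14_MINUS_BIT_DEPTH is the matching rounding offset, giving the
+ * total downshift of 12 described above. Kept under #if 0; never compiled.
+ */
+#if 0
+static void ref_luma_vert_w16inp(WORD16 *pi2_src, UWORD8 *pu1_dst,
+                                 WORD32 src_strd, WORD32 dst_strd,
+                                 WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col, i;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 sum = 0;
+            for(i = 0; i < 8; i++)
+                sum += pi1_coeff[i] * (WORD32)pi2_src[col + (i - 3) * src_strd];
+            sum = sum >> SHIFT_14_MINUS_BIT_DEPTH; /* first downshift */
+            sum = (sum + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH; /* round + second downshift */
+            pu1_dst[col] = CLIP_U8(sum);
+        }
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+#endif
+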
+void ihevc_inter_pred_luma_vert_w16inp_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD16 *pi2_src_copy;
+ UWORD8 *pu1_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+ __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
+ __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+ __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+ __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+ __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
+ __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
+ __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;
+
+ __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
+
+/* load 8 8-bit coefficients and convert 8-bit into 16-bit */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ zero_8x16b = _mm_setzero_si128();
+ sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+ s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* pi1_coeff[0..1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* pi1_coeff[2..3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* pi1_coeff[4..5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* pi1_coeff[6..7] replicated */
+
+
+/* setting values in registers */
+    offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); /* zeroes the low 32 bits: keeps old dst bytes 4..7 */
+    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); /* zeroes the high 96 bits: keeps the 4 new result bytes */
+
+
+ pi2_src_copy = pi2_src;
+ pu1_dst_copy = pu1_dst;
+
+/* outer for loop starts from here */
+ for(col = 0; col < wd; col += 4)
+ {
+
+ pi2_src = pi2_src_copy + col;
+ pu1_dst = pu1_dst_copy + col;
+
+ /*load 4 pixel values */
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));
+
+ /*load 4 pixel values */
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));
+
+ s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+
+ s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+ /*load 4 pixel values */
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+ /*load 4 pixel values */
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+ s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+ s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+ /*load 4 pixel values */
+ s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+ /*load 4 pixel values */
+ s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+ s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);
+
+ s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values */
+ s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+ /*load 4 pixel values */
+ s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));
+
+ s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);
+
+ s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi32(s9_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+ s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+ s5_8x16b = _mm_and_si128(s4_8x16b, mask_low_32b);
+ s6_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b);
+ s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+        /* store the 4 8-bit output pixels of row 0 (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+ /* ROW 2*/
+ s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values */
+ s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));
+
+ /*load 4 pixel values */
+ s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));
+
+ s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);
+
+ s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s28_8x16b = _mm_srai_epi32(s29_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+ s25_8x16b = _mm_and_si128(s24_8x16b, mask_low_32b);
+ s26_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b);
+ s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);
+
+        /* store the 4 8-bit output pixels of row 2 (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+ /*ROW 1*/
+ s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+ s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+ s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);
+
+ s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+ s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);
+
+ s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);
+
+ s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s18_8x16b = _mm_srai_epi32(s19_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+ s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (dst_strd)));
+ s15_8x16b = _mm_and_si128(s14_8x16b, mask_low_32b);
+ s16_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b);
+ s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);
+
+        /* store the 4 8-bit output pixels of row 1 (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
+
+
+ /* ROW 3*/
+ s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values */
+ s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));
+
+ s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);
+
+ s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s38_8x16b = _mm_srai_epi32(s39_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+ s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+ s35_8x16b = _mm_and_si128(s34_8x16b, mask_low_32b);
+ s36_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b);
+ s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);
+
+        /* store the 4 8-bit output pixels of row 3 (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+ pi2_src += (8 * src_strd);
+ pu1_dst += (4 * dst_strd);
+
+ for(row = 4; row < ht; row += 4)
+ {
+
+ s3_0_16x8b = s3_2_16x8b;
+ s3_1_16x8b = s3_3_16x8b;
+ s3_2_16x8b = s3_4_16x8b;
+
+ s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+ s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+ s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values from (cur_row + 4)th row*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));
+
+ s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
+ s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_0_16x8b = s4_2_16x8b;
+ s4_1_16x8b = s4_3_16x8b;
+ s4_2_16x8b = s4_4_16x8b;
+
+ s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi32(s9_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
+
+ s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+ s5_8x16b = _mm_and_si128(s4_8x16b, mask_low_32b);
+ s6_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b);
+ s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+            /* store the 4 8-bit output pixels of the current row (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+/* row + 2*/
+ s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values from (cur_row + 5)th row*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+
+ /*load 4 pixel values from (cur_row + 6)th row*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+ /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+ s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+ s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s28_8x16b = _mm_srai_epi32(s29_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
+
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
+ s25_8x16b = _mm_and_si128(s24_8x16b, mask_low_32b);
+ s26_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b);
+ s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);
+
+            /* store the 4 8-bit output pixels of (cur_row + 2) (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
+
+
+/*row + 1*/
+ s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+ s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+ s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+ s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+ s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s18_8x16b = _mm_srai_epi32(s19_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
+
+ s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ s15_8x16b = _mm_and_si128(s14_8x16b, mask_low_32b);
+ s16_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b);
+ s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);
+
+            /* store the 4 8-bit output pixels of (cur_row + 1) (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
+
+
+/* row + 3*/
+ s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values from (cur_row + 7)th row*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+ /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+ s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+ s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
+ s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
+ s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s38_8x16b = _mm_srai_epi32(s39_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
+
+ s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
+ s35_8x16b = _mm_and_si128(s34_8x16b, mask_low_32b);
+ s36_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b);
+ s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);
+
+            /* store the 4 8-bit output pixels of (cur_row + 3) (other dst bytes preserved) */
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
+
+ s2_10_16x8b = s2_3_16x8b;
+
+ pi2_src += 4 * src_strd; /* pointer update */
+ pu1_dst += 4 * dst_strd; /* pointer update */
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Luma prediction filter for vertical 16bit input & output
+*
+* @par Description:
+*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+*    the elements pointed to by 'pi2_src' and writes to the location pointed
+*    to by 'pi2_dst'. The input is 16 bits; the filter output is downshifted
+*    by 6 and 8192 is subtracted to store it as a 16-bit number. The output
+*    is used as an input to weighted prediction.
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
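+/* Reference-only scalar sketch (not part of the original source): a model of
+ * the 16-bit in/out path below: one downshift by SHIFT_14_MINUS_BIT_DEPTH
+ * (6 at 8-bit depth) followed by subtracting OFFSET14 (the 8192 mentioned
+ * above). Kept under #if 0; never compiled.
+ */
+#if 0
+static void ref_luma_vert_w16inp_w16out(WORD16 *pi2_src, WORD16 *pi2_dst,
+                                        WORD32 src_strd, WORD32 dst_strd,
+                                        WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col, i;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 sum = 0;
+            for(i = 0; i < 8; i++)
+                sum += pi1_coeff[i] * (WORD32)pi2_src[col + (i - 3) * src_strd];
+            pi2_dst[col] = (WORD16)((sum >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14);
+        }
+        pi2_src += src_strd;
+        pi2_dst += dst_strd;
+    }
+}
+#endif
+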
+void ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD16 *pi2_src_copy;
+ WORD16 *pi2_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
+ __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
+ __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
+ __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
+ __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
+ __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
+ __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
+ __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;
+
+ __m128i zero_8x16b, offset_8x16b, sign_reg;
+
+/* load 8 8-bit coefficients and convert 8-bit into 16-bit */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ zero_8x16b = _mm_setzero_si128();
+ sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+ s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* pi1_coeff[0..1] replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* pi1_coeff[2..3] replicated */
+
+    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* pi1_coeff[4..5] replicated */
+    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* pi1_coeff[6..7] replicated */
+
+
+/* setting values in registers */
+    offset_8x16b = _mm_set1_epi32(OFFSET14); /* offset, subtracted after the downshift */
+
+ pi2_src_copy = pi2_src;
+ pi2_dst_copy = pi2_dst;
+
+/* outer for loop starts from here */
+ for(col = 0; col < wd; col += 4)
+ {
+
+ pi2_src = pi2_src_copy + col;
+ pi2_dst = pi2_dst_copy + col;
+
+ /*load 4 pixel values*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));
+
+ /*load 4 pixel values*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));
+
+ s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+
+ s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+
+ /*load 4 pixel values*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+ /*load 4 pixel values*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+ s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+ s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+
+ /*load 4 pixel values*/
+ s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+ /*load 4 pixel values*/
+ s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+ s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);
+
+ s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values*/
+ s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+ /*load 4 pixel values*/
+ s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));
+
+ s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);
+
+ s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+        s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* i4_tmp - OFFSET14 (8192 subtracted) */
+        s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);
+
+ s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);
+
+        /* store the 4 16-bit output values of row 0 */
+ _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
+
+ /* ROW 2*/
+ s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values*/
+ s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));
+
+ /*load 4 pixel values*/
+ s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));
+
+ s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);
+
+ s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+        s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* i4_tmp - OFFSET14 (8192 subtracted) */
+        s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);
+
+ s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);
+
+        /* store the 4 16-bit output values of row 2 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);
+
+
+ /*ROW 1*/
+ s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+ s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+
+ s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);
+
+ s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+
+ s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);
+
+ s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);
+
+ s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+        s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* i4_tmp - OFFSET14 (8192 subtracted) */
+        s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);
+
+ s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);
+
+        /* store the 4 16-bit output values of row 1 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s18_8x16b);
+
+
+ /* ROW 3*/
+ s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values*/
+ s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));
+
+ s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);
+
+ s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+        s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+        /* i4_tmp - OFFSET14 (8192 subtracted) */
+        s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);
+
+ s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);
+
+        /* store the 4 16-bit output values of row 3 */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);
+
+ pi2_src += (8 * src_strd);
+ pi2_dst += (4 * dst_strd);
+
+ for(row = 4; row < ht; row += 4)
+ {
+
+ s3_0_16x8b = s3_2_16x8b;
+ s3_1_16x8b = s3_3_16x8b;
+ s3_2_16x8b = s3_4_16x8b;
+
+ s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
+ s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
+ s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values from (cur_row + 4)th row*/
+ s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));
+
+ s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
+ s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
+
+ s4_0_16x8b = s4_2_16x8b;
+ s4_1_16x8b = s4_3_16x8b;
+ s4_2_16x8b = s4_4_16x8b;
+
+ s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
+ s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
+ s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+            s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i4_tmp - OFFSET14 (8192 subtracted) */
+            s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);
+
+ s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);
+
+            /* store the 4 16-bit output values of the current row */
+ _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
+
+/* row + 2*/
+ s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
+ s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
+ s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values from (cur_row + 5)th row*/
+ s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+
+ /*load 4 pixel values from (cur_row + 6)th row*/
+ s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+ /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
+ s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
+
+ s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
+
+ s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
+ s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
+ s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+            s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i4_tmp - OFFSET14 (8192 subtracted) */
+            s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);
+
+ s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);
+
+            /* store the 4 16-bit output values of (cur_row + 2) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);
+
+
+/*row + 1*/
+ s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
+ s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
+ s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
+
+ /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
+ s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
+ s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
+
+ s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
+ s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
+ s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+            s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i4_tmp - OFFSET14 (8192 subtracted) */
+            s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);
+
+ s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);
+
+            /* store the 4 16-bit output values of (cur_row + 1) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s18_8x16b);
+
+
+/* row + 3*/
+ s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
+ s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
+ s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
+
+ /*load 4 pixel values from (cur_row + 7)th row*/
+ s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+ /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
+ s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
+
+ s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
+
+ s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
+ s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
+
+            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (downshift by 6) */
+            s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* i4_tmp - OFFSET14 (8192 subtracted) */
+            s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);
+
+ s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);
+
+            /* store the 4 16-bit output values of (cur_row + 3) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);
+
+ s2_10_16x8b = s2_3_16x8b;
+
+ pi2_src += 4 * src_strd; /* pointer update */
+ pi2_dst += 4 * dst_strd; /* pointer update */
+ }
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for copy
+*
+* @par Description:
+*    Copies the array of width 'wd' and height 'ht' from the location pointed
+*    to by 'pu1_src' to the location pointed to by 'pu1_dst'.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
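+/* Reference-only scalar sketch (not part of the original source): a model of
+ * the copy below. The SSSE3 loop bounds run to 2 * wd, which suggests 'wd'
+ * counts interleaved Cb/Cr sample pairs; that assumption is made here too.
+ * Kept under #if 0; never compiled.
+ */
+#if 0
+static void ref_chroma_copy(UWORD8 *pu1_src, UWORD8 *pu1_dst,
+                            WORD32 src_strd, WORD32 dst_strd,
+                            WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < 2 * wd; col++) /* 2 * wd: interleaved Cb/Cr bytes */
+            pu1_dst[col] = pu1_src[col];
+        pu1_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+#endif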
+
+void ihevc_inter_pred_chroma_copy_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ __m128i s3, mask_4x32b;
+ UNUSED(pi1_coeff);
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+    mask_4x32b = _mm_set_epi32(0, 0, 0, 0x80808080); /* mask: high bit set in the 4 low bytes selects them for maskmoveu */
+
+/* for loop starts from here */
+ if(wd % 8 == 0)
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+            WORD32 offset = 0;
+ for(col = 0; col < 2 * wd; col += 16)
+ {
+/* row =0 */
+
+ /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
+ /* storing 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
+
+/* row =1 */
+                /*load 16 pixel values from the next row (src_strd ahead)*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
+                /* storing 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]*/
+
+ offset += 16; /*To pointer update */
+ } /* inner for loop ends here(16-output values in single iteration) */
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pu1_dst += 2 * dst_strd; /* pointer update */
+ }
+ }
+ else if(wd % 4 == 0)
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ WORD32 offset = 0;
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+/* row = 0 */
+ /*load 16 pixel values from the current row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
+ /* storing 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
+/* row = 1 */
+ /*load 16 pixel values from the next row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
+ /* storing 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
+
+ offset += 8; /* offset update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pu1_dst += 2 * dst_strd; /* pointer update */
+ }
+ }
+ else
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ WORD32 offset = 0;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+/* row = 0 */
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+ /* storing four 8-bit output values */
+ _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + offset)); /* pu1_dst[col] = pu1_src[col]; */
+/* row = 1 */
+ /* pu1_src[col] */
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+
+ /* storing four 8-bit output values */
+ _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + dst_strd + offset)); /* pu1_dst[col] = pu1_src[col]; */
+
+ offset += 4; /* offset update */
+ } /* inner for loop ends here(4-output values in single iteration) */
+
+ pu1_src += 2 * src_strd; /* pointer increment */
+ pu1_dst += 2 * dst_strd; /* pointer increment */
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for horizontal input
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed to by 'pu1_src' and writes to the location pointed
+* to by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
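+/* A scalar sketch of what this routine vectorizes (illustrative only;
+* CLIP_U8 and the SHIFT/OFFSET macros are the ones used in this file).
+* The 4 taps are spaced 2 apart because Cb/Cr samples are interleaved:
+*
+*     for(row = 0; row < ht; row++)
+*         for(col = 0; col < 2 * wd; col++)
+*         {
+*             WORD32 i, i2_tmp = 0;
+*             for(i = 0; i < 4; i++)
+*                 i2_tmp += pi1_coeff[i] * pu1_src[row * src_strd + col + (i - 1) * 2];
+*             pu1_dst[row * dst_strd + col] =
+*                 CLIP_U8((i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH);
+*         }
+*/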
+void ihevc_inter_pred_chroma_horz_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+ __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+ __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+ __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+ __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b;
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+
+/* loading four 8-bit coefficients */
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+ mask_low_32b = _mm_cmpeq_epi16(offset_8x16b, offset_8x16b); /* all ones */
+ mask_high_96b = _mm_srli_si128(mask_low_32b, 12); /* selects the low 4 bytes */
+ mask_low_32b = _mm_slli_si128(mask_low_32b, 4); /* selects the high 12 bytes */
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* replicates coefficient pair 0, 1 */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* replicates coefficient pair 2, 3 */
+
+ coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b); /* pi1_coeff[0], pi1_coeff[1] repeated */
+ coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b); /* pi1_coeff[2], pi1_coeff[3] repeated */
+
+/* outer for loop starts from here */
+ if(wd % 2 == 0 && wd % 4 != 0) /* wd is even but not a multiple of 4 */
+ {
+
+ for(row = 0; row < ht; row += 2)
+ {
+ WORD32 offset = 0;
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+
+ /*load 16 pixel values from row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*load 16 pixel values from row 1*/
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*Derive the source pixels for processing the 2nd pixel*/
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel*/
+ src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel*/
+ src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+ /*Derive the source pixels for processing the 2nd pixel*/
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel*/
+ src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+ /*Derive the source pixels for processing the 4th pixel*/
+ src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+ res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
+ res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
+ res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
+ res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
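+ /* _mm_maddubs_epi16 multiplies unsigned source bytes with signed
+ coefficient bytes and adds adjacent products, so each 16-bit lane
+ already holds a two-tap partial sum */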
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+ res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b); /* rows 0 and 1: add offset */
+ res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* rows 0 and 1: downshift */
+ res_temp13_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b); /* rows 0 and 1: clip to 8 bits */
+
+ res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 4);
+
+ res_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
+ res_temp5_8x16b = _mm_and_si128(res_temp4_8x16b, mask_low_32b);
+ res_temp6_8x16b = _mm_and_si128(res_temp13_8x16b, mask_high_96b);
+ res_temp7_8x16b = _mm_or_si128(res_temp5_8x16b, res_temp6_8x16b);
+
+ /* store 4 8-bit output values, preserving the other dst bytes */
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp7_8x16b); /* pu1_dst[col] = i2_tmp_u */
+
+ res_temp14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
+ res_temp15_8x16b = _mm_and_si128(res_temp14_8x16b, mask_low_32b);
+ res_temp16_8x16b = _mm_and_si128(res_temp3_8x16b, mask_high_96b);
+ res_temp17_8x16b = _mm_or_si128(res_temp15_8x16b, res_temp16_8x16b);
+
+ /* store 4 8-bit output values, preserving the other dst bytes */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp17_8x16b); /* pu1_dst[col] = i2_tmp_u */
+
+
+ offset += 4; /* offset update */
+
+ } /* inner loop ends here(8- output values in single iteration)*/
+
+ pu1_src += 2 * src_strd; /*pointer update*/
+ pu1_dst += 2 * dst_strd; /*pointer update*/
+ }
+ }
+ else
+ {
+
+ for(row = 0; row < ht; row += 2)
+ {
+ WORD32 offset = 0;
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ /*load 16 pixel values from row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*load 16 pixel values from row 1*/
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*Derive the source pixels for processing the 2nd pixel*/
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel*/
+ src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel*/
+ src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+
+ res_temp4_8x16b = _mm_adds_epi16(res_temp3_8x16b, offset_8x16b); /* row = 0 */
+ res_temp5_8x16b = _mm_srai_epi16(res_temp4_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
+ res_temp6_8x16b = _mm_packus_epi16(res_temp5_8x16b, res_temp5_8x16b); /* row = 0 */
+
+ /* store 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp6_8x16b); /* pu1_dst[col] = i2_tmp_u */
+
+ /*Derive the source pixels for processing the 2nd pixel of row 1*/
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 1*/
+ src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 1*/
+ src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+ res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b); /* row = 1 */
+ res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
+ res_temp16_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b); /* row = 1 */
+
+ /* store 8 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp16_8x16b); /* pu1_dst[col] = i2_tmp_u */
+
+
+ offset += 8; /* offset update */
+
+ } /* inner loop ends here(8- output values in single iteration)*/
+
+ pu1_src += 2 * src_strd; /*pointer update*/
+ pu1_dst += 2 * dst_strd; /*pointer update*/
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for vertical input
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed to by 'pu1_src' and writes to the location pointed
+* to by 'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
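+/* A scalar sketch of what this routine vectorizes (illustrative only;
+* CLIP_U8 and the SHIFT/OFFSET macros are the ones used in this file).
+* The 4 taps step through rows, one source row per tap:
+*
+*     for(row = 0; row < ht; row++)
+*         for(col = 0; col < 2 * wd; col++)
+*         {
+*             WORD32 i, i2_tmp = 0;
+*             for(i = 0; i < 4; i++)
+*                 i2_tmp += pi1_coeff[i] * pu1_src[(row + i - 1) * src_strd + col];
+*             pu1_dst[row * dst_strd + col] =
+*                 CLIP_U8((i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH);
+*         }
+*/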
+void ihevc_inter_pred_chroma_vert_ssse3(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_copy;
+ UWORD8 *pu1_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+ __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+ __m128i control_mask_1_8x16b, control_mask_2_8x16b;
+ __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+ __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
+ __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+ __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+/* load the four 8-bit filter coefficients */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* replicates coefficient pair 0, 1 */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* replicates coefficient pair 2, 3 */
+
+ coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b); /* pi1_coeff[0], pi1_coeff[1] repeated */
+ coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b); /* pi1_coeff[2], pi1_coeff[3] repeated */
+
+
+/* setting values in registers */
+ zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
+ offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+ mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); /* selects the high 12 bytes */
+ mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); /* selects the low 4 bytes */
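+ /* the two masks let the 4-pixel-wide path blend new results into an
+ existing 8-byte destination word without disturbing neighbouring pixels */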
+
+/* outer for loop starts from here */
+ if(wd % 8 == 0)
+ { /* wd = multiple of 8 case */
+
+ pu1_src_copy = pu1_src;
+ pu1_dst_copy = pu1_dst;
+
+ for(col = 0; col < 2 * wd; col += 16)
+ {
+
+ pu1_src = pu1_src_copy + col;
+ pu1_dst = pu1_dst_copy + col;
+
+
+ for(row = 0; row < ht; row += 2)
+ {
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 16 pixel values from row (cur_row - 1)*/
+ s21_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 16 pixel values from row (cur_row)*/
+ s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));
+
+
+ /*load 16 pixel values from row (cur_row + 1)*/
+ s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 16 pixel values from row (cur_row + 2)*/
+ s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);
+
+ s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of partial products */
+
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
+
+ s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+ s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s32_8x16b = _mm_srai_epi16(s31_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+ s33_8x16b = _mm_packus_epi16(s32_8x16b, zero_8x16b);
+
+ s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
+/* store 16 8-bit output values */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storeu_si128((__m128i *)(pu1_dst), s7_8x16b);
+
+
+ s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);
+
+ s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);
+
+ s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of partial products */
+
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* i2_tmp: sum of partial products */
+
+ s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+ s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s32_8x16b = _mm_srai_epi16(s31_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+ s33_8x16b = _mm_packus_epi16(s32_8x16b, zero_8x16b);
+
+ s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
+/* store 16 8-bit output values */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
+
+ pu1_src += 2 * src_strd;
+ pu1_dst += 2 * dst_strd;
+
+
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ }
+ }
+ else if(wd % 4 == 0)
+ { /* wd = multiple of 4 case */
+
+ for(row = 0; row < ht; row += 2)
+ {
+ pu1_src_copy = pu1_src;
+ pu1_dst_copy = pu1_dst;
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 8 pixel values from row (cur_row - 1)*/
+ s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values from row (cur_row)*/
+ s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+ s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ /*load 8 pixel values from row (cur_row + 1)*/
+ s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values from row (cur_row + 2)*/
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+ s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of partial products */
+
+ s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+/* store 8 8-bit output values */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+ s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+ s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of partial products */
+
+ s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+/* store 8 8-bit output values */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
+
+ pu1_src += 8; /* pointer update */
+ pu1_dst += 8;
+
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+ pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
+ }
+ }
+
+ else
+ { /* wd = multiple of 2 case */
+
+ for(row = 0; row < ht; row += 2)
+ {
+ pu1_src_copy = pu1_src;
+ pu1_dst_copy = pu1_dst;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 8 pixel values from row (cur_row - 1)*/
+ s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values from row (cur_row)*/
+ s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+ s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ /*load 8 pixel values from row (cur_row + 1)*/
+ s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values from row (cur_row + 2)*/
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+ s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of partial products */
+
+ s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+ s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+ s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b);
+ s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b);
+ s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store 4 8-bit output values, preserving the other dst bytes */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+ s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+ s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of partial products */
+
+ s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
+
+ s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b);
+ s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b);
+ s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store 4 8-bit output values, preserving the other dst bytes */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);
+
+ pu1_src += 4; /* pointer update */
+ pu1_dst += 4;
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+ pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for copying 16-bit output
+*
+* @par Description:
+* Copies the array of width 'wd' and height 'ht' from the location pointed
+* to by 'src' to the location pointed to by 'dst'. The output is upshifted by
+* 6 bits and is used as input for vertical filtering or weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
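+/* A minimal scalar sketch of what this routine implements (for clarity
+* only; loop variables are illustrative):
+*
+*     for(row = 0; row < ht; row++)
+*         for(col = 0; col < 2 * wd; col++)
+*             pi2_dst[row * dst_strd + col] =
+*                 (WORD16)(pu1_src[row * src_strd + col] << SHIFT_14_MINUS_BIT_DEPTH);
+*/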
+
+void ihevc_inter_pred_chroma_copy_w16out_ssse3(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ __m128i s3, zero_8x16b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ UNUSED(pi1_coeff);
+ zero_8x16b = _mm_setzero_si128();
+/* outer for loop starts from here */
+ if(wd == 2) /* for wd = 2 */
+ {
+ for(row = 0; row < ht; row += 2)
+ {
+ WORD32 offset = 0;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+/* row = 0 */
+ /*load 16 pixel values from the current row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
+
+/* row = 1 */
+ /*load 16 pixel values from the next row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
+ offset += 4; /* offset update */
+ } /* inner for loop ends here */
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pi2_dst += 2 * dst_strd; /* pointer update */
+ }
+ }
+ else if(wd % 2 == 0 && wd % 4 != 0)
+ {
+ for(row = 0; row < ht / 2; row++)
+ {
+ WORD32 offset = 0;
+ WORD32 count = (2 * wd) / 8;
+ for(col = 0; col < count; col++)
+ {
+/* row =0 */
+ /*load 16 pixel values from the current row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+ _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
+
+ /*row = 1*/ /*load 16 pixel values from the next row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+ offset += 8; /* offset update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+/* finding last four values */
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
+
+ /*load 16 pixel values from the next row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pi2_dst += 2 * dst_strd;
+ }
+ }
+ else
+ {
+ for(row = 0; row < ht / 2; row++)
+ {
+ WORD32 offset = 0;
+ for(col = 0; col < 2 * wd / 8; col++)
+ {
+/* row = 0 */
+ /*load 16 pixel values from the current row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH);
+
+ /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
+ _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
+
+ /*row = 1*/ /*load 16 pixel values from the next row*/
+ s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
+ s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
+
+ s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
+ _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3); /* unaligned store: pi2_dst is not guaranteed 16-byte aligned */
+
+ offset += 8; /* offset update */
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src += 2 * src_strd; /* pointer update */
+ pi2_dst += 2 * dst_strd;
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter to store horizontal 16-bit output
+*
+* @par Description:
+* Applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
+* to the elements pointed to by 'pu1_src' and writes to the location pointed
+* to by 'pi2_dst'. No downshifting or clipping is done and the output is used
+* as an input for vertical filtering or weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
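+/* A scalar sketch of what this routine vectorizes (illustrative only).
+* Same 4-tap horizontal filter as the 8-bit version, but the raw sum is
+* stored as 16 bits with no offset, shift or clipping:
+*
+*     for(row = 0; row < ht; row++)
+*         for(col = 0; col < 2 * wd; col++)
+*         {
+*             WORD32 i, i2_tmp = 0;
+*             for(i = 0; i < 4; i++)
+*                 i2_tmp += pi1_coeff[i] * pu1_src[row * src_strd + col + (i - 1) * 2];
+*             pi2_dst[row * dst_strd + col] = (WORD16)i2_tmp;
+*         }
+*/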
+void ihevc_inter_pred_chroma_horz_w16out_ssse3(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, all_zero;
+ __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
+ __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
+ __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+ __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b;
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+
+/* load the four 8-bit filter coefficients */
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ all_zero = _mm_setzero_si128();
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* replicates coefficient pair 0, 1 */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* replicates coefficient pair 2, 3 */
+
+ coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b); /* pi1_coeff[0], pi1_coeff[1] repeated */
+ coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b); /* pi1_coeff[2], pi1_coeff[3] repeated */
+
+/* outer for loop starts from here */
+ if(wd % 2 == 0 && wd % 4 != 0) /* wd is even but not a multiple of 4 */
+ {
+ WORD32 offset = 0;
+ for(row = ht; row >= 2; row -= 2)
+ {
+ offset = 0;
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+ /*load 16 pixel values of row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*load 16 pixel values of row 1*/
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*Derive the source pixels for processing the 2nd pixel of row 0*/
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 0*/
+ src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 0*/
+ src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+ /*Derive the source pixels for processing the 2nd pixel of row 1*/
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 1*/
+ src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 1*/
+ src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+ res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
+ res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
+ res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
+ res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+ res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 8);
+
+ /* store 4 16-bit values */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u */
+
+
+
+ /* store 4 16-bit values */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u */
+
+
+ offset += 4; /* offset update */
+
+ } /* inner loop ends here(8- output values in single iteration)*/
+
+ pu1_src += 2 * src_strd; /*pointer update*/
+ pi2_dst += 2 * dst_strd; /*pointer update*/
+ }
+
+ /* Epilogue to handle odd ht */
+ if(row)
+ {
+ offset = 0;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+ /*load 16 pixel values of row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*Derive the source pixels for processing the 2nd pixel of row 0*/
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 0*/
+ src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 0*/
+ src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+ res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, all_zero);
+ res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, all_zero);
+ res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
+ res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+
+ /* store 4 16-bit values */
+ _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u */
+
+ offset += 4; /* offset update */
+
+ }
+ }
+
+ }
+ else
+ {
+ WORD32 offset = 0;
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ offset = 0;
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ /*load 16 pixel values of row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*load 16 pixel values of row 1*/
+ src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*Derive the source pixels for processing the 2nd pixel of row 0*/
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 0*/
+ src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 0*/
+ src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+
+ /* store 8 16-bit values */
+ _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u */
+
+ /*Derive the source pixels for processing the 2nd pixel of row 1*/
+ src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
+
+ src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 1*/
+ src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 1*/
+ src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
+
+ src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
+
+ res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
+ res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
+
+ /* store 8 16-bit values */
+ _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u */
+
+
+ offset += 8; /* offset update */
+
+ } /* inner loop ends here(8- output values in single iteration)*/
+
+ pu1_src += 2 * src_strd; /*pointer update*/
+ pi2_dst += 2 * dst_strd; /*pointer update*/
+ }
+
+ /*Epilogue to take care of odd ht*/
+ if(row)
+ {
+ offset = 0;
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ /*load 16 pixel values of row 0*/
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
+
+ /*Derive the source pixels for processing the 2nd pixel of row 0*/
+ src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+
+ src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
+
+ /*Derive the source pixels for processing the 3rd pixel of row 0*/
+ src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
+
+ /*Derive the source pixels for processing the 4th pixel of row 0*/
+ src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
+
+ src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
+
+ res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
+ res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
+
+ /* i4_tmp += pi1_coeff[i] * pu1_src[col + (i-1) * 2] */
+ res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+
+ /* store 8 16-bit values */
+ _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u */
+
+ offset += 8; /* offset update */
+
+ }
+ }
+
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Interprediction chroma filter to store vertical 16-bit output
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed to by 'pu1_src' and writes to the location pointed
+* to by 'pi2_dst'. No downshifting or clipping is done and the output is
+* used as an input for weighted prediction.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
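+/* A scalar sketch of what this routine vectorizes (illustrative only).
+* Same 4-tap vertical filter as the 8-bit version, but the raw sum is
+* stored as 16 bits with no offset, shift or clipping:
+*
+*     for(row = 0; row < ht; row++)
+*         for(col = 0; col < 2 * wd; col++)
+*         {
+*             WORD32 i, i2_tmp = 0;
+*             for(i = 0; i < 4; i++)
+*                 i2_tmp += pi1_coeff[i] * pu1_src[(row + i - 1) * src_strd + col];
+*             pi2_dst[row * dst_strd + col] = (WORD16)i2_tmp;
+*         }
+*/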
+void ihevc_inter_pred_chroma_vert_w16out_ssse3(UWORD8 *pu1_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_copy;
+ WORD16 *pi2_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+ __m128i s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b;
+ __m128i control_mask_1_8x16b, control_mask_2_8x16b;
+ __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+ __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+ __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+
+ PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
+
+/* load the four 8-bit filter coefficients */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* replicates coefficient pair 0, 1 */
+ control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* replicates coefficient pair 2, 3 */
+
+ coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b); /* pi1_coeff[0], pi1_coeff[1] repeated */
+ coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b); /* pi1_coeff[2], pi1_coeff[3] repeated */
+
+
+
+/* outer for loop starts from here */
+ if(wd % 8 == 0)
+ { /* wd = multiple of 8 case */
+
+ pu1_src_copy = pu1_src;
+ pi2_dst_copy = pi2_dst;
+
+ for(col = 0; col < 2 * wd; col += 16)
+ {
+
+ pu1_src = pu1_src_copy + col;
+ pi2_dst = pi2_dst_copy + col;
+
+
+ for(row = 0; row < ht; row += 2)
+ {
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 16 pixel values */
+ s21_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 16 pixel values */
+ s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));
+
+
+ /*load 16 pixel values */
+ s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 16 pixel values */
+ s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);
+
+ s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of partial products */
+
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
+
+/* store 8 16-bit output values per register */
+ /* pi2_dst[col] = i2_tmp; */
+ _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pi2_dst + 8), s35_8x16b);
+
+
+ s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);
+
+ s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);
+
+ s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of partial products */
+
+ s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* i2_tmp: sum of partial products */
+
+/* store 8 16-bit output values per register */
+ /* pi2_dst[col] = i2_tmp; */
+ _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b); /* unaligned store: dst_strd need not preserve 16-byte alignment */
+
+ _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + 8), s35_8x16b);
+
+
+ pu1_src += 2 * src_strd;
+ pi2_dst += 2 * dst_strd;
+
+
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ }
+ }
+
+ else if(wd % 4 == 0)
+ { /* wd = multiple of 4 case */
+
+ for(row = 0; row < ht; row += 2)
+ {
+
+ pu1_src_copy = pu1_src;
+ pi2_dst_copy = pi2_dst;
+
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 8 pixel values */
+ s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values */
+ s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+ s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ /*load 8 pixel values */
+ s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values */
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+ s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of partial products */
+
+ _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+ s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+ s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of partial products */
+
+ _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b); /* unaligned store: dst_strd need not preserve 16-byte alignment */
+
+ pu1_src += 8; /* pointer update */
+ pi2_dst += 8;
+
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+ pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
+ }
+ }
+
+ else
+ { /* wd = multiple of 2 case */
+
+ for(row = 0; row < ht; row += 2)
+ {
+ pu1_src_copy = pu1_src;
+ pi2_dst_copy = pi2_dst;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+ PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
+
+
+ /*load 8 pixel values */
+ s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
+
+ /*load 8 pixel values */
+ s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
+
+ s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ /*load 8 pixel values */
+ s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
+
+ /*load 8 pixel values */
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
+
+ s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
+
+ s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp: sum of partial products */
+
+
+/* store 4 16-bit output values */
+ /* pi2_dst[col] = i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
+ s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
+ s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp: sum of partial products */
+
+
+/* store 4 16-bit output values */
+ /* pi2_dst[col] = i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
+
+ pu1_src += 4; /* pointer update */
+ pi2_dst += 4;
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
+ pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Chroma interprediction filter for vertical 16-bit input
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed to by 'pi2_src' and writes to the location pointed
+* to by 'pu1_dst'. Input is 16 bits. The filter output is downshifted by 12
+* and clipped to lie between 0 and 255.
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
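+/* A scalar sketch of what this routine vectorizes (illustrative only;
+* CLIP_U8 and the SHIFT/OFFSET macros are the ones used in this file).
+* The 16-bit input is filtered with 32-bit accumulation, then reduced in
+* two shift stages as in the inline comments below:
+*
+*     for(row = 0; row < ht; row++)
+*         for(col = 0; col < 2 * wd; col++)
+*         {
+*             WORD32 i, i4_tmp = 0;
+*             for(i = 0; i < 4; i++)
+*                 i4_tmp += pi1_coeff[i] * pi2_src[(row + i - 1) * src_strd + col];
+*             i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH)
+*                      >> SHIFT_14_MINUS_BIT_DEPTH;
+*             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp);
+*         }
+*/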
+void ihevc_inter_pred_chroma_vert_w16inp_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD16 *pi2_src_copy;
+ UWORD8 *pu1_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+ __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+ __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+ __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
+ __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+ __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+
+/* load the 8-bit filter coefficients and sign-extend them to 16-bit */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ zero_8x16b = _mm_setzero_si128();
+ sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+ s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+ coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0)); /* pi1_coeff[0], pi1_coeff[1] repeated */
+ coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1)); /* pi1_coeff[2], pi1_coeff[3] repeated */
+
+/* setting values in registers */
+ offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
+ mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); /* selects the high 12 bytes */
+ mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); /* selects the low 4 bytes */
+
+/* outer for loop starts from here */
+ if(wd % 4 == 0)
+ { /* wd = multiple of 4 case */
+
+ pi2_src_copy = pi2_src;
+ pu1_dst_copy = pu1_dst;
+
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ pi2_src = pi2_src_copy + col;
+ pu1_dst = pu1_dst_copy + col;
+
+
+ for(row = 0; row < ht; row += 2)
+ {
+
+ /*load 8 16-bit values from row (cur_row - 1)*/
+ s21_8x16b = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));
+
+ /*load 8 16-bit values from row (cur_row)*/
+ s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));
+
+
+ /*load 8 16-bit values from row (cur_row + 1)*/
+ s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));
+
+ /*load 8 16-bit values from row (cur_row + 2)*/
+ s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);
+
+ s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
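+
+ /* _mm_madd_epi16 multiplies 16-bit samples with 16-bit coefficients and
+ adds adjacent products, producing one 32-bit two-tap partial sum per lane */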
+
+ s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* i4_tmp: sum of partial products */
+
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
+ s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
+ s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s34_8x16b = _mm_srai_epi32(s33_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+ s33_8x16b = _mm_packus_epi16(s35_8x16b, zero_8x16b);
+
+ s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
+/* store 8 8-bit output values */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);
+
+
+ s25_8x16b = _mm_load_si128((__m128i *)(pi2_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);
+
+ s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);
+
+ s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* i4_tmp: sum of partial products */
+
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* i4_tmp: sum of partial products */
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
+ s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+ /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
+ s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s34_8x16b = _mm_srai_epi32(s33_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+ s33_8x16b = _mm_packus_epi16(s35_8x16b, zero_8x16b);
+
+ s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
+/* store 8 8-bit output values */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
+
+ pi2_src += 2 * src_strd;
+ pu1_dst += 2 * dst_strd;
+
+
+ } /* inner for loop ends here(8-output values in single iteration) */
+
+ }
+ }
+ else
+ { /* wd = multiple of 2 case */
+
+ for(row = 0; row < ht; row += 2)
+ {
+ pi2_src_copy = pi2_src;
+ pu1_dst_copy = pu1_dst;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+ /*load 4 16-bit values from row (cur_row - 1)*/
+ s21_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+ /*load 4 16-bit values from row (cur_row)*/
+ s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+ s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ /*load 4 16-bit values from row (cur_row + 1)*/
+ s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+ /*load 4 16-bit values from row (cur_row + 2)*/
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+ s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+ s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+
+                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
+ s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+ s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
+ s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b);
+ s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b);
+ s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store 4 8-bit output values (other bytes of the destination qword are preserved) */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+ s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+ s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
+ s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);
+
+ /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
+
+ /* i2_tmp = CLIP_U8(i2_tmp);*/
+ s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);
+
+ s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b);
+ s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b);
+ s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
+
+/* store 4 8-bit output values (other bytes of the destination qword are preserved) */
+ /* pu1_dst[col] = (UWORD8)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);
+
+                pi2_src += 4; /* pointer update */
+                pu1_dst += 4;
+            } /* inner for loop ends here (4 output values per row, 2 rows per iteration) */
+
+ pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
+ pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
+ }
+ }
+
+}
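+
+/* A minimal scalar sketch of the per-sample arithmetic in the function above,
+ * for reference only. The helper name and 'offset' are illustrative and not
+ * part of the library; 'offset' is assumed to equal the offset_8x16b constant,
+ * i.e. 1 << (SHIFT_14_MINUS_BIT_DEPTH - 1). */
+static UWORD8 chroma_vert_w16inp_sample_sketch(WORD16 *pi2_src,
+                                               WORD32 src_strd,
+                                               WORD8 *pi1_coeff)
+{
+    WORD32 i, i4_tmp = 0;
+    WORD32 offset = 1 << (SHIFT_14_MINUS_BIT_DEPTH - 1);
+
+    /* 4-tap vertical filter centred one row above the current sample */
+    for(i = 0; i < 4; i++)
+        i4_tmp += pi2_src[(i - 1) * src_strd] * pi1_coeff[i];
+
+    /* two-stage downshift with rounding, then clip to 8 bits */
+    i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + offset) >> SHIFT_14_MINUS_BIT_DEPTH;
+    return (UWORD8)CLIP_U8(i4_tmp);
+}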
+
+/**
+*******************************************************************************
+*
+* @brief
+*
+* Chroma inter-prediction filter for 16-bit vertical input and output.
+*
+* @par Description:
+* Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
+* the elements pointed by 'pi2_src' and writes to the location pointed by
+* 'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6 and
+* 8192 is subtracted to store it as a 16-bit number. The output is used as
+* an input to weighted prediction.
+*
+* @param[in] pi2_src
+* WORD16 pointer to the source
+*
+* @param[out] pi2_dst
+* WORD16 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] pi1_coeff
+* WORD8 pointer to the filter coefficients
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] wd
+* integer width of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+void ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD8 *pi1_coeff,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col;
+ WORD16 *pi2_src_copy;
+ WORD16 *pi2_dst_copy;
+ __m128i coeff0_1_8x16b, coeff2_3_8x16b;
+ __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
+ __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
+ __m128i zero_8x16b, sign_reg;
+ __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
+ __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
+
+
+/* load 8 8-bit coefficients and convert 8-bit into 16-bit */
+ s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
+
+ zero_8x16b = _mm_setzero_si128();
+ sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
+ s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
+
+    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0)); /* {coeff[0], coeff[1]} replicated */
+    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1)); /* {coeff[2], coeff[3]} replicated */
+
+
+/* outer for loop starts from here */
+    if(wd % 4 == 0)
+    { /* 2 * wd is a multiple of 8: process 8 samples per iteration */
+
+ pi2_src_copy = pi2_src;
+ pi2_dst_copy = pi2_dst;
+
+ for(col = 0; col < 2 * wd; col += 8)
+ {
+
+ pi2_src = pi2_src_copy + col;
+ pi2_dst = pi2_dst_copy + col;
+
+
+ for(row = 0; row < ht; row += 2)
+ {
+
+                /*load 8 16-bit pixel values */
+ s21_8x16b = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));
+
+                /*load 8 16-bit pixel values */
+ s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));
+
+
+                /*load 8 16-bit pixel values */
+ s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));
+
+                /*load 8 16-bit pixel values */
+ s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);
+
+ s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b);
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+ s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);
+
+ s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
+/* store 8 16-bit output values */
+            /* pi2_dst[col] = (WORD16)i2_tmp; */
+ _mm_store_si128((__m128i *)(pi2_dst), s7_8x16b);
+
+
+ s25_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+
+ s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);
+
+ s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+
+ s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);
+
+ s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+ s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+ s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);
+
+ s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
+/* store 8 16-bit output values */
+            /* pi2_dst[col] = (WORD16)i2_tmp; */
+ _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s7_8x16b);
+
+ pi2_src += 2 * src_strd;
+ pi2_dst += 2 * dst_strd;
+
+
+            } /* inner for loop ends here (8 output values per row, 2 rows per iteration) */
+
+ }
+ }
+ else
+    { /* 2 * wd is a multiple of 4: process 4 samples per iteration */
+
+ for(row = 0; row < ht; row += 2)
+ {
+ pi2_src_copy = pi2_src;
+ pi2_dst_copy = pi2_dst;
+ for(col = 0; col < 2 * wd; col += 4)
+ {
+
+ /*load 4 pixel values */
+ s21_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
+
+ /*load 4 pixel values */
+ s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);
+
+ s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ /*load 4 pixel values */
+ s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
+
+ /*load 4 pixel values */
+ s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
+
+ s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);
+
+ s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+/* store 4 16-bit output values */
+            /* pi2_dst[col] = (WORD16)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pi2_dst), s9_8x16b);
+
+ s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
+
+ s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
+ s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);
+
+ s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
+ s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);
+
+ s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
+
+ /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
+ s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);
+
+ s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);
+
+/* store 4 16-bit output values */
+            /* pi2_dst[col] = (WORD16)i2_tmp; */
+ _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s9_8x16b);
+
+                pi2_src += 4; /* pointer update */
+                pi2_dst += 4;
+            } /* inner for loop ends here (4 output values per row, 2 rows per iteration) */
+
+ pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
+ pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
+ }
+ }
+
+}
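+
+/* For reference, a scalar sketch of the per-sample arithmetic of the
+ * 16-bit-output variant above (illustrative helper, not part of the library
+ * API): unlike the 8-bit path there is a single downshift and no clipping. */
+static WORD16 chroma_vert_w16inp_w16out_sample_sketch(WORD16 *pi2_src,
+                                                      WORD32 src_strd,
+                                                      WORD8 *pi1_coeff)
+{
+    WORD32 i, i4_tmp = 0;
+
+    /* 4-tap vertical filter centred one row above the current sample */
+    for(i = 0; i < 4; i++)
+        i4_tmp += pi2_src[(i - 1) * src_strd] * pi1_coeff[i];
+
+    return (WORD16)(i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH);
+}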
diff --git a/common/x86/ihevc_intra_pred_filters_sse42_intr.c b/common/x86/ihevc_intra_pred_filters_sse42_intr.c
new file mode 100644
index 0000000..6488de6
--- /dev/null
+++ b/common/x86/ihevc_intra_pred_filters_sse42_intr.c
@@ -0,0 +1,4201 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_intra_pred_filters_sse42_intr.c
+*
+* @brief
+* Contains function definitions for intra prediction interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* - ihevc_intra_pred_ref_filtering_sse42()
+* - ihevc_intra_pred_luma_dc_sse42()
+* - ihevc_intra_pred_luma_horz_sse42()
+* - ihevc_intra_pred_luma_ver_sse42()
+* - ihevc_intra_pred_luma_mode_3_to_9_sse42()
+* - ihevc_intra_pred_luma_mode_11_to_17_sse42()
+* - ihevc_intra_pred_luma_mode_19_to_25_sse42()
+* - ihevc_intra_pred_luma_mode_27_to_33_sse42()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdlib.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <immintrin.h>
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+#define GET_BITS(y,x) (((y) & (1 << (x))) != 0)
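+/* e.g. GET_BITS(0x05, 2) evaluates to 1 and GET_BITS(0x05, 1) to 0 */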
+
+/* tables to shuffle 8-bit values */
+
+/*****************************************************************************/
+/* global tables Definition */
+/*****************************************************************************/
+
+
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for ref_filtering
+*
+*
+* @par Description:
+* Reference DC filtering for neighboring samples dependent on TU size and
+* mode. Refer to section 8.4.4.2.3 in the standard.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
+ WORD32 nt,
+ UWORD8 *pu1_dst,
+ WORD32 mode,
+ WORD32 strong_intra_smoothing_enable_flag)
+{
+ WORD32 filter_flag;
+ WORD32 i; /* Generic indexing variable */
+ WORD32 four_nt = 4 * nt;
+ UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
+ WORD32 bi_linear_int_flag = 0;
+ WORD32 abs_cond_left_flag = 0;
+ WORD32 abs_cond_top_flag = 0;
+ WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+ __m128i src_temp1, src_temp2, src_temp3, src_temp7;
+ __m128i src_temp4, src_temp5, src_temp6, src_temp8;
+
+
+
+
+ filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+ if(0 == filter_flag)
+ {
+ if(pu1_src == pu1_dst)
+ {
+ return;
+ }
+ else
+ {
+ if(nt == 4)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ pu1_dst[four_nt] = pu1_src[four_nt];
+
+ }
+
+ else if(nt == 8)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+ else if(nt == 16)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+ else if(nt == 32)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+
+ }
+ }
+
+ else
+ {
+        /* If strong intra smoothing is enabled and transform size is 32 */
+ if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+ {
+ /* Strong Intra Filtering */
+ abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
+ - (2 * pu1_src[3 * nt]))) < dc_val;
+ abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
+ - (2 * pu1_src[nt]))) < dc_val;
+
+ bi_linear_int_flag = ((1 == abs_cond_left_flag)
+ && (1 == abs_cond_top_flag));
+ }
+        /* Extremities untouched */
+ au1_flt[0] = pu1_src[0];
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+
+ /* Strong filtering of reference samples */
+ if(1 == bi_linear_int_flag)
+ {
+ au1_flt[2 * nt] = pu1_src[2 * nt];
+
+ for(i = 1; i < (2 * nt); i++)
+ au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
+
+ for(i = 1; i < (2 * nt); i++)
+ au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
+ }
+ else
+ {
+ __m128i const_value_8x16;
+
+ const_value_8x16 = _mm_set1_epi16(2);
+
+ au1_flt[0] = pu1_src[0];
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+
+            /* Perform [1 2 1] smoothing of the reference samples */
+ for(i = 0; i < (four_nt); i += 16)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp2, 1);
+
+ src_temp1 = _mm_cvtepu8_epi16(src_temp1);
+ src_temp2 = _mm_cvtepu8_epi16(src_temp2);
+ src_temp3 = _mm_cvtepu8_epi16(src_temp3);
+
+ src_temp2 = _mm_slli_epi16(src_temp2, 1);
+
+ src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
+ src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
+ src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);
+
+ src_temp1 = _mm_srai_epi16(src_temp1, 2);
+
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
+ src_temp5 = _mm_srli_si128(src_temp4, 1);
+ src_temp6 = _mm_srli_si128(src_temp5, 1);
+
+ src_temp4 = _mm_cvtepu8_epi16(src_temp4);
+ src_temp5 = _mm_cvtepu8_epi16(src_temp5);
+ src_temp6 = _mm_cvtepu8_epi16(src_temp6);
+
+ src_temp5 = _mm_slli_epi16(src_temp5, 1);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);
+
+ src_temp4 = _mm_srai_epi16(src_temp4, 2);
+
+ /* converting 16 bit to 8 bit */
+ src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);
+
+ _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
+ }
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+ }
+
+ if(nt == 4)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+ else if(nt == 8)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+ else if(nt == 16)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+
+ else if(nt == 32)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
+ src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
+ src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
+ src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+
+ }
+}
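+
+/* Scalar sketch of the [1 2 1] reference-sample smoothing that the intrinsics
+ * above implement (the helper is illustrative, not part of the library API):
+ * the two extreme samples are copied and interior samples are low-pass
+ * filtered, as per section 8.4.4.2.3. */
+static void ref_filtering_scalar_sketch(UWORD8 *pu1_src, WORD32 nt, UWORD8 *pu1_dst)
+{
+    WORD32 i;
+
+    pu1_dst[0] = pu1_src[0];
+    pu1_dst[4 * nt] = pu1_src[4 * nt];
+
+    for(i = 1; i < 4 * nt; i++)
+        pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
+}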
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+* Intraprediction for DC mode with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
+* to section 8.4.4.2.5 in the standard.
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_dc_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 acc_dc;
+ WORD32 dc_val, two_dc_val, three_dc_val;
+ WORD32 row;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+ __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
+ __m128i src_temp8, src_temp9, src_temp10, src_temp2;
+ __m128i m_zero = _mm_set1_epi32(0);
+ __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+
+ acc_dc = 0;
+ /* Calculate DC value for the transform block */
+
+
+
+ if(nt == 32)
+ {
+ __m128i temp;
+ WORD32 itr_count;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
+
+ src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+ src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
+ src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
+
+ src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+ src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+        /* compute dc_val */
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ two_dc_val = 2 * dc_val;
+ three_dc_val = 3 * dc_val;
+
+ temp = _mm_set1_epi8(dc_val);
+
+ for(itr_count = 0; itr_count < 2; itr_count++)
+ {
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);
+
+ pu1_dst += 16 * dst_strd;
+ }
+ }
+
+ else
+
+ {
+ __m128i zero_8x16b;
+ __m128i sm1 = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+ /* DC filtering for the first top row and first left column */
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+        if(nt == 4)
+ {
+ WORD32 temp1, temp2, temp3;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ src_temp4 = _mm_cvtepu8_epi16(src_temp3);
+ src_temp2 = _mm_cvtepu8_epi16(src_temp2);
+
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+/* compute dc_val */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+
+            /* broadcast the rounding term (three_dc_val + 2) */
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+ src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+
+ src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+
+ *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+            /* restore first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+
+ src_temp2 = _mm_insert_epi8(src_temp2, dc_val, 0);
+
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
+ src_temp3 = _mm_shuffle_epi8(src_temp2, sm1);
+ src_temp4 = _mm_shuffle_epi8(src_temp2, sm1);
+
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(1 * dst_strd) + 0], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(2 * dst_strd) + 0], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(3 * dst_strd) + 0], 0);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp4);
+
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+ }
+        else if(nt == 8)
+ {
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+
+ src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+            /* compute dc_val */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+            /* loading 16 8-bit pixels */
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp2 = _mm_cvtepu8_epi16(src_temp2);
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+ src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+ src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);
+
+            /* restore first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+
+ /* Fill the remaining rows with DC value*/
+
+ src_temp1 = _mm_set1_epi8(dc_val);
+ src_temp2 = _mm_set1_epi8(dc_val);
+ src_temp3 = _mm_set1_epi8(dc_val);
+ src_temp4 = _mm_set1_epi8(dc_val);
+ src_temp5 = _mm_set1_epi8(dc_val);
+ src_temp6 = _mm_set1_epi8(dc_val);
+ src_temp7 = _mm_set1_epi8(dc_val);
+
+ src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+ src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+ src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+ src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+ }
+        else if(nt == 16)
+ {
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+
+ src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+
+ src_temp2 = _mm_cvtepu8_epi16(src_temp2);
+ src_temp10 = _mm_cvtepu8_epi16(src_temp10);
+
+ src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+            /* compute dc_val */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+ src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+ src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+ src_temp10 = _mm_srli_epi16(src_temp10, 2);
+
+ src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
+
+            /* restore first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+ /* Fill the remaining rows with DC value*/
+ src_temp1 = _mm_set1_epi8(dc_val);
+ src_temp2 = _mm_set1_epi8(dc_val);
+ src_temp3 = _mm_set1_epi8(dc_val);
+ src_temp4 = _mm_set1_epi8(dc_val);
+ src_temp5 = _mm_set1_epi8(dc_val);
+ src_temp6 = _mm_set1_epi8(dc_val);
+ src_temp7 = _mm_set1_epi8(dc_val);
+
+            /* rows 1 to 15 are filled in a single pass */
+ {
+
+ src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+ src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+ src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+ src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+ src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((8) * dst_strd)], 0);
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((9) * dst_strd)], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((10) * dst_strd)], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((11) * dst_strd)], 0);
+ src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((12) * dst_strd)], 0);
+ src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((13) * dst_strd)], 0);
+ src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((14) * dst_strd)], 0);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
+
+ src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((15) * dst_strd)], 0);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);
+
+ }
+
+ }
+        else if(nt == 32)
+        {
+
+            __m128i src_temp11;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
+
+            /* loading 32 8-bit pixels */
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+ src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 24));
+
+ src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+ src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
+ src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
+
+ src_temp2 = _mm_cvtepu8_epi16(src_temp2);
+ src_temp6 = _mm_cvtepu8_epi16(src_temp6);
+ src_temp9 = _mm_cvtepu8_epi16(src_temp9);
+ src_temp10 = _mm_cvtepu8_epi16(src_temp10);
+
+ src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+ src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+            /* compute dc_val */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+            /* (pu1_ref[two_nt + 1 + col] + three_dc_val + 2) */
+            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+            src_temp6 = _mm_add_epi16(src_temp6, src_temp1);
+            src_temp9 = _mm_add_epi16(src_temp9, src_temp1);
+            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+ src_temp6 = _mm_srli_epi16(src_temp6, 2);
+ src_temp9 = _mm_srli_epi16(src_temp9, 2);
+ src_temp10 = _mm_srli_epi16(src_temp10, 2);
+
+ src_temp2 = _mm_packus_epi16(src_temp2, src_temp6);
+ src_temp10 = _mm_packus_epi16(src_temp9, src_temp10);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp10);
+
+            /* restore first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+            /* Fill the remaining rows with the DC value, preserving the
+               DC-filtered first-column pixels written by the loop above */
+            src_temp1 = _mm_set1_epi8(dc_val);
+            src_temp11 = _mm_set1_epi8(dc_val);
+
+            for(row = 1; row < nt; row++)
+            {
+                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[row * dst_strd], 0);
+
+                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), src_temp1);
+                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), src_temp11);
+            }
+
+ }
+ }
+}
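+
+/* Scalar sketch of DC prediction as vectorised above (the helper is
+ * illustrative, not part of the library API): dc_val is the rounded mean of
+ * the 2*nt neighbouring samples, and for nt < 32 the first row and column are
+ * smoothed towards the reference samples. */
+static void luma_dc_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                  WORD32 dst_strd, WORD32 nt, WORD32 log2nt)
+{
+    WORD32 row, col, acc_dc = 0, dc_val;
+    WORD32 two_nt = 2 * nt;
+
+    for(row = nt; row < two_nt; row++)          /* left neighbours */
+        acc_dc += pu1_ref[row];
+    for(col = two_nt + 1; col <= 3 * nt; col++) /* top neighbours  */
+        acc_dc += pu1_ref[col];
+    dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = dc_val;
+
+    if(nt < 32) /* boundary smoothing, as in the else branch above */
+    {
+        pu1_dst[0] = (pu1_ref[two_nt - 1] + 2 * dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
+        for(col = 1; col < nt; col++)
+            pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + 3 * dc_val + 2) >> 2;
+        for(row = 1; row < nt; row++)
+            pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + 3 * dc_val + 2) >> 2;
+    }
+}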
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for horizontal luma mode.
+*
+* @par Description:
+* Horizontal intraprediction (mode 10) with reference samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
+* to section 8.4.4.2.6 in the standard (special case).
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_horz_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row;
+ WORD32 two_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ two_nt = 2 * nt;
+
+
+ if(nt == 32)
+ {
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+ __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+ __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+ for(row = 0; row < nt; row += 16)
+ {
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));
+
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp1, 2);
+ src_temp4 = _mm_srli_si128(src_temp1, 3);
+ src_temp5 = _mm_srli_si128(src_temp1, 4);
+ src_temp6 = _mm_srli_si128(src_temp1, 5);
+ src_temp7 = _mm_srli_si128(src_temp1, 6);
+ src_temp8 = _mm_srli_si128(src_temp1, 7);
+
+ src_temp9 = _mm_srli_si128(src_temp1, 8);
+ src_temp10 = _mm_srli_si128(src_temp1, 9);
+ src_temp11 = _mm_srli_si128(src_temp1, 10);
+ src_temp12 = _mm_srli_si128(src_temp1, 11);
+ src_temp13 = _mm_srli_si128(src_temp1, 12);
+ src_temp14 = _mm_srli_si128(src_temp1, 13);
+ src_temp15 = _mm_srli_si128(src_temp1, 14);
+ src_temp16 = _mm_srli_si128(src_temp1, 15);
+
+ src_temp8 = _mm_shuffle_epi8(src_temp8, sm);
+ src_temp7 = _mm_shuffle_epi8(src_temp7, sm);
+ src_temp6 = _mm_shuffle_epi8(src_temp6, sm);
+ src_temp5 = _mm_shuffle_epi8(src_temp5, sm);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp3 = _mm_shuffle_epi8(src_temp3, sm);
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm);
+ src_temp1 = _mm_shuffle_epi8(src_temp1, sm);
+
+ src_temp16 = _mm_shuffle_epi8(src_temp16, sm);
+ src_temp15 = _mm_shuffle_epi8(src_temp15, sm);
+ src_temp14 = _mm_shuffle_epi8(src_temp14, sm);
+ src_temp13 = _mm_shuffle_epi8(src_temp13, sm);
+ src_temp12 = _mm_shuffle_epi8(src_temp12, sm);
+ src_temp11 = _mm_shuffle_epi8(src_temp11, sm);
+ src_temp10 = _mm_shuffle_epi8(src_temp10, sm);
+ src_temp9 = _mm_shuffle_epi8(src_temp9, sm);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);
+
+ }
+
+ }
+
+ }
+ else
+
+ {
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
+ __m128i src_temp10, zero_8x16b, src_temp7;
+
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+ /*Filtering done for the 1st row */
+
+ src_temp2 = _mm_set1_epi16(pu1_ref[two_nt - 1]);
+ src_temp10 = _mm_set1_epi16(pu1_ref[two_nt]);
+
+        /* loading 16 8-bit pixels */
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ src_temp4 = _mm_cvtepu8_epi16(src_temp4);
+
+ /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
+ src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);
+
+ /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+ src_temp3 = _mm_srai_epi16(src_temp3, 1);
+
+ /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+ src_temp3 = _mm_add_epi16(src_temp2, src_temp3);
+
+ if(nt == 4)
+ {
+            WORD32 temp1, temp2, temp3;
+ src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
+ temp1 = _mm_cvtsi128_si32(src_temp3);
+
+ *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp4);
+
+ /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+ }
+ else if(nt == 8)
+ {
+ src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);
+
+
+ src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
+ src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
+ src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
+ src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);
+
+ /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);
+
+ }
+ else if(nt == 16)
+ {
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+ src_temp4 = _mm_cvtepu8_epi16(src_temp4);
+
+ src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
+ src_temp10 = _mm_srai_epi16(src_temp10, 1);
+ src_temp10 = _mm_add_epi16(src_temp2, src_temp10);
+
+ src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);
+
+ /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+ src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
+ src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
+ src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
+ src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
+ src_temp10 = _mm_set1_epi8(pu1_ref[two_nt - 9]);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);
+
+ src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 10]);
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 11]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 12]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 13]);
+ src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 14]);
+ src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 15]);
+ src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 16]);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);
+
+ }
+ }
+}
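+
+/* Scalar sketch of horizontal prediction (mode 10) as vectorised above (the
+ * helper is illustrative, not part of the library API): every row is a copy
+ * of one left neighbour, and for nt < 32 the first row is gradient-filtered. */
+static void luma_horz_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                    WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt = 2 * nt;
+
+    for(row = 0; row < nt; row++)
+        for(col = 0; col < nt; col++)
+            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
+
+    if(nt < 32) /* first-row filtering */
+    {
+        for(col = 0; col < nt; col++)
+        {
+            WORD16 s2_predpixel = pu1_ref[two_nt - 1]
+                            + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
+            pu1_dst[col] = CLIP_U8(s2_predpixel);
+        }
+    }
+}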
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for vertical luma mode.
+*
+* @par Description:
+* Vertical intraprediction (mode 26) with reference neighboring samples
+* location pointed by 'pu1_ref' to the TU block location pointed by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 in the standard (special case).
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_ver_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD16 s2_predpixel;
+ WORD32 two_nt = 2 * nt;
+ __m128i src_temp0, src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7;
+
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ if(nt == 32)
+ {
+ __m128i temp1, temp2;
+ WORD32 itr_count;
+
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
+
+ for(itr_count = 0; itr_count < 2; itr_count++)
+ {
+            /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col]; */
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
+
+ pu1_dst += 16 * dst_strd;
+ }
+ }
+
+ else
+
+ {
+ /*Filtering done for the 1st column */
+ for(row = nt - 1; row >= 0; row--)
+ {
+ s2_predpixel = pu1_ref[two_nt + 1]
+ + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+ pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
+ }
+
+ /* Replication to next columns*/
+
+ if(nt == 4)
+ {
+            WORD32 temp1, temp2, temp3, temp4;
+
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp3 = src_temp2;
+ src_temp4 = src_temp2;
+ src_temp5 = src_temp2;
+
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(0 * dst_strd)], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(1 * dst_strd)], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(2 * dst_strd)], 0);
+ src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[(3 * dst_strd)], 0);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp4);
+ temp4 = _mm_cvtsi128_si32(src_temp5);
+
+            /* store 4 8-bit pixel values per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+ }
+ else if(nt == 8)
+ {
+
+ src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp1 = src_temp0;
+ src_temp2 = src_temp0;
+ src_temp3 = src_temp0;
+ src_temp4 = src_temp0;
+ src_temp5 = src_temp0;
+ src_temp6 = src_temp0;
+ src_temp7 = src_temp0;
+
+ src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((0) * dst_strd)], 0);
+ src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
+ src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
+ src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
+ src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+
+ }
+ else if(nt == 16)
+ {
+ for(row = 0; row < nt; row += 8)
+ {
+
+ src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp1 = src_temp0;
+ src_temp2 = src_temp0;
+ src_temp3 = src_temp0;
+ src_temp4 = src_temp0;
+ src_temp5 = src_temp0;
+ src_temp6 = src_temp0;
+ src_temp7 = src_temp0;
+
+ src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((row + 0) * dst_strd)], 0);
+ src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((row + 1) * dst_strd)], 0);
+ src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((row + 2) * dst_strd)], 0);
+ src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((row + 3) * dst_strd)], 0);
+ src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((row + 4) * dst_strd)], 0);
+ src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((row + 5) * dst_strd)], 0);
+ src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((row + 6) * dst_strd)], 0);
+ src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((row + 7) * dst_strd)], 0);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp7);
+
+ }
+
+ }
+
+
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 3 to mode 9
+*
+* @par Description:
+* Intra prediction for modes 3 to 9 (positive angle, horizontal modes) with
+* the reference neighboring samples pointed to by 'pu1_ref' used to predict
+* the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
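+
+/*
+* Scalar sketch (illustrative, assuming a 0-based 'col' counter; not part of
+* the original source) of the per-pixel computation that the SSE4.2 code
+* below vectorizes:
+*
+*   pos   = (col + 1) * intra_pred_ang;   position in 1/32-pel units
+*   idx   = pos >> 5;                     integer reference offset
+*   fract = pos & 31;                     1/32-pel fractional part
+*   pred  = ((32 - fract) * ref_main[idx] + fract * ref_main[idx + 1] + 16) >> 5;
+*/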
+
+
+void ihevc_intra_pred_luma_mode_3_to_9_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt = 2 * nt;
+ WORD32 intra_pred_ang;
+
+
+ __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i fract_4x32b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+ UNUSED(src_strd);
+
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ /* For angles other than 45 degrees, interpolate between 2 neighboring */
+ /* samples, weighted by distance, to obtain the destination sample */
+
+ const_temp_4x32b = _mm_set1_epi16(16);
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+ const_temp4_4x32b = _mm_set1_epi32(4);
+
+ two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
+
+
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+ if(nt == 4)
+ {
+
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ int temp11, temp21, temp31, temp41;
+ // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /* idx = pos >> 5; ref_main_idx = (two_nt - nt) - idx */
+ ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
+ fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
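+
+ /* The (32 - fract) and fract weights are interleaved as byte pairs so that */
+ /* one _mm_maddubs_epi16 with the shuffled pixel pairs (p0, p1) computes */
+ /* (32 - fract) * p0 + fract * p1 for each output sample */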
+
+ temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+ ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
+ src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
+
+ temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+ temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+ /* store four 8-bit pixel values (one 32-bit word) per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+ }
+
+ else if(nt == 16 || nt == 32)
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(two_nt);
+
+ for(col = 0; col < nt; col += 8)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+ fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ /* idx = pos >> 5; ref_main_idx = two_nt - idx */
+ ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ for(row = 0; row < nt; row += 8)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+ /* loading 16 8-bit pixels */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=4*/
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=5*/
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=6*/
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+ src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
+ src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
+ src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
+ src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+ src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+ src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b); /* row=7*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b); /* row=6*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b); /* row=4*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b); /* row=3*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b); /* row=2*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b); /* row=0*/
+
+ }
+ }
+ }
+ else
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /* idx = pos >> 5; ref_main_idx = (two_nt - nt) - idx */
+ ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+ fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+ /* loading 16 8-bit pixels */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=4*/
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=5*/
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=6*/
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+ src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
+ src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
+ src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
+ src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+ src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+ src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b); /* row=3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b); /* row=4*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b); /* row=6*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b); /* row=7*/
+
+ }
+ }
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 11 to mode 17
+*
+* @par Description:
+* Intra prediction for modes 11 to 17 (negative angle, horizontal modes)
+* with the reference neighboring samples pointed to by 'pu1_ref' used to
+* predict the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
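+
+/*
+* Sketch (illustrative, not part of the original source): for these negative
+* angles the main (left) reference is first extended by projecting side (top)
+* samples onto it through the inverse angle table, after which the same
+* two-tap 1/32-pel filter as modes 3 to 9 is applied:
+*
+*   pred = ((32 - fract) * ref_main[idx] + fract * ref_main[idx + 1] + 16) >> 5;
+*/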
+
+
+void ihevc_intra_pred_luma_mode_11_to_17_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ /* This function and ihevc_intra_pred_luma_mode_19_to_25 are identical */
+ /* except for the ref main & side sample assignment; the two can be */
+ /* combined for optimization */
+
+ WORD32 row, col, k;
+ WORD32 two_nt;
+ WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+ WORD32 ref_idx;
+
+ __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i fract_4x32b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+
+
+ UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
+ UWORD8 *ref_main;
+ UWORD8 *ref_temp;
+ UNUSED(src_strd);
+
+ inv_ang_sum = 128;
+ two_nt = 2 * nt;
+ ref_temp = ref_tmp + 1;
+ ref_main = ref_temp + nt - 1;
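+ /* ref_main thus points nt samples into ref_tmp, so negative indices down */
+ /* to ref_main[-nt] stay within the buffer (descriptive note) */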
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ /* For angles other than 45 degrees, interpolate between 2 neighboring */
+ /* samples, weighted by distance, to obtain the destination sample */
+ const_temp_4x32b = _mm_set1_epi16(16);
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+ const_temp4_4x32b = _mm_set1_epi32(4);
+
+ two_nt_4x32b = _mm_set1_epi32(1);
+
+
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+ if(nt == 4)
+ {
+
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ int temp11, temp21, temp31, temp41;
+// WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ /* Intermediate reference samples for negative angle modes */
+ /* This has to be removed during optimization */
+ /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+
+ ref_main = ref_temp + nt - 1;
+ for(k = 0; k < nt + 1; k++)
+ ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
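+ /* i.e. ref_main[k] = pu1_ref[two_nt - k] for k = 0..nt: the left reference */
+ /* column read downward from the top-left corner (descriptive note) */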
+
+ ref_main = ref_temp + nt - 1;
+ ref_idx = (nt * intra_pred_ang) >> 5;
+
+ /* SIMD optimization of this loop is possible using a look-up table */
+ /* For negative angles, derive the main reference samples from the side */
+ /* reference samples; refer to section 8.4.4.2.6 of the HEVC spec */
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+ }
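+
+ /* Equivalently, a closed form of the loop above (illustrative note): */
+ /* ref_main[k] = pu1_ref[two_nt + ((128 - k * inv_ang) >> 8)], ref_idx < k <= -1 */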
+
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /* idx = pos >> 5; ref_main_idx = 1 + idx */
+ ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
+ fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+ ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
+ src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
+
+ temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+ temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+ /* store four 8-bit pixel values (one 32-bit word) per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+ }
+
+ else if(nt == 32)
+ {
+
+
+ __m128i temp1, temp2, temp3, temp11, temp12;
+ __m128i src_values0, src_values1;
+ /* Intermediate reference samples for negative angle modes */
+
+ ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+ temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
+ temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+
+ /* For negative angles, derive the main reference samples from the side */
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
+ temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+ src_values1 = _mm_shuffle_epi8(src_values1, temp2);
+ src_values0 = _mm_shuffle_epi8(src_values0, temp12);
+ src_values1 = _mm_shuffle_epi8(src_values1, temp11);
+
+ temp1 = _mm_shuffle_epi8(temp1, temp2);
+ temp3 = _mm_shuffle_epi8(temp3, temp2);
+
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
+ _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
+ _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
+
+
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ for(col = 0; col < nt; col += 8)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /* idx = pos >> 5; ref_main_idx = 1 + idx */
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+ fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ for(row = 0; row < nt; row += 8)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+ /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
+ src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
+ src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
+ src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+ src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+ src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+ src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+ src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+ src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+ src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=3*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=4*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=6*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=7*/
+
+ }
+ }
+ }
+ else if(nt == 16)
+ {
+
+ __m128i temp1, temp2, temp11, src_values0;
+ /* Intermediate reference samples for negative angle modes */
+ /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+ temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+ temp1 = _mm_shuffle_epi8(temp1, temp2);
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+ _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ for(col = 0; col < nt; col += 8)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /* idx = pos >> 5; ref_main_idx = 1 + idx */
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+ fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ for(row = 0; row < nt; row += 8)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+ /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
+ src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
+ src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
+ src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+ src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+ src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+ src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+ src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+ src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+ src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=3*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=4*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=6*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=7*/
+
+ }
+ }
+ }
+ else
+ {
+
+
+ __m128i temp1, temp2, temp11, src_values0;
+ /* Intermediate reference samples for negative angle modes */
+ /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
+
+ /* For negative angles, derive the main reference samples from the side */
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+ temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+ temp1 = _mm_shuffle_epi8(temp1, temp2);
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+ src_values0 = _mm_srli_si128(src_values0, 8);
+
+ _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+ _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /* idx = pos >> 5; ref_main_idx = 1 + idx */
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+ fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
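+ /* layout note: each 16-bit lane of the two weight vectors now holds
+ * the byte pair (fract, 32 - fract), e.g. (12, 20) for fract = 12, so
+ * a single _mm_maddubs_epi16 against a pixel pair arranged by the sm3
+ * shuffle evaluates the two-tap numerator
+ * fract * pu1_ref[ref_main_idx + 1] + (32 - fract) * pu1_ref[ref_main_idx]
+ * before the + 16 rounding and the >> 5 */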
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+ /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
+
+ /* loading 16 8-bit pixels */
+ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=4*/
+ src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=5*/
+ src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=6*/
+ src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+ src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+ src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+ src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+ src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+ /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+ src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+ src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b); /* row=0*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b); /* row=2*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b); /* row=3*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b); /* row=4*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b); /* row=6*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b); /* row=7*/
+
+ }
+ }
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 19 to mode 25
+*
+* @par Description:
+* Intra prediction for modes 19 to 25 (negative angle, vertical modes) from
+* the neighbouring reference samples pointed to by 'pu1_ref' into the TU
+* block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
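+
+/*
+* A minimal scalar sketch (schematic, to aid reading the vector code; the
+* SSE4.2 body below is authoritative) of the filtering this routine performs:
+*
+*     for(row = 0; row < nt; row++)
+*     {
+*         WORD32 pos   = (row + 1) * intra_pred_ang;
+*         WORD32 idx   = pos >> 5;
+*         WORD32 fract = pos & 31;
+*         for(col = 0; col < nt; col++)
+*             pu1_dst[(row * dst_strd) + col] =
+*                 (UWORD8)(((32 - fract) * ref_main[col + idx + 1]
+*                           + fract * ref_main[col + idx + 2] + 16) >> 5);
+*     }
+*/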
+
+
+void ihevc_intra_pred_luma_mode_19_to_25_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, k;
+ WORD32 two_nt, intra_pred_ang;
+ WORD32 inv_ang, inv_ang_sum;
+ //WORD32 ref_main_idx, pos, fract, idx;
+ WORD32 ref_idx;
+ UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
+ UWORD8 *ref_main, *ref_temp;
+
+ __m128i /*fract_8x16b,*/ const_temp_8x16b, sm3;
+ __m128i temp1, temp2, temp3, temp4;
+ __m128i temp11, temp12, temp13, temp14;
+ UNUSED(src_strd);
+
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+ /* Intermediate reference samples for negative angle modes */
+ /* This has to be removed during optimization */
+ /* For these vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp = ref_tmp + 1;
+ ref_main = ref_temp + nt - 1;
+
+
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+
+
+ const_temp_8x16b = _mm_set1_epi16(16);
+
+ if(nt == 32)
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+ WORD32 col = 0;
+
+ /* Intermediate reference samples for negative angle modes */
+ /* This has to be removed during optimization */
+ /* For these vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+ temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
+
+ /* SIMD Optimization can be done using look-up table for the loop */
+ /* For negative angles, derive the main reference samples from the side */
+ /* reference samples; refer to section 8.4.4.2.6 */
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
+ temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+ src_values1 = _mm_shuffle_epi8(src_values1, temp12);
+
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
+ _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
+ _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
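+ /* the shuffle-mask loads above are a table-driven form of the scalar
+ * inverse-angle projection (kept in scalar form in the nt == 4 branch
+ * further below):
+ *
+ * inv_ang_sum = 128;
+ * for(k = -1; k > ref_idx; k--)
+ * {
+ * inv_ang_sum += inv_ang;
+ * ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+ * }
+ */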
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
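+ /* SSE4.2 has no gather instruction, so the eight per-row reference
+ * indices are spilled to this small array and reused as scalar offsets
+ * for the unaligned loads in the column loop below */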
+ for(col = 0; col < nt; col += 16)
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+
+ }
+ else if(nt == 16) /* for nt = 16 case */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+
+ /* Intermediate reference samples for negative angle modes */
+ /* For these vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+ _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+ else if(nt == 8)
+ {
+
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+
+ /* Intermediate reference samples for negative angle modes */
+ /* For these vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+ temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
+
+ /* For negative angles, derive the main reference samples from the side */
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+ src_values0 = _mm_srli_si128(src_values0, 8);
+ _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+ _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+
+
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); /* row=0 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); /* row=1 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); /* row=2 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); /* row=3 */
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); /* row=4 */
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); /* row=5 */
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); /* row=6 */
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); /* row=7 */
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+ src_values4 = _mm_packus_epi16(src_values4, src_values5);
+ src_values6 = _mm_packus_epi16(src_values6, src_values7);
+ src_values5 = _mm_srli_si128(src_values4, 8);
+ src_values7 = _mm_srli_si128(src_values6, 8);
+
+ /* storing 8 8-bit pixel values per row */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/
+ }
+ }
+ else /* if nt =4*/
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ for(k = 0; k < (nt + 1); k++)
+ ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ inv_ang_sum = 128;
+
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+ }
+
+
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+
+ two_nt_4x32b = _mm_set1_epi32(1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+ {
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ WORD32 temp11, temp21, temp31, temp41;
+
+
+ __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+ /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* row=0 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* row=1 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* row=2 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* row=3 */
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+
+ temp11 = _mm_cvtsi128_si32(src_values0);
+ temp21 = _mm_cvtsi128_si32(src_values1);
+ temp31 = _mm_cvtsi128_si32(src_values2);
+ temp41 = _mm_cvtsi128_si32(src_values3);
+
+ /* storing 4 8-bit pixel values per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 27 to mode 33
+*
+* @par Description:
+* Intra prediction for modes 27 to 33 (positive angle, vertical modes) from
+* the neighbouring reference samples pointed to by 'pu1_ref' into the TU
+* block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
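+
+/*
+* A minimal scalar sketch (schematic, to aid reading the vector code; the
+* SSE4.2 body below is authoritative). For these positive vertical angles the
+* main reference is the above row itself, so no side-reference projection is
+* needed:
+*
+*     for(row = 0; row < nt; row++)
+*     {
+*         WORD32 pos   = (row + 1) * intra_pred_ang;
+*         WORD32 idx   = pos >> 5;
+*         WORD32 fract = pos & 31;
+*         for(col = 0; col < nt; col++)
+*             pu1_dst[(row * dst_strd) + col] =
+*                 (UWORD8)(((32 - fract) * pu1_ref[two_nt + col + idx + 1]
+*                           + fract * pu1_ref[two_nt + col + idx + 2] + 16) >> 5);
+*     }
+*/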
+
+
+void ihevc_intra_pred_luma_mode_27_to_33_sse42(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD32 two_nt;
+ WORD32 intra_pred_ang;
+
+ __m128i temp11, temp12, temp13, temp14;
+
+ __m128i const_temp_8x16b;
+ __m128i temp1, temp2, temp3, temp4, sm3;
+ UNUSED(src_strd);
+
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ const_temp_8x16b = _mm_set1_epi16(16);
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+ if(nt == 32)
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+ WORD32 col = 0;
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+ for(col = 0; col < nt; col += 16)
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+
+ }
+ else if(nt == 16) /* for nt = 16 case */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+ /* storing 16 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+            /* storing sixteen 8-bit output pixels per row (rows 4-7) */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+
+ }
+ else if(nt == 8)
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+
+ two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ //for(row = 0; row < nt; row +=4)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
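+            /* both bytes of every 16-bit lane in src_values11/src_values10
+             * hold fract and (32 - fract) respectively; the unpacks above
+             * interleave them into per-pixel byte pairs so that a single
+             * _mm_maddubs_epi16() with the sm3-interleaved reference bytes
+             * evaluates (32 - fract) * pu1_ref[idx] + fract * pu1_ref[idx + 1] */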
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));  /* row 3 */
+            src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));  /* row 4 */
+            src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));  /* row 5 */
+            src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));  /* row 6 */
+            src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));  /* row 7 */
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+ src_values4 = _mm_packus_epi16(src_values4, src_values5);
+ src_values6 = _mm_packus_epi16(src_values6, src_values7);
+ src_values5 = _mm_srli_si128(src_values4, 8);
+ src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* storing eight 8-bit output pixels per row */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/
+ }
+
+ }
+    else /* nt == 4 */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+
+ two_nt_4x32b = _mm_set1_epi32(two_nt + 1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+ {
+ int temp11, temp21, temp31, temp41;
+
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+
+ __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
+ src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
+
+            /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row 0 */
+            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row 1 */
+            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row 2 */
+            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row 3 */
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+
+ temp11 = _mm_cvtsi128_si32(src_values0);
+ temp21 = _mm_cvtsi128_si32(src_values1);
+ temp31 = _mm_cvtsi128_si32(src_values2);
+ temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* storing four 8-bit output pixels per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+ }
+ }
+}
+
diff --git a/common/x86/ihevc_intra_pred_filters_ssse3_intr.c b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c
new file mode 100644
index 0000000..dbab80a
--- /dev/null
+++ b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c
@@ -0,0 +1,5127 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_intra_pred_filters_ssse3_intr.c
+*
+* @brief
+* Contains function Definition for intra prediction interpolation filters
+*
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* - ihevc_intra_pred_luma_planar_ssse3()
+* - ihevc_intra_pred_luma_dc_ssse3()
+* - ihevc_intra_pred_luma_horz_ssse3()
+* - ihevc_intra_pred_luma_ver_ssse3()
+* - ihevc_intra_pred_luma_mode2_ssse3()
+* - ihevc_intra_pred_luma_mode_18_34_ssse3()
+* - ihevc_intra_pred_luma_mode_3_to_9_ssse3()
+* - ihevc_intra_pred_luma_mode_11_to_17_ssse3()
+* - ihevc_intra_pred_luma_mode_19_to_25_ssse3()
+* - ihevc_intra_pred_luma_mode_27_to_33_ssse3()
+* - ihevc_intra_pred_luma_ref_substitution_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdlib.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+
+#include <immintrin.h>
+
+/****************************************************************************/
+/* Constant Macros */
+/****************************************************************************/
+#define MAX_CU_SIZE 64
+#define BIT_DEPTH 8
+#define T32_4NT 128
+#define T16_4NT 64
+
+
+/****************************************************************************/
+/* Function Macros */
+/****************************************************************************/
+#define GET_BITS(y,x) (((y) >> (x)) & 1)
+
+/* tables to shuffle 8-bit values */
+
+
+/*****************************************************************************/
+/* global tables Definition */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Function Definition */
+/*****************************************************************************/
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for pu1_ref substitution
+*
+*
+* @par Description:
+* Reference substitution process for samples unavailable for prediction
+* Refer to section 8.4.4.2.2
+*
+* @param[in] pu1_top_left
+* UWORD8 pointer to the top-left
+*
+* @param[in] pu1_top
+* UWORD8 pointer to the top
+*
+* @param[in] pu1_left
+* UWORD8 pointer to the left
+*
+* @param[in] src_strd
+* WORD32 Source stride
+*
+* @param[in] nbr_flags
+* WORD32 neighbor availability flags
+*
+* @param[in] nt
+* WORD32 transform Block size
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] dst_strd
+* WORD32 Destination stride
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
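+
+/* Outline of the substitution performed below (scalar view, reference only):
+ *   1) copy the top-left, left and top neighbours into pu1_dst[0 .. 4*nt];
+ *   2) reverse substitution: if the run starting at the bottom-left is
+ *      unavailable, back-fill it from the first available sample above it;
+ *   3) forward substitution: walking from bottom-left to top-right, each
+ *      unavailable run copies the last available sample before it. */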
+
+void ihevc_intra_pred_luma_ref_substitution_ssse3(UWORD8 *pu1_top_left,
+ UWORD8 *pu1_top,
+ UWORD8 *pu1_left,
+ WORD32 src_strd,
+ WORD32 nt,
+ WORD32 nbr_flags,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd)
+{
+ UWORD8 pu1_ref;
+ WORD32 dc_val, i;
+ WORD32 total_samples = (4 * nt) + 1;
+ WORD32 two_nt = 2 * nt;
+
+ WORD32 three_nt = 3 * nt;
+ WORD32 get_bits;
+ WORD32 next;
+ WORD32 bot_left, left, top, tp_right, tp_left;
+
+ WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
+ UNUSED(dst_strd);
+
+ dc_val = 1 << (BIT_DEPTH - 1);
+
+
+    /* Neighbor flag structure (MSB ---> LSB):
+     *    Top-Left | Top-Right | Top | Left | Bottom-Left
+     *        1          4        4     4         4        (bits)
+     */
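+    /* Example (from the layout above): with every neighbour available,
+     * nbr_flags == 0x1FFFF.  The single-bit tests below suffice for
+     * nt <= 8, where each 4-bit field is either fully available or
+     * fully unavailable. */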
+ /* If no neighbor flags are present, fill the neighbor samples with DC value */
+ if(nbr_flags == 0)
+ {
+ for(i = 0; i < total_samples; i++)
+ {
+ pu1_dst[i] = dc_val;
+ }
+ }
+ else
+ {
+ /* Else fill the corresponding samples */
+ pu1_dst[two_nt] = *pu1_top_left;
+ for(i = 0; i < two_nt; i++)
+ pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
+ for(i = 0; i < two_nt; i++)
+ pu1_dst[two_nt + 1 + i] = pu1_top[i];
+
+ if(nt <= 8)
+ {
+ /* 1 bit extraction for all the neighboring blocks */
+ tp_left = (nbr_flags & 0x10000) >> 16;
+ bot_left = (nbr_flags & 0x8) >> 3;
+ left = (nbr_flags & 0x80) >> 7;
+ top = (nbr_flags & 0x100) >> 8;
+ tp_right = (nbr_flags & 0x1000) >> 12;
+
+ next = 1;
+
+            /* If bottom-left is not available, apply the reverse substitution process */
+ if(bot_left == 0)
+ {
+ WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
+
+ /* Check for the 1st available sample from bottom-left*/
+ while(!a_nbr_flag[next])
+ next++;
+
+ /* If Left, top-left are available*/
+ if(next <= 2)
+ {
+ idx = nt * next;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ else /* If top, top-right are available */
+ {
+                    /* idx is adjusted to copy one pixel for the top-left when the top-left itself is unavailable */
+ idx = (nt * (next - 1)) + 1;
+ pu1_ref = pu1_dst[idx];
+ for(i = 0; i < idx; i++)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* Forward Substitution Process */
+ /* If left is Unavailable, copy the last bottom-left value */
+ if(left == 0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[nt + i] = pu1_dst[nt - 1];
+ }
+ /* If top-left is Unavailable, copy the last left value */
+ if(tp_left == 0)
+ pu1_dst[two_nt] = pu1_dst[two_nt - 1];
+ /* If top is Unavailable, copy the last top-left value */
+ if(top == 0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
+ }
+            /* If top-right is unavailable, copy the last top value */
+ if(tp_right == 0)
+ {
+ for(i = 0; i < nt; i++)
+ pu1_dst[three_nt + 1 + i] = pu1_dst[three_nt];
+ }
+ }
+
+ if(nt == 16)
+ {
+ WORD32 nbr_flags_temp = 0;
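+            /* fold the nt == 16 flags so that, as in the nt == 32 case, bits 0-7
+             * each cover one 8-pel group and bit 8 is the single top-left pel;
+             * this matches the GET_BITS() walk over nbr_flags_temp below */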
+ nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
+ + ((nbr_flags & 0x300) >> 4)
+ + ((nbr_flags & 0x3000) >> 6)
+ + ((nbr_flags & 0x10000) >> 8);
+
+        /* compute trailing zeros in nbr_flags for the bottom-left substitution process (see section 8.4.4.2.2) */
+        /* each bit in nbr_flags covers 8 pels for bot_left, left, top and top-right, but only 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ nbr_id_from_bl = 32;
+
+ if(nbr_id_from_bl == 32)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags_temp >> 8) & 0x1))
+ {
+ nbr_id_from_bl++;
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
+ //nbr_id_from_bl += idx * 8;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ {
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T16_4NT) + 1))
+ {
+            /* Obtain the next unavailable index after reverse neighbor substitution */
+            /* Divide by 8 to obtain the original flag index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T16_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags_temp, 8);
+
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ for(i = 0; i < 8; i++)
+ pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
+ }
+
+
+ }
+
+ if(nt == 32)
+ {
+        /* compute trailing zeros in nbr_flags for the bottom-left substitution process (see section 8.4.4.2.2) */
+        /* each bit in nbr_flags covers 8 pels for bot_left, left, top and top-right, but only 1 pel for top-left */
+ {
+ nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
+
+ if(nbr_id_from_bl == 64)
+ {
+ /* for top left : 1 pel per nbr bit */
+ if(!((nbr_flags >> 16) & 0x1))
+ {
+ /* top left not available */
+ nbr_id_from_bl++;
+ /* top and top right; 8 pels per nbr bit */
+ nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
+ }
+ }
+ /* Reverse Substitution Process*/
+ if(nbr_id_from_bl)
+ {
+ /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
+ pu1_ref = pu1_dst[nbr_id_from_bl];
+ for(i = (nbr_id_from_bl - 1); i >= 0; i--)
+ pu1_dst[i] = pu1_ref;
+ }
+ }
+
+ /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
+ while(nbr_id_from_bl < ((T32_4NT) + 1))
+ {
+            /* Obtain the next unavailable index after reverse neighbor substitution */
+            /* Divide by 8 to obtain the original flag index */
+ frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
+
+ /* The Top-left flag is at the last bit location of nbr_flags*/
+ if(nbr_id_from_bl == (T32_4NT / 2))
+ {
+ get_bits = GET_BITS(nbr_flags, 16);
+ /* only pel substitution for TL */
+ if(!get_bits)
+ pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
+ }
+ else
+ {
+ get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
+ if(!get_bits)
+ {
+ /* 8 pel substitution (other than TL) */
+ pu1_ref = pu1_dst[nbr_id_from_bl - 1];
+ for(i = 0; i < 8; i++)
+ pu1_dst[nbr_id_from_bl + i] = pu1_ref;
+ }
+
+ }
+ nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
+ }
+ }
+
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for ref_filtering
+*
+*
+* @par Description:
+* Reference DC filtering for neighboring samples dependent on TU size and
+* mode Refer to section 8.4.4.2.3 in the standard
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @param[in] strong_intra_smoothing_enable_flag
+* WORD32 flag that enables strong intra smoothing for 32x32 blocks
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
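+
+/* Scalar form of the two filters applied below (reference only):
+ *   strong (bi-linear) path, taken when nt == 32, strong smoothing is
+ *   enabled and both gradient tests pass:
+ *     au1_flt[i]        = ((2*nt - i) * pu1_src[0]    + i * pu1_src[2*nt] + 32) >> 6;
+ *     au1_flt[i + 2*nt] = ((2*nt - i) * pu1_src[2*nt] + i * pu1_src[4*nt] + 32) >> 6;
+ *   default path, three-tap (1,2,1)/4 smoothing:
+ *     au1_flt[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
+ *   with au1_flt[0] and au1_flt[4*nt] left untouched in both cases. */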
+
+void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src,
+ WORD32 nt,
+ UWORD8 *pu1_dst,
+ WORD32 mode,
+ WORD32 strong_intra_smoothing_enable_flag)
+{
+ WORD32 filter_flag;
+ WORD32 i; /* Generic indexing variable */
+ WORD32 four_nt = 4 * nt;
+ UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
+ WORD32 bi_linear_int_flag = 0;
+ WORD32 abs_cond_left_flag = 0;
+ WORD32 abs_cond_top_flag = 0;
+ WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+ __m128i src_temp1, src_temp2, src_temp3, src_temp7;
+ __m128i src_temp4, src_temp5, src_temp6, src_temp8;
+
+ //WORD32 strong_intra_smoothing_enable_flag = 1;
+
+ filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+ if(0 == filter_flag)
+ {
+ if(pu1_src == pu1_dst)
+ {
+ return;
+ }
+ else
+ {
+ if(nt == 4)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ pu1_dst[four_nt] = pu1_src[four_nt];
+
+ }
+
+ else if(nt == 8)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+ else if(nt == 16)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+ else if(nt == 32)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+ pu1_dst[four_nt] = pu1_src[four_nt];
+ }
+
+ }
+ }
+
+ else
+ {
+        /* If strong intra smoothing is enabled and the transform size is 32 */
+ if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
+ {
+ /* Strong Intra Filtering */
+ abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
+ - (2 * pu1_src[3 * nt]))) < dc_val;
+ abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
+ - (2 * pu1_src[nt]))) < dc_val;
+
+ bi_linear_int_flag = ((1 == abs_cond_left_flag)
+ && (1 == abs_cond_top_flag));
+ }
+ /* Extremities Untouched*/
+ au1_flt[0] = pu1_src[0];
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+
+ /* Strong filtering of reference samples */
+ if(1 == bi_linear_int_flag)
+ {
+ au1_flt[2 * nt] = pu1_src[2 * nt];
+
+ for(i = 1; i < (2 * nt); i++)
+ au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
+
+ for(i = 1; i < (2 * nt); i++)
+ au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
+ }
+ else
+ {
+ __m128i const_value_8x16, zero_8x16b;
+
+ const_value_8x16 = _mm_set1_epi16(2);
+
+ au1_flt[0] = pu1_src[0];
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+
+ zero_8x16b = _mm_setzero_si128();
+
+            /* Apply the default three-tap (1,2,1)/4 smoothing to the reference samples */
+ for(i = 0; i < (four_nt); i += 16)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp2, 1);
+
+ src_temp1 = _mm_unpacklo_epi8(src_temp1, zero_8x16b);
+ src_temp2 = _mm_unpacklo_epi8(src_temp2, zero_8x16b);
+ src_temp3 = _mm_unpacklo_epi8(src_temp3, zero_8x16b);
+
+ src_temp2 = _mm_slli_epi16(src_temp2, 1);
+
+ src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
+ src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
+ src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);
+
+ src_temp1 = _mm_srai_epi16(src_temp1, 2);
+
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
+ src_temp5 = _mm_srli_si128(src_temp4, 1);
+ src_temp6 = _mm_srli_si128(src_temp5, 1);
+
+ src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
+ src_temp5 = _mm_unpacklo_epi8(src_temp5, zero_8x16b);
+ src_temp6 = _mm_unpacklo_epi8(src_temp6, zero_8x16b);
+
+ src_temp5 = _mm_slli_epi16(src_temp5, 1);
+
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
+ src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);
+
+ src_temp4 = _mm_srai_epi16(src_temp4, 2);
+
+ /* converting 16 bit to 8 bit */
+ src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);
+
+ _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
+ }
+ au1_flt[4 * nt] = pu1_src[4 * nt];
+ }
+
+ if(nt == 4)
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+ else if(nt == 8)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+ else if(nt == 16)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+
+ else if(nt == 32)
+ {
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
+ src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
+ src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
+ src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
+ src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
+
+ pu1_dst[four_nt] = au1_flt[four_nt];
+ }
+
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma planar
+*
+* @par Description:
+* Planar Intraprediction with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
+* to section 8.4.4.2.4 in the standard
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
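+
+/* Scalar sketch of the planar equation that the SSSE3 paths below vectorize.
+ * Kept out of the build with #if 0; the helper name is illustrative only,
+ * while CTZ() is the platform macro already used elsewhere in this file. */
+#if 0
+static void planar_scalar_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                 WORD32 dst_strd, WORD32 nt)
+{
+    WORD32 row, col;
+    WORD32 two_nt   = 2 * nt;
+    WORD32 three_nt = 3 * nt;
+    WORD32 log2nt   = CTZ(nt); /* nt is a power of two */
+
+    for(row = 0; row < nt; row++)
+    {
+        for(col = 0; col < nt; col++)
+        {
+            /* weighted average of the left/top references and the two corners */
+            pu1_dst[row * dst_strd + col] = (UWORD8)(
+                ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]
+               + (col + 1)      * pu1_ref[three_nt + 1]
+               + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
+               + (row + 1)      * pu1_ref[nt - 1]
+               + nt) >> (log2nt + 1));
+        }
+    }
+}
+#endif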
+
+
+void ihevc_intra_pred_luma_planar_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+
+ WORD32 row, col;
+ WORD32 two_nt, three_nt;
+ UWORD16 temp;
+
+ __m128i pu1_ref_16x8b, const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b, const_temp8_4x32b;
+ __m128i nt_row_16x8b, nt_row1_16x8b, nt_row2_16x8b, nt_row3_16x8b; //nt-1-row
+ __m128i row_16x8b, row1_16x8b, row2_16x8b, row3_16x8b; //row+1
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+
+ /* Planar filtering */
+ temp = pu1_ref[nt - 1];
+ temp = (temp << 8) | ((UWORD16)pu1_ref[three_nt + 1]);
+    /* setting values in registers */
+ pu1_ref_16x8b = _mm_set1_epi16(temp);
+ const_temp6_4x32b = _mm_set1_epi16(nt);
+
+
+
+    if(nt == 32)
+ {
+
+
+ const_temp4_4x32b = _mm_set1_epi16(0x0400);
+ const_temp1_4x32b = _mm_set1_epi16(0x0100);
+ const_temp8_4x32b = _mm_set1_epi16(0x0008);
+ //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
+ //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
+ nt_row_16x8b = _mm_set_epi16(0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f);
+ //(y+1) (x+1) ; x= 0..15 , y = row
+ //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
+ row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
+
+ for(row = 0; row < nt; row += 1)
+ {
+ __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+ __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+
+ __m128i src_temp_8x16b, src_temp1_8x16b;
+
+
+ res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
+
+ nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp8_4x32b);
+ row1_16x8b = _mm_add_epi16(row_16x8b, const_temp8_4x32b);
+ nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
+ row2_16x8b = _mm_add_epi16(row1_16x8b, const_temp8_4x32b);
+ nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp8_4x32b);
+ row3_16x8b = _mm_add_epi16(row2_16x8b, const_temp8_4x32b);
+            /* loading 32 8-bit reference pixels */
+ src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17));
+
+ res_temp4_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/
+ res_temp5_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=1*/
+ res_temp6_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=2*/
+ res_temp7_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=3*/
+
+ /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
+ res_temp_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
+ res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
+ res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
+ res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
+ /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
+ res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
+ res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
+ res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
+ res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
+
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
+ res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
+ res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
+            /* (res_temp + nt) */
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
+ res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
+ res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
+
+ res_temp_8x16b = _mm_srli_epi16(res_temp_8x16b, 6); //log2(32)+1
+ res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 6);
+ res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 6);
+ res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 6);
+
+ res_temp_8x16b = _mm_packus_epi16(res_temp_8x16b, res_temp1_8x16b);
+ res_temp1_8x16b = _mm_packus_epi16(res_temp2_8x16b, res_temp3_8x16b);
+
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), res_temp1_8x16b);
+
+
+ nt_row_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
+ row_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b);
+ }
+ }
+    else if(nt == 16)
+ {
+
+ const_temp4_4x32b = _mm_set1_epi16(0x0400);
+ const_temp1_4x32b = _mm_set1_epi16(0x0100);
+ const_temp8_4x32b = _mm_set1_epi16(0x0008);
+ //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
+ //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
+ nt_row_16x8b = _mm_set_epi16(0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0c, 0x0f0d, 0x0f0e, 0x0f0f);
+ //(y+1) (x+1) ; x= 0..15 , y = row
+ //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
+ row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
+
+ for(row = 0; row < nt; row += 2)
+ {
+ __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+ __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+
+ __m128i src_temp_8x16b;
+
+
+ res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
+ res_temp2_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
+
+
+ nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
+ row1_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b);
+ nt_row2_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp8_4x32b);
+ row2_16x8b = _mm_add_epi16(row_16x8b, const_temp8_4x32b);
+ nt_row3_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
+ row3_16x8b = _mm_add_epi16(row1_16x8b, const_temp8_4x32b);
+            /* loading sixteen 8-bit reference pixels */
+ src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+
+ res_temp4_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/
+ res_temp5_8x16b = _mm_unpacklo_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=1*/
+ res_temp6_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=2*/
+ res_temp7_8x16b = _mm_unpackhi_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=3*/
+
+ /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
+ res_temp_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
+ res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
+ res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
+ res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
+ /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
+ res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
+ res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
+ res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
+ res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
+
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
+ res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
+ res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
+            /* (res_temp + nt) */
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
+ res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
+ res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
+
+ res_temp_8x16b = _mm_srli_epi16(res_temp_8x16b, 5); //log2(16)+1
+ res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 5);
+ res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 5);
+ res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 5);
+
+ res_temp_8x16b = _mm_packus_epi16(res_temp_8x16b, res_temp2_8x16b);
+ res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
+
+ nt_row_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
+ row_16x8b = _mm_add_epi16(row1_16x8b, const_temp1_4x32b);
+ }
+ }
+ else if(nt == 8)
+ {
+
+
+ const_temp4_4x32b = _mm_set1_epi16(0x0400);
+ const_temp1_4x32b = _mm_set1_epi16(0x0100);
+ zero_8x16b = _mm_set1_epi32(0);
+
+ //(nt-1-y) (nt-1-x) ; x= 0..7 , y = row
+ //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
+ nt_row_16x8b = _mm_set_epi16(0x0700, 0x0701, 0x0702, 0x0703, 0x0704, 0x0705, 0x0706, 0x0707);
+ //(y+1) (x+1) ; x= 0..7 , y = row
+ //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
+ row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
+
+ for(row = 0; row < nt; row += 4)
+ {
+ __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
+ __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
+
+ __m128i src_temp_8x16b;
+
+
+ res_temp4_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
+ res_temp5_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
+ res_temp6_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 3 - row]);
+ res_temp7_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 4 - row]);
+
+ nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
+ row1_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b);
+ nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
+ row2_16x8b = _mm_add_epi16(row1_16x8b, const_temp1_4x32b);
+ nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp1_4x32b);
+ row3_16x8b = _mm_add_epi16(row2_16x8b, const_temp1_4x32b);
+            /* loading sixteen 8-bit reference pixels */
+ src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ res_temp4_8x16b = _mm_unpacklo_epi8(res_temp4_8x16b, src_temp_8x16b); /* row=0*/
+ res_temp5_8x16b = _mm_unpacklo_epi8(res_temp5_8x16b, src_temp_8x16b); /* row=1*/
+ res_temp6_8x16b = _mm_unpacklo_epi8(res_temp6_8x16b, src_temp_8x16b); /* row=2*/
+ res_temp7_8x16b = _mm_unpacklo_epi8(res_temp7_8x16b, src_temp_8x16b); /* row=3*/
+
+ /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
+ res_temp_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
+ res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
+ res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
+ res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
+ /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
+ res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
+ res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
+ res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
+ res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
+
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
+ res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
+ res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
+            /* (res_temp + nt) */
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
+ res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
+ res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
+
+            res_temp_8x16b = _mm_srli_epi16(res_temp_8x16b, 4); //log2(8)+1
+ res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 4);
+ res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 4);
+ res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 4);
+
+ res_temp_8x16b = _mm_packus_epi16(res_temp_8x16b, zero_8x16b);
+ res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+ res_temp2_8x16b = _mm_packus_epi16(res_temp2_8x16b, zero_8x16b);
+ res_temp3_8x16b = _mm_packus_epi16(res_temp3_8x16b, zero_8x16b);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), res_temp2_8x16b);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), res_temp3_8x16b);
+
+ nt_row_16x8b = _mm_sub_epi16(nt_row3_16x8b, const_temp1_4x32b);
+ row_16x8b = _mm_add_epi16(row3_16x8b, const_temp1_4x32b);
+ }
+ }
+ else
+ {
+
+        /* nt == 4 */
+ const_temp7_4x32b = _mm_set1_epi16(4);
+ const_temp4_4x32b = _mm_set1_epi16(nt - 1);
+ const_temp_4x32b = _mm_set1_epi16(pu1_ref[three_nt + 1]);
+ const_temp1_4x32b = _mm_set1_epi16(pu1_ref[nt - 1]);
+ zero_8x16b = _mm_set1_epi32(0);
+
+ for(row = 0; row < nt; row++)
+ {
+ __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
+ __m128i res_temp3_8x16b;
+
+ const_temp2_4x32b = _mm_set1_epi16(pu1_ref[two_nt - 1 - row]);
+ const_temp3_4x32b = _mm_set1_epi16((row + 1));
+
+
+ row_8x16b = _mm_set1_epi16((nt - 1 - row));
+
+ const_temp5_4x32b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+ col_8x16b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
+
+ /*(row + 1) * pu1_ref[nt - 1]*/
+ res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b);
+
+ /*(row + 1) * pu1_ref[nt - 1] + nt)*/
+ res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
+
+ for(col = 0; col < nt; col += 4)
+ {
+ __m128i src_temp_8x16b;
+ int temp1;
+
+                /* loading sixteen 8-bit reference pixels */
+ src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + col));
+
+                src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b); /* widen to 16 bit */
+
+ /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
+ res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b);
+
+ /*(col + 1) * pu1_ref[three_nt + 1]*/
+ res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b);
+
+ /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
+ res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b);
+
+ res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
+ res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
+
+                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 3); //log2(4)+1
+ res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
+
+ temp1 = _mm_cvtsi128_si32(res_temp1_8x16b);
+
+ *(WORD32 *)(&pu1_dst[(row * dst_strd) + col]) = temp1;
+
+ const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
+ col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
+ } /* inner loop ends here */
+ }
+ }
+
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma dc
+*
+* @par Description:
+* Intraprediction for DC mode with reference neighboring samples location
+* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
+* to section 8.4.4.2.5 in the standard
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
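+
+/* Scalar form of the DC computation vectorized below (reference only):
+ *   acc_dc = sum of pu1_ref[nt .. two_nt - 1]         (left column)
+ *          + sum of pu1_ref[two_nt + 1 .. three_nt]   (top row)
+ *   dc_val = (acc_dc + nt) >> (log2nt + 1);
+ * For nt < 32 the block boundary is additionally smoothed:
+ *   pu1_dst[0]              = (pu1_ref[two_nt - 1] + 2*dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
+ *   pu1_dst[col]            = (pu1_ref[two_nt + 1 + col] + 3*dc_val + 2) >> 2;   col = 1..nt-1
+ *   pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + 3*dc_val + 2) >> 2;   row = 1..nt-1 */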
+
+void ihevc_intra_pred_luma_dc_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 acc_dc;
+ WORD32 dc_val, two_dc_val, three_dc_val;
+ WORD32 row;
+ WORD32 log2nt = 5;
+ WORD32 two_nt, three_nt;
+ __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
+ __m128i src_temp8, src_temp10, src_temp2;
+ __m128i m_zero = _mm_setzero_si128();
+ __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+
+ switch(nt)
+ {
+ case 32:
+ log2nt = 5;
+ break;
+ case 16:
+ log2nt = 4;
+ break;
+ case 8:
+ log2nt = 3;
+ break;
+ case 4:
+ log2nt = 2;
+ break;
+ default:
+ break;
+ }
+ two_nt = 2 * nt;
+ three_nt = 3 * nt;
+
+ acc_dc = 0;
+ /* Calculate DC value for the transform block */
+
+
+
+ if(nt == 32)
+ {
+ __m128i temp;
+ WORD32 itr_count;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
+
+ src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+ src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
+ src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
+
+ src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+ src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
+ src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
+
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+        /* compute the DC value */
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ two_dc_val = 2 * dc_val;
+ three_dc_val = 3 * dc_val;
+
+ temp = _mm_set1_epi8(dc_val);
+
+ for(itr_count = 0; itr_count < 2; itr_count++)
+ {
+ /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);
+
+ pu1_dst += 16 * dst_strd;
+ }
+ }
+ else
+
+ {
+ __m128i sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+ /* DC filtering for the first top row and first left column */
+
+
+
+        if(nt == 4)
+ {
+ WORD32 temp1, temp2, temp3;
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ src_temp4 = _mm_unpacklo_epi8(src_temp3, m_zero);
+ src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);
+
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+            /* compute the DC value */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+
+            /* set (three_dc_val + 2) in all 16-bit lanes */
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
+ src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+
+ src_temp2 = _mm_packus_epi16(src_temp2, m_zero);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+
+ *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+ src_temp2 = _mm_insert_epi16(src_temp2, dc_val, 0);
+
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
+ src_temp3 = _mm_shuffle_epi8(src_temp2, sm1);
+ src_temp4 = _mm_shuffle_epi8(src_temp2, sm1);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp4);
+
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+            /* restore the first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+
+ }
+        else if(nt == 8)
+ {
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+
+ src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+            /* compute the DC value */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+            /* loading sixteen 8-bit reference pixels */
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
+ src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+ src_temp2 = _mm_packus_epi16(src_temp2, m_zero);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);
+
+ /* Fill the remaining rows with DC value*/
+
+ src_temp1 = _mm_set1_epi8(dc_val);
+ src_temp2 = _mm_set1_epi8(dc_val);
+ src_temp3 = _mm_set1_epi8(dc_val);
+ src_temp4 = _mm_set1_epi8(dc_val);
+ src_temp5 = _mm_set1_epi8(dc_val);
+ src_temp6 = _mm_set1_epi8(dc_val);
+ src_temp7 = _mm_set1_epi8(dc_val);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+            /* restore the first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+
+ }
+ else /* if nt == 16*/
+ {
+
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
+
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+
+ src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
+ src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
+
+ src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);
+ src_temp10 = _mm_unpacklo_epi8(src_temp10, m_zero);
+
+ src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
+
+ acc_dc = _mm_cvtsi128_si32(src_temp4);
+
+ acc_dc += pu1_ref[three_nt];
+ acc_dc -= pu1_ref[two_nt];
+
+            /* compute the DC value */
+
+ dc_val = (acc_dc + nt) >> (log2nt + 1);
+
+ three_dc_val = 3 * dc_val;
+ src_temp1 = _mm_set1_epi16(three_dc_val + 2);
+ two_dc_val = 2 * dc_val;
+
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
+ src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
+ src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
+ /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
+ src_temp2 = _mm_srli_epi16(src_temp2, 2);
+ src_temp10 = _mm_srli_epi16(src_temp10, 2);
+
+ src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
+
+ /* Fill the remaining rows with DC value*/
+ src_temp1 = _mm_set1_epi8(dc_val);
+ src_temp2 = _mm_set1_epi8(dc_val);
+ src_temp3 = _mm_set1_epi8(dc_val);
+ src_temp4 = _mm_set1_epi8(dc_val);
+ src_temp5 = _mm_set1_epi8(dc_val);
+ src_temp6 = _mm_set1_epi8(dc_val);
+ src_temp7 = _mm_set1_epi8(dc_val);
+
+            /* rows 1 to 15 each get the full DC row (row 0 was written above);
+             * the stores run once, no loop is needed */
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);
+
+ }
+
+        /* restore first value */
+ pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
+ >> 2);
+
+ for(row = 1; row < nt; row++)
+ pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
+ >> 2;
+
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for the luma horizontal mode.
+*
+* @par Description:
+* Horizontal intra prediction (mode 10) from the reference samples pointed
+* to by 'pu1_ref' into the TU block pointed to by 'pu1_dst'. Refer to
+* section 8.4.4.2.6 of the standard (special case).
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
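+/* A scalar sketch (for reference only) of the horizontal (mode 10)
+ * prediction the intrinsics below implement; the per-pixel formula is the
+ * one quoted in the comments of the SIMD path:
+ *
+ *     for(row = 0; row < nt; row++)
+ *         for(col = 0; col < nt; col++)
+ *             pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
+ *
+ * For nt < 32, row 0 is additionally filtered as
+ * pu1_ref[two_nt - 1] + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1).
+ */
+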
+void ihevc_intra_pred_luma_horz_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row;
+ WORD32 two_nt;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+ two_nt = 2 * nt;
+
+
+ if(nt == 32)
+ {
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+ __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+ __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
+
+ for(row = 0; row < nt; row += 16)
+ {
+ {
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));
+
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp1, 2);
+ src_temp4 = _mm_srli_si128(src_temp1, 3);
+ src_temp5 = _mm_srli_si128(src_temp1, 4);
+ src_temp6 = _mm_srli_si128(src_temp1, 5);
+ src_temp7 = _mm_srli_si128(src_temp1, 6);
+ src_temp8 = _mm_srli_si128(src_temp1, 7);
+
+ src_temp9 = _mm_srli_si128(src_temp1, 8);
+ src_temp10 = _mm_srli_si128(src_temp1, 9);
+ src_temp11 = _mm_srli_si128(src_temp1, 10);
+ src_temp12 = _mm_srli_si128(src_temp1, 11);
+ src_temp13 = _mm_srli_si128(src_temp1, 12);
+ src_temp14 = _mm_srli_si128(src_temp1, 13);
+ src_temp15 = _mm_srli_si128(src_temp1, 14);
+ src_temp16 = _mm_srli_si128(src_temp1, 15);
+
+ src_temp8 = _mm_shuffle_epi8(src_temp8, sm);
+ src_temp7 = _mm_shuffle_epi8(src_temp7, sm);
+ src_temp6 = _mm_shuffle_epi8(src_temp6, sm);
+ src_temp5 = _mm_shuffle_epi8(src_temp5, sm);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
+ src_temp3 = _mm_shuffle_epi8(src_temp3, sm);
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm);
+ src_temp1 = _mm_shuffle_epi8(src_temp1, sm);
+
+ src_temp16 = _mm_shuffle_epi8(src_temp16, sm);
+ src_temp15 = _mm_shuffle_epi8(src_temp15, sm);
+ src_temp14 = _mm_shuffle_epi8(src_temp14, sm);
+ src_temp13 = _mm_shuffle_epi8(src_temp13, sm);
+ src_temp12 = _mm_shuffle_epi8(src_temp12, sm);
+ src_temp11 = _mm_shuffle_epi8(src_temp11, sm);
+ src_temp10 = _mm_shuffle_epi8(src_temp10, sm);
+ src_temp9 = _mm_shuffle_epi8(src_temp9, sm);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);
+
+ }
+
+ }
+
+ }
+ else
+
+ {
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
+ __m128i src_temp10, zero_8x16b, src_temp7;
+
+ /* DC filtering for the first top row and first left column */
+
+ zero_8x16b = _mm_set1_epi16(0);
+
+ /*Filtering done for the 1st row */
+
+ src_temp2 = _mm_set1_epi16(pu1_ref[two_nt - 1]);
+ src_temp10 = _mm_set1_epi16(pu1_ref[two_nt]);
+
+        /* loading 16 8-bit pixels */
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
+
+ /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
+ src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);
+
+ /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+ src_temp3 = _mm_srai_epi16(src_temp3, 1);
+
+ /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
+ src_temp3 = _mm_add_epi16(src_temp2, src_temp3);
+
+ if(nt == 4)
+ {
+ int temp1, temp2, temp3;
+ src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
+ temp1 = _mm_cvtsi128_si32(src_temp3);
+
+ *(WORD32 *)(&pu1_dst[0]) = temp1;
+
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp4);
+
+ /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
+
+ }
+ else if(nt == 8)
+ {
+ src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);
+
+
+ src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
+ src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
+ src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
+ src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);
+
+ /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);
+
+ }
+ else if(nt == 16)
+ {
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
+ src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
+ //src_temp4 = _mm_cvtepu8_epi16 (src_temp4);
+
+ src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
+ src_temp10 = _mm_srai_epi16(src_temp10, 1);
+ src_temp10 = _mm_add_epi16(src_temp2, src_temp10);
+
+ src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);
+
+ /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
+ src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
+ src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
+ src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
+ src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
+ src_temp10 = _mm_set1_epi8(pu1_ref[two_nt - 9]);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);
+
+ src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 10]);
+ src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 11]);
+ src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 12]);
+ src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 13]);
+ src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 14]);
+ src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 15]);
+ src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 16]);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);
+
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for the luma vertical mode.
+*
+* @par Description:
+* Vertical intra prediction (mode 26) from the neighbouring reference
+* samples pointed to by 'pu1_ref' into the TU block pointed to by
+* 'pu1_dst'. Refer to section 8.4.4.2.6 of the standard (special case).
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
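+/* A scalar sketch (for reference only) of the vertical (mode 26) prediction
+ * the intrinsics below implement:
+ *
+ *     for(row = 0; row < nt; row++)
+ *         for(col = 0; col < nt; col++)
+ *             pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];
+ *
+ * For nt < 32, column 0 is additionally filtered, as in the CLIP_U8()
+ * loop at the end of the function.
+ */
+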
+void ihevc_intra_pred_luma_ver_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD16 s2_predpixel;
+ WORD32 two_nt = 2 * nt;
+ __m128i src_temp0, src_temp2;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+
+ if(nt == 32)
+ {
+ __m128i temp1, temp2;
+ WORD32 itr_count;
+
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+ temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
+
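+        /* the 32 top reference samples are loaded once above and replicated */
+        /* into every destination row; no per-row computation is needed      */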
+ for(itr_count = 0; itr_count < 2; itr_count++)
+ {
+            /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col]; */
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
+
+ pu1_dst += 16 * dst_strd;
+ }
+ }
+ else
+ {
+ /* Replication to next columns*/
+
+ if(nt == 4)
+ {
+ int temp1;
+
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ temp1 = _mm_cvtsi128_si32(src_temp2);
+
+            /* storing the same 4 8-bit pixel values in each of the 4 rows */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp1;
+
+ }
+ else if(nt == 8)
+ {
+
+ src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp0);
+ _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp0);
+
+
+ }
+ else if(nt == 16)
+ {
+ for(row = 0; row < nt; row += 8)
+ {
+
+ src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp0);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp0);
+
+ }
+
+ }
+
+ /*Filtering done for the 1st column */
+ for(row = nt - 1; row >= 0; row--)
+ {
+ s2_predpixel = pu1_ref[two_nt + 1]
+ + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
+ pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
+ }
+
+
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 2.
+*
+* @par Description:
+* Intra prediction for mode 2 (south-west angle) from the neighbouring
+* reference samples pointed to by 'pu1_ref' into the TU block pointed to
+* by 'pu1_dst'. Refer to section 8.4.4.2.6 of the standard.
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
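+/* A scalar sketch (for reference only) of mode 2 prediction; the index
+ * formula is the one quoted in the comments of the SIMD path:
+ *
+ *     for(row = 0; row < nt; row++)
+ *         for(col = 0; col < nt; col++)
+ *             pu1_dst[(row * dst_strd) + col] =
+ *                 pu1_ref[two_nt - row - (col + 1) - 1];
+ */
+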
+void ihevc_intra_pred_luma_mode2_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt = 2 * nt;
+
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+ __m128i sm1, sm2, sm3;
+ UNUSED(src_strd);
+ UNUSED(mode);
+
+
+ sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY1[0]);
+ sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY2[0]);
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY3[0]);
+
+    /* For the 45 degree angle (mode 2), samples are replicated along the */
+    /* diagonal; intra_pred_ang = tan(angle) in Q5 format                 */
+
+ if(nt == 4)
+ {
+ int temp1, temp2, temp3, temp4;
+
+ /*pu1_ref[two_nt - row - (col+1) - 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 8));
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp1, 2);
+ src_temp4 = _mm_srli_si128(src_temp1, 3);
+
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm1);
+ src_temp3 = _mm_shuffle_epi8(src_temp3, sm1);
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
+ src_temp1 = _mm_shuffle_epi8(src_temp1, sm1);
+
+ temp1 = _mm_cvtsi128_si32(src_temp4);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp2);
+ temp4 = _mm_cvtsi128_si32(src_temp1);
+
+        /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - row - (col + 1) - 1];*/
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+
+ }
+ else if(nt == 8)
+ {
+ /*pu1_ref[two_nt - row - (col+1) - 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16));
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp1, 2);
+ src_temp4 = _mm_srli_si128(src_temp1, 3);
+ src_temp5 = _mm_srli_si128(src_temp1, 4);
+ src_temp6 = _mm_srli_si128(src_temp1, 5);
+ src_temp7 = _mm_srli_si128(src_temp1, 6);
+ src_temp8 = _mm_srli_si128(src_temp1, 7);
+
+ src_temp1 = _mm_shuffle_epi8(src_temp1, sm2);
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm2);
+ src_temp3 = _mm_shuffle_epi8(src_temp3, sm2);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm2);
+ src_temp5 = _mm_shuffle_epi8(src_temp5, sm2);
+ src_temp6 = _mm_shuffle_epi8(src_temp6, sm2);
+ src_temp7 = _mm_shuffle_epi8(src_temp7, sm2);
+ src_temp8 = _mm_shuffle_epi8(src_temp8, sm2);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+ }
+ else
+ {
+ for(row = 0; row < nt; row += 8)
+ {
+ for(col = 0; col < nt; col += 16)
+ { /*pu1_ref[two_nt - row - (col+1) - 1]*/
+
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0) - (col + 16) - 1));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1) - (col + 16) - 1));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2) - (col + 16) - 1));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3) - (col + 16) - 1));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4) - (col + 16) - 1));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5) - (col + 16) - 1));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6) - (col + 16) - 1));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7) - (col + 16) - 1));
+
+ src_temp1 = _mm_shuffle_epi8(src_temp1, sm3);
+ src_temp2 = _mm_shuffle_epi8(src_temp2, sm3);
+ src_temp3 = _mm_shuffle_epi8(src_temp3, sm3);
+ src_temp4 = _mm_shuffle_epi8(src_temp4, sm3);
+ src_temp5 = _mm_shuffle_epi8(src_temp5, sm3);
+ src_temp6 = _mm_shuffle_epi8(src_temp6, sm3);
+ src_temp7 = _mm_shuffle_epi8(src_temp7, sm3);
+ src_temp8 = _mm_shuffle_epi8(src_temp8, sm3);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), src_temp8);
+ }
+ }
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 18 & mode 34.
+*
+* @par Description:
+* Intra prediction for mode 34 (north-east angle) and mode 18 (north-west
+* angle) from the neighbouring reference samples pointed to by 'pu1_ref'
+* into the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
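+/* A scalar sketch (for reference only): mode 34 uses angle +32 (idx = row + 1)
+ * and mode 18 uses angle -32 (idx = -(row + 1)), so with the formula quoted
+ * in the comments below,
+ *
+ *     pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + col + idx + 1];
+ *
+ * every destination row is a plain copy of a diagonally shifted reference
+ * run, which is why this function reduces to loads and stores.
+ */
+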
+void ihevc_intra_pred_luma_mode_18_34_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD32 two_nt = 2 * nt;
+ __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
+ UNUSED(src_strd);
+ if(mode == 34)
+ {
+ if(nt == 4)
+ {
+
+ int temp1, temp2, temp3, temp4;
+
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
+
+ temp1 = _mm_cvtsi128_si32(src_temp1);
+ temp2 = _mm_cvtsi128_si32(src_temp2);
+ temp3 = _mm_cvtsi128_si32(src_temp3);
+ temp4 = _mm_cvtsi128_si32(src_temp4);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + col + idx + 1];*/
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+ }
+ else if(nt == 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 6));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 7));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 8));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 9));
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
+
+ }
+ else if(nt == 16)
+ {
+ for(row = 0; row < nt; row += 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 0) + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 1) + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 2) + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 3) + 2));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 4) + 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 5) + 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 6) + 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 7) + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);
+
+
+ }
+ }
+ else
+ {
+ __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+ for(row = 0; row < nt; row += 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 0) + 2));
+ src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 16) + 2));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 0) + 2));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 16) + 2));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 0) + 2));
+ src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 16) + 2));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 0) + 2));
+ src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 16) + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 0) + 2));
+ src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 16) + 2));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 0) + 2));
+ src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 16) + 2));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 0) + 2));
+ src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 16) + 2));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 0) + 2));
+ src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 16) + 2));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+ pu1_ref += 8;
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+ }
+ else
+ {
+ if(nt == 4)
+ {
+ int temp1, temp2, temp3, temp4;
+
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3));
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp1, 2);
+ src_temp4 = _mm_srli_si128(src_temp1, 3);
+
+ temp1 = _mm_cvtsi128_si32(src_temp4);
+ temp2 = _mm_cvtsi128_si32(src_temp3);
+ temp3 = _mm_cvtsi128_si32(src_temp2);
+ temp4 = _mm_cvtsi128_si32(src_temp1);
+
+            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + col + idx + 1];*/
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
+
+ }
+ else if(nt == 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7));
+ src_temp2 = _mm_srli_si128(src_temp1, 1);
+ src_temp3 = _mm_srli_si128(src_temp1, 2);
+ src_temp4 = _mm_srli_si128(src_temp1, 3);
+ src_temp5 = _mm_srli_si128(src_temp1, 4);
+ src_temp6 = _mm_srli_si128(src_temp1, 5);
+ src_temp7 = _mm_srli_si128(src_temp1, 6);
+ src_temp8 = _mm_srli_si128(src_temp1, 7);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
+ _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
+
+
+ }
+ else if(nt == 16)
+ {
+ for(row = 0; row < nt; row += 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0)));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1)));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2)));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3)));
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4)));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5)));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6)));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7)));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);
+
+ }
+
+ }
+ else
+ {
+ __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
+ for(row = 0; row < nt; row += 8)
+ {
+ /*pu1_ref[two_nt + col + idx + 1]*/
+ src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 0));
+ src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 16));
+ src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 0));
+ src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 16));
+ src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 0));
+ src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 16));
+ src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 0));
+ src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 16));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
+
+ src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 0));
+ src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 16));
+ src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 0));
+ src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 16));
+ src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 0));
+ src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 16));
+ src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 0));
+ src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 16));
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
+
+ pu1_ref -= 8;
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma modes 3 to 9
+*
+* @par Description:
+* Intra prediction for modes 3 to 9 (positive-angle horizontal modes) from
+* the neighbouring reference samples pointed to by 'pu1_ref' into the TU
+* block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+* UWORD8 pointer to the source
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
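+/* A scalar sketch (for reference only) of the per-pixel computation for
+ * modes 3 to 9, assembled from the comments in the intrinsic path below
+ * (the SIMD code computes eight columns at a time and transposes the
+ * result before storing; the exact per-row reference offsets follow the
+ * loads in the code):
+ *
+ *     pos   = ((row + 1) * intra_pred_ang);
+ *     idx   = pos >> 5;
+ *     fract = pos & (31);
+ *     ref_main_idx = two_nt - idx;
+ *     pred = ((32 - fract) * pu1_ref[ref_main_idx]
+ *             + fract * pu1_ref[ref_main_idx + 1] + 16) >> 5;
+ */
+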
+void ihevc_intra_pred_luma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row, col;
+ WORD32 two_nt = 2 * nt;
+ WORD32 intra_pred_ang;
+
+
+ __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b, zero_8x16b;
+ __m128i fract_4x32b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+ UNUSED(src_strd);
+
+ /* Intra Pred Angle according to the mode */
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, each destination sample is obtained */
+    /* by interpolating between two neighbouring samples based on distance   */
+
+ const_temp_4x32b = _mm_set1_epi16(16);
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+ const_temp4_4x32b = _mm_set1_epi32(4);
+
+ two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
+
+
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+ if(nt == 4)
+ {
+
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ int temp11, temp21, temp31, temp41;
+ // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, sign_8x16b;
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+ zero_8x16b = _mm_setzero_si128();
+ sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+        /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+        /* idx = pos >> 5;  ref_main_idx = two_nt - idx */
+ ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
+ fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
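+
+        /* each temp*_8x16b register now holds interleaved (32 - fract,    */
+        /* fract) byte pairs for one column, so _mm_maddubs_epi16 later    */
+        /* computes (32 - fract) * pu1_ref[idx] + fract * pu1_ref[idx + 1] */
+        /* in each 16-bit lane                                             */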
+
+ ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/
+
+        /* loading 16 8-bit pixels */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
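+        /* interleave and shift below to transpose the 4x4 result block so  */
+        /* that each 32-bit word holds one destination row                  */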
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
+ src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
+
+ temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+ temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+        /* storing 4 8-bit pixel values per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+ }
+
+ else if(nt == 16 || nt == 32)
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(two_nt);
+
+ for(col = 0; col < nt; col += 8)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+ fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
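+
+            /* as in the nt == 4 path, each temp*_8x16b register holds      */
+            /* interleaved (32 - fract, fract) byte pairs for one column,   */
+            /* ready for _mm_maddubs_epi16                                  */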
+
+            /* idx = pos >> 5;  ref_main_idx = two_nt - idx */
+ ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/
+
+ for(row = 0; row < nt; row += 8)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=5*/
+                src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=6*/
+                src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=7*/
+                src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=8*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/
+ src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/
+ src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
+
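+                /* the epi8/epi16/epi32 unpack cascade below transposes the */
+                /* 8x8 block of per-column results so that each register    */
+                /* ends up holding one destination row for the 64-bit stores */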
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b); /* row=7*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b); /* row=6*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b); /* row=4*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b); /* row=3*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b); /* row=2*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b); /* row=0*/
+
+ }
+ }
+ }
+ else
+ {
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5;  ref_main_idx = two_nt - idx */
+ ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
+ fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+ pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/
+ pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/
+ pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/
+ pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/
+
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+            /* loading 16 8-bit pixels */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
+
+            /* loading 16 8-bit pixels */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=5*/
+                src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=6*/
+                src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=7*/
+                src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=8*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+            src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+            src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+            src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+            src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+            src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+            src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
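+            /*
+             * The unpack cascade above is an 8x8 byte transpose; it leaves
+             * the output rows interleaved across registers, which is why
+             * the stores below walk the registers in a permuted order.
+             */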
+
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b); /* row=3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b); /* row=4*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b); /* row=6*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b); /* row=7*/
+
+ }
+ }
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 11 to mode 17
+*
+* @par Description:
+*    Intra prediction for modes 11 to 17 (negative angle, horizontal modes),
+*    using the neighboring reference samples pointed to by 'pu1_ref' to
+*    predict the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_intra_pred_luma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+    /* This function and ihevc_intra_pred_luma_mode_19_to_25 are the same  */
+    /* except for the ref main & side sample assignment; they can be       */
+    /* combined for optimization                                           */
+
+ WORD32 row, col, k;
+ WORD32 two_nt;
+ WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
+ WORD32 ref_idx;
+
+ __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
+ __m128i fract_4x32b, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
+
+
+ UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
+ UWORD8 *ref_main;
+ UWORD8 *ref_temp;
+ UNUSED(src_strd);
+ inv_ang_sum = 128;
+ two_nt = 2 * nt;
+ ref_temp = ref_tmp + 1;
+ ref_main = ref_temp + nt - 1;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+    /* For angles other than 45 degrees, interpolate between 2 neighboring */
+    /* samples, weighted by distance, to obtain each destination sample    */
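+    /*
+     * Editor's sketch (illustrative only, not shipped code; UWORD8/WORD32
+     * mirror the project typedefs): the scalar two-tap filter that every
+     * SIMD branch below vectorizes, assembled from the pos/idx/fract and
+     * rounding comments in this file.  Boundary offsets into ref_main are
+     * simplified; dst_step is dst_strd for these horizontal modes (the
+     * filter runs down a column) and would be 1 for vertical modes.
+     *
+     *   static void sketch_pred_line(const UWORD8 *ref_main, UWORD8 *dst,
+     *                                WORD32 dst_step, WORD32 nt,
+     *                                WORD32 line, WORD32 ang)
+     *   {
+     *       WORD32 pos   = (line + 1) * ang;
+     *       WORD32 idx   = pos >> 5;
+     *       WORD32 fract = pos & 31;
+     *       WORD32 i;
+     *
+     *       for(i = 0; i < nt; i++)
+     *           dst[i * dst_step] =
+     *               (UWORD8)(((32 - fract) * ref_main[idx + i]
+     *                         + fract * ref_main[idx + i + 1] + 16) >> 5);
+     *   }
+     */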
+ const_temp_4x32b = _mm_set1_epi16(16);
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+ const_temp4_4x32b = _mm_set1_epi32(4);
+
+ two_nt_4x32b = _mm_set1_epi32(1);
+
+
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi32(4, 3, 2, 1);
+
+ if(nt == 4)
+ {
+
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ int temp11, temp21, temp31, temp41;
+// WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, zero_8x16b, sign_8x16b;
+
+ /* Intermediate reference samples for negative angle modes */
+        /* This has to be removed during optimization */
+ /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
+
+ ref_main = ref_temp + nt - 1;
+ for(k = 0; k < nt + 1; k++)
+ ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
+
+ ref_main = ref_temp + nt - 1;
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ zero_8x16b = _mm_setzero_si128();
+
+ row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+        /* SIMD optimization can be done using a look-up table for the loop */
+        /* For negative angles, derive the main reference samples from the  */
+        /* side reference samples; refer to section 8.4.4.2.6               */
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
+ }
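+        /*
+         * The loop above projects side samples onto the main reference for
+         * the negative part of the angle: inv_ang is an 8.8 fixed-point
+         * step (inv_ang_sum starts at 128, i.e. 0.5, for rounding), so
+         * inv_ang_sum >> 8 selects the nearest side sample for each
+         * extension position down to ref_idx = (nt * intra_pred_ang) >> 5.
+         */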
+
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+ sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+        /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+        /* idx = pos >> 5; */
+ ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
+ fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
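+        /*
+         * Each 16-bit lane of temp1..temp4 now holds the byte pair
+         * (fract, 32 - fract) for one column; _mm_maddubs_epi16 below
+         * multiplies those weights against the adjacent reference byte
+         * pair gathered by the sm3 shuffle and sums the two products,
+         * evaluating the whole two-tap filter in one instruction.
+         */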
+
+ ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/
+
+        /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
+ src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
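+        /*
+         * The unpack8/unpack16 stages above transpose the 4x4 result from
+         * column order to row order; the 4-byte shifts line up one output
+         * row per 32-bit extraction below.
+         */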
+
+ temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
+ temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
+
+        /* storing one 4-pixel row per WORD32 write */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+ }
+
+ else if(nt == 32)
+ {
+
+
+ __m128i temp1, temp2, temp3, temp11, temp12;
+ __m128i src_values0, src_values1;
+ /* Intermediate reference samples for negative angle modes */
+
+ ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+ temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
+ temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+
+        /* For negative angles, derive the main reference samples from the side */
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
+ temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+ src_values1 = _mm_shuffle_epi8(src_values1, temp2);
+ src_values0 = _mm_shuffle_epi8(src_values0, temp12);
+ src_values1 = _mm_shuffle_epi8(src_values1, temp11);
+
+ temp1 = _mm_shuffle_epi8(temp1, temp2);
+ temp3 = _mm_shuffle_epi8(temp3, temp2);
+
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
+ _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
+ _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
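+        /*
+         * Reference assembly for nt == 32 (assumption: IHEVCE_SHUFFLEMASKY3
+         * is a byte-reversal mask): the left column is reversed into the
+         * main array, while inv_angle_shuffle[17 - mode] supplies
+         * precomputed shuffle controls that scatter the reversed above
+         * (side) samples into the projected extension below ref_main,
+         * replacing the scalar inverse-angle loop used in the nt == 4 path.
+         */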
+
+
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ for(col = 0; col < nt; col += 8)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+ fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ for(row = 0; row < nt; row += 8)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
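+                /*
+                 * Horizontal-mode results are computed along columns, so
+                 * the unpack8/unpack16/unpack32 cascade plus the 8-byte
+                 * shifts above form an 8x8 byte transpose, turning eight
+                 * column vectors into the eight row vectors stored below.
+                 */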
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=7*/
+
+ }
+ }
+ }
+ else if(nt == 16)
+ {
+
+ __m128i temp1, temp2, temp11, src_values0;
+ /* Intermediate reference samples for negative angle modes */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
+ temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+ temp1 = _mm_shuffle_epi8(temp1, temp2);
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+ _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ for(col = 0; col < nt; col += 8)
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+ fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ for(row = 0; row < nt; row += 8)
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
+
+ src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
+ src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
+ src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
+ src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+ src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+ src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=3*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=4*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=5*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=6*/
+
+                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=7*/
+
+ }
+ }
+ }
+ else
+ {
+
+
+ __m128i temp1, temp2, temp11, src_values0;
+ /* Intermediate reference samples for negative angle modes */
+        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
+ ref_temp[two_nt - 1] = pu1_ref[nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
+
+        /* For negative angles, derive the main reference samples from the side */
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
+ temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp2);
+ temp1 = _mm_shuffle_epi8(temp1, temp2);
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+ src_values0 = _mm_srli_si128(src_values0, 8);
+
+ _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+ _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp4_4x32b = _mm_set1_epi16(8);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ {
+ WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
+ WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
+ //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
+
+ __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
+
+ __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
+ __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* fract = pos & (31); */
+ fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+            /* idx = pos >> 5; */
+ ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+ /*(32 - fract) */
+ fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
+
+ fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
+ fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
+
+ fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
+ fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
+
+ fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
+ fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
+
+ temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00);
+ temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55);
+ temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa);
+ temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff);
+
+ temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00);
+ temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55);
+ temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa);
+ temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff);
+
+ pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
+ pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
+ pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
+ pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/
+
+            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=4*/
+            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=5*/
+            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=6*/
+            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=7*/
+
+ {
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
+ __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
+
+                /* loading 16 8-bit pixels */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
+
+                /* loading 16 8-bit pixels */
+                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=4*/
+                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=5*/
+                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=6*/
+                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=7*/
+
+ src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
+ src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
+ src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
+ src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
+
+                src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
+                src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
+                src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
+                src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
+ src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
+ src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
+ src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
+
+ /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
+ src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
+ src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
+ src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
+ src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
+ src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
+ src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
+ src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
+                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
+                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
+                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
+ src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
+ src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
+ src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=4*/
+                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=5*/
+                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=6*/
+                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=7*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
+ src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
+
+ /* converting 16 bit to 8 bit */
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
+ src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
+
+ src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
+
+ src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
+
+ src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
+ src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+
+ src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
+ src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
+
+ src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
+ src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
+
+ src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+ src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+ src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b); /* row=0*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b); /* row=1*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b); /* row=2*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b); /* row=3*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b); /* row=4*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b); /* row=5*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b); /* row=6*/
+
+ _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b); /* row=7*/
+
+ }
+ }
+ }
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 19 to mode 25
+*
+* @par Description:
+*    Intra prediction for modes 19 to 25 (negative angle, vertical modes),
+*    using the neighboring reference samples pointed to by 'pu1_ref' to
+*    predict the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_luma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+
+ WORD32 row, k;
+ WORD32 two_nt, intra_pred_ang;
+ WORD32 inv_ang, inv_ang_sum;
+ //WORD32 ref_main_idx, pos, fract, idx;
+ WORD32 ref_idx;
+ UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
+ UWORD8 *ref_main, *ref_temp;
+
+ __m128i /*fract_8x16b,*/ const_temp_8x16b, sm3;
+ __m128i temp1, temp2, temp3, temp4;
+ __m128i temp11, temp12, temp13, temp14;
+ UNUSED(src_strd);
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+ inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
+
+ /* Intermediate reference samples for negative angle modes */
+    /* This has to be removed during optimization */
+    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
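+    /*
+     * Unlike modes 11 to 17, these vertical modes predict along rows, so
+     * the interpolated vectors are stored to pu1_dst directly with no
+     * transpose stage; the scalar model is the same two-tap filter
+     * sketched in ihevc_intra_pred_luma_mode_11_to_17_ssse3 with
+     * dst_step = 1.
+     */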
+ ref_temp = ref_tmp + 1;
+ ref_main = ref_temp + nt - 1;
+
+
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
+
+
+
+ const_temp_8x16b = _mm_set1_epi16(16);
+
+ if(nt == 32)
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+ WORD32 col = 0;
+
+ /* Intermediate reference samples for negative angle modes */
+        /* This has to be removed during optimization */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+ temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
+
+        /* SIMD optimization can be done using a look-up table for the loop */
+        /* For negative angles, derive the main reference samples from the  */
+        /* side reference samples; refer to section 8.4.4.2.6               */
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
+ temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+ src_values1 = _mm_shuffle_epi8(src_values1, temp12);
+
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
+ _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
+ _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
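+        /*
+         * For these vertical modes the above row is already in main-array
+         * order, so it is copied straight into ref_temp; only the left
+         * (side) column goes through the inv_angle_shuffle[mode - 19]
+         * masks to build the projected extension below ref_main.
+         */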
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
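+            /*
+             * The eight projected start indices are spilled from
+             * src_values12 to the ref_main_idx array so the column loop
+             * below can form scalar addresses for unaligned loads from
+             * ref_main.
+             */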
+ for(col = 0; col < nt; col += 16)
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 16 8-bit pixel values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing 16 8-bit pixel values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+
+ }
+ else if(nt == 16) /* for nt = 16 case */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+
+ /* Intermediate reference samples for negative angle modes */
+        /* For vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+ temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+
+ _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
+ _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* idx = pos >> 5; */
+ src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                    /* storing sixteen 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                    /* storing sixteen 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+ }
+ else if(nt == 8)
+ {
+
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+
+ /* Intermediate reference samples for negative angle modes */
+        /* For these vertical modes, (ref main = ref above) (ref side = ref left) */
+ ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
+ temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
+
+        /* For negative angles, derive the main reference samples from the side reference */
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/
+
+ temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, temp11);
+ src_values0 = _mm_srli_si128(src_values0, 8);
+ _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
+ _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
+
+
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+
+
+ two_nt_4x32b = _mm_set1_epi16(1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); /* col = 24-31 */
+ src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); /* col = 32-39 */
+ src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); /* col = 40-47 */
+ src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); /* col = 48-55 */
+ src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); /* col = 56-63*/
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+ src_values4 = _mm_packus_epi16(src_values4, src_values5);
+ src_values6 = _mm_packus_epi16(src_values6, src_values7);
+ src_values5 = _mm_srli_si128(src_values4, 8);
+ src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* storing eight 8-bit pixel values per row */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/
+ }
+ }
+    else /* nt == 4 */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b, sign_8x16b;
+
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ for(k = 0; k < (nt + 1); k++)
+ ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
+ ref_idx = (nt * intra_pred_ang) >> 5;
+ inv_ang_sum = 128;
+
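+        /* inv_ang is the inverse angle in 8.8 fixed point (scaled by 256), so
+         * (inv_ang_sum >> 8) projects side-reference samples onto the main
+         * reference; starting the accumulator at 128 rounds the projection. */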
+ for(k = -1; k > ref_idx; k--)
+ {
+ inv_ang_sum += inv_ang;
+ ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
+ }
+
+
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+ zero_8x16b = _mm_setzero_si128();
+ two_nt_4x32b = _mm_set1_epi32(1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ {
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+ int temp11, temp21, temp31, temp41;
+
+
+ __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+ sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+            /* ref_main_idx = (pos >> 5) + 1; */
+ src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
+
+            /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col = 24-31 */
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+
+ temp11 = _mm_cvtsi128_si32(src_values0);
+ temp21 = _mm_cvtsi128_si32(src_values1);
+ temp31 = _mm_cvtsi128_si32(src_values2);
+ temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* storing four 8-bit pixel values (32 bits) per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+ }
+ }
+}
+
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Intra prediction interpolation filter for luma mode 27 to mode 33
+*
+* @par Description:
+*    Intra prediction for modes 27 to 33 (positive angle, vertical modes). The
+*    neighboring reference samples pointed to by 'pu1_ref' are interpolated to
+*    generate the TU block pointed to by 'pu1_dst'
+*
+* @param[in] pu1_ref
+*  UWORD8 pointer to the reference (neighboring) samples
+*
+* @param[out] pu1_dst
+* UWORD8 pointer to the destination
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] dst_strd
+* integer destination stride
+*
+* @param[in] nt
+* integer Transform Block size
+*
+* @param[in] mode
+* integer intraprediction mode
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
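+
+/* A minimal scalar sketch (illustrative only, compiled out; the function and
+ * variable names here are assumptions, not part of the decoder) of the
+ * interpolation that the SSSE3 code below vectorizes: */
+#if 0
+static void intra_pred_ang_pos_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
+                                      WORD32 dst_strd, WORD32 nt, WORD32 ang)
+{
+    WORD32 row, col, two_nt = 2 * nt;
+    for(row = 0; row < nt; row++)
+    {
+        WORD32 pos = (row + 1) * ang; /* position in 1/32 pel units */
+        WORD32 idx = pos >> 5;        /* integer sample offset */
+        WORD32 fract = pos & 31;      /* 5-bit fractional part */
+        for(col = 0; col < nt; col++)
+        {
+            /* two-tap linear interpolation between adjacent reference samples */
+            pu1_dst[row * dst_strd + col] =
+                (UWORD8)(((32 - fract) * pu1_ref[two_nt + 1 + idx + col] +
+                          fract * pu1_ref[two_nt + 1 + idx + col + 1] + 16) >> 5);
+        }
+    }
+}
+#endif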
+
+
+void ihevc_intra_pred_luma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode)
+{
+ WORD32 row;
+ WORD32 two_nt;
+ WORD32 intra_pred_ang;
+
+ __m128i temp11, temp12, temp13, temp14;
+
+ __m128i const_temp_8x16b;
+ __m128i temp1, temp2, temp3, temp4, sm3;
+ UNUSED(src_strd);
+ two_nt = 2 * nt;
+ intra_pred_ang = gai4_ihevc_ang_table[mode];
+
+ const_temp_8x16b = _mm_set1_epi16(16);
+ sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
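+    /* sm3 presumably interleaves adjacent reference bytes into (ref[x], ref[x + 1])
+     * pairs so a single _mm_maddubs_epi16 can apply the packed weight pairs
+     * (an interpretation of how the mask is used below). */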
+ if(nt == 32)
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+ int col = 0;
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + two_nt + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+ for(col = 0; col < nt; col += 16)
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing sixteen 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing sixteen 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+
+ }
+ else if(nt == 16) /* for nt = 16 case */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+ const_temp8_4x32b = _mm_set1_epi16(8);
+
+ two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ for(row = 0; row < nt; row += 8)
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + two_nt + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ {
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp1);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp2);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp3);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing sixteen 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/
+
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp11);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp12);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp13);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp14);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values4);
+ src_values1 = _mm_packus_epi16(src_values1, src_values5);
+ src_values2 = _mm_packus_epi16(src_values2, src_values6);
+ src_values3 = _mm_packus_epi16(src_values3, src_values7);
+
+                /* storing sixteen 8-bit pixel values per row */
+ _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/
+
+ }
+ pu1_dst += 8 * dst_strd;
+ }
+
+ }
+ else if(nt == 8)
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+ __m128i row_4x32b, two_nt_4x32b, src_values12;
+
+
+ const_temp2_4x32b = _mm_set1_epi16(31);
+ const_temp3_4x32b = _mm_set1_epi16(32);
+
+ two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+
+ row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+        /* nt == 8: a single pass computes all 8 rows at once */
+ {
+
+ WORD16 ref_main_idx[9];
+
+ __m128i res_temp5_4x32b;
+ __m128i fract1_8x16b, fract2_8x16b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i src_values4, src_values5, src_values6, src_values7;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+
+            /* ref_main_idx = (pos >> 5) + two_nt + 1; */
+            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
+
+            /* fract = pos & (31); */
+            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55);
+ temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff);
+
+ temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55);
+ temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+ temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff);
+
+ _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); /* col = 24-31 */
+ src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); /* col = 32-39 */
+ src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); /* col = 40-47 */
+ src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); /* col = 48-55 */
+ src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); /* col = 56-63*/
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+ src_values4 = _mm_shuffle_epi8(src_values4, sm3);
+ src_values5 = _mm_shuffle_epi8(src_values5, sm3);
+ src_values6 = _mm_shuffle_epi8(src_values6, sm3);
+ src_values7 = _mm_shuffle_epi8(src_values7, sm3);
+
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+ src_values4 = _mm_maddubs_epi16(src_values4, temp11);
+ src_values5 = _mm_maddubs_epi16(src_values5, temp12);
+ src_values6 = _mm_maddubs_epi16(src_values6, temp13);
+ src_values7 = _mm_maddubs_epi16(src_values7, temp14);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+ src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
+ src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
+ src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
+ src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+ src_values4 = _mm_srai_epi16(src_values4, 5);
+ src_values5 = _mm_srai_epi16(src_values5, 5);
+ src_values6 = _mm_srai_epi16(src_values6, 5);
+ src_values7 = _mm_srai_epi16(src_values7, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+ src_values4 = _mm_packus_epi16(src_values4, src_values5);
+ src_values6 = _mm_packus_epi16(src_values6, src_values7);
+ src_values5 = _mm_srli_si128(src_values4, 8);
+ src_values7 = _mm_srli_si128(src_values6, 8);
+
+            /* storing eight 8-bit pixel values per row */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/
+ }
+
+ }
+    else /* nt == 4 */
+ {
+
+ __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b;
+ __m128i src_values10, src_values11, intra_pred_ang_4x32b;
+
+ __m128i row_4x32b, two_nt_4x32b, src_values12, sign_8x16b;
+
+
+ const_temp2_4x32b = _mm_set1_epi32(31);
+ const_temp3_4x32b = _mm_set1_epi32(32);
+ zero_8x16b = _mm_setzero_si128();
+ two_nt_4x32b = _mm_set1_epi32(two_nt + 1);
+
+
+ /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
+ row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
+ intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
+ {
+ int temp11, temp21, temp31, temp41;
+
+ WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
+
+ __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
+ __m128i src_values0, src_values1, src_values2, src_values3;
+ __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
+
+ /* pos = ((row + 1) * intra_pred_ang); */
+ res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
+ sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
+
+            /* ref_main_idx = (pos >> 5) + two_nt + 1; */
+ src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
+
+ ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
+ ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
+ ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
+ ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
+ ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
+ ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
+ ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
+
+            /* fract = pos & (31); */
+ src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
+
+ /*(32 - fract) */
+ src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
+
+ fract1_8x16b = _mm_slli_epi16(src_values11, 8);
+ fract2_8x16b = _mm_slli_epi16(src_values10, 8);
+
+ src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
+ src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
+
+ fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
+ fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
+
+ temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00);
+ temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa);
+ temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00);
+ temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa);
+
+ src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */
+ src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */
+ src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */
+ src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */
+
+ src_values0 = _mm_shuffle_epi8(src_values0, sm3);
+ src_values1 = _mm_shuffle_epi8(src_values1, sm3);
+ src_values2 = _mm_shuffle_epi8(src_values2, sm3);
+ src_values3 = _mm_shuffle_epi8(src_values3, sm3);
+
+ src_values0 = _mm_maddubs_epi16(src_values0, temp1);
+ src_values1 = _mm_maddubs_epi16(src_values1, temp2);
+ src_values2 = _mm_maddubs_epi16(src_values2, temp3);
+ src_values3 = _mm_maddubs_epi16(src_values3, temp4);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
+ src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
+ src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
+ src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
+ src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
+
+ /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
+ src_values0 = _mm_srai_epi16(src_values0, 5);
+ src_values1 = _mm_srai_epi16(src_values1, 5);
+ src_values2 = _mm_srai_epi16(src_values2, 5);
+ src_values3 = _mm_srai_epi16(src_values3, 5);
+
+ /* converting 16 bit to 8 bit */
+ src_values0 = _mm_packus_epi16(src_values0, src_values1);
+ src_values2 = _mm_packus_epi16(src_values2, src_values3);
+ src_values1 = _mm_srli_si128(src_values0, 8);
+ src_values3 = _mm_srli_si128(src_values2, 8);
+
+ temp11 = _mm_cvtsi128_si32(src_values0);
+ temp21 = _mm_cvtsi128_si32(src_values1);
+ temp31 = _mm_cvtsi128_si32(src_values2);
+ temp41 = _mm_cvtsi128_si32(src_values3);
+
+            /* storing four 8-bit pixel values (32 bits) per row */
+ *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
+ *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
+ *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
+ *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
+
+ }
+ }
+}
diff --git a/common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c b/common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
new file mode 100644
index 0000000..63cc1ef
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
@@ -0,0 +1,3340 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_16x16_ssse3_intr.c
+ *
+ * @brief
+ *  Contains function definitions for inverse transform and
+ *  reconstruction
+ *
+ * @author
+ * 100470
+ * 100592 (edited by)
+ *
+ * @par List of Functions:
+ *    - ihevc_itrans_recon_16x16_ssse3()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+
+
+#include <immintrin.h>
+#include <emmintrin.h>
+
+#include <tmmintrin.h>
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform and reconstruction for a
+ *  16x16 input block
+ *
+ * @par Description:
+ *  Performs the inverse transform, adds the prediction data and clips
+ *  the output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 16x16 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 16x16 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 16x16 block
+ *
+ * @param[out] pu1_dst
+ * Output 16x16 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
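+
+/* A minimal scalar sketch (illustrative only, compiled out; names are
+ * assumptions) of the 16-point even-part butterfly that the intrinsics below
+ * implement. s[0..7] holds even input rows 0, 2, ..., 14 of one column;
+ * e[0..7] is the even part of the 1D inverse transform output: */
+#if 0
+static void itrans16_even_sketch(const WORD32 *s, WORD32 *e)
+{
+    WORD32 ee[4], eo[4], k;
+    /* eee/eeo from input rows 0, 8 and 4, 12 */
+    WORD32 eee0 = 64 * s[0] + 64 * s[4];
+    WORD32 eee1 = 64 * s[0] - 64 * s[4];
+    WORD32 eeo0 = 83 * s[2] + 36 * s[6];
+    WORD32 eeo1 = 36 * s[2] - 83 * s[6];
+    ee[0] = eee0 + eeo0;
+    ee[3] = eee0 - eeo0;
+    ee[1] = eee1 + eeo1;
+    ee[2] = eee1 - eeo1;
+    /* eo from input rows 2, 6, 10, 14 with the 89/75/50/18 basis */
+    eo[0] = 89 * s[1] + 75 * s[3] + 50 * s[5] + 18 * s[7];
+    eo[1] = 75 * s[1] - 18 * s[3] - 89 * s[5] - 50 * s[7];
+    eo[2] = 50 * s[1] - 89 * s[3] + 18 * s[5] + 75 * s[7];
+    eo[3] = 18 * s[1] - 50 * s[3] + 75 * s[5] - 89 * s[7];
+    for(k = 0; k < 4; k++)
+    {
+        e[k] = ee[k] + eo[k];
+        e[7 - k] = ee[k] - eo[k];
+    }
+}
+#endif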
+
+void ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_26;
+ __m128i m_temp_reg_27;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+ __m128i m_rdng_factor;
+ __m128i m_count;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+ __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+ WORD32 i;
+/*Lokesh*/
+ WORD32 zero_last8_cols_stg1;
+ WORD32 zero_last8_rows_stg1;
+ WORD32 zero_last12_rows_stg1;
+ WORD32 zero_last12_rows_stg2;
+ WORD32 zero_last8_rows_stg2;
+
+ WORD32 loop = 0;
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ WORD32 trans_size = TRANS_SIZE_16;
+
+
+
+
+
+    /* The last 8 cols/rows of the 16x16 block are skipped based on the flags below */
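+    /* Interpretation: bit k of zero_cols / zero_rows marks column / row k of
+     * pi2_src as all-zero; e.g. zero_cols == 0xFF00 means columns 8-15 are
+     * zero, so the second half of the first stage can be skipped. */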
+
+ zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
+ zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
+ zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
+
+ zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
+ zero_last8_rows_stg2 = zero_last8_cols_stg1;
+ if(zero_last8_cols_stg1)
+ {
+ loop = 1;
+ }
+    else
+    {
+        loop = 2;
+    }
+
+ /* i = 0 => lower 8 samples */
+ /* i = 1 => higher 8 samples */
+ for(i = 0; i < loop; i++)
+ {
+ {
+ WORD32 sample_half_index = i << 3;
+ WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
+ WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+
+
+
+
+ /* If last 12 rows are zero : Rishab */
+ if(zero_last12_rows_stg1)
+ {
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+
+                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //sign mask of row 0 (m_temp_reg_77 holds an all-zero row here)
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+ m_temp_reg_26 = m_temp_reg_24;
+ m_temp_reg_27 = m_temp_reg_25;
+ }
+
+ /* eo */
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+ }
+        /* If the last 8 rows are zero */
+ else if(zero_last8_rows_stg1)
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ }
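+
+            /* Per the coefficient pairs above, eeo[0] = 83*s4 + 36*s12 and
+               eeo[1] = 36*s4 - 83*s12; rows 8-15 (including s12) are zero in
+               this branch, so this reduces to 83*s4 and 36*s4 */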
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //sign mask of row 0 (m_temp_reg_77 holds zeros in this branch)
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_temp_reg_26 = m_temp_reg_24;
+ m_temp_reg_27 = m_temp_reg_25;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+ }
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+        } /* If all the rows are non-zero */
+ else
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+
+ {
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+ }
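+                /* eee[0] = 64*(s0 + s8) and eee[1] = 64*(s0 - s8) via the
+                   (64,64) and (64,-64) coefficient pairs above */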
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+
+ }
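+
+            /* Butterfly: e[0] = ee[0] + eo[0] and e[7] = ee[0] - eo[0];
+               the same add/sub pattern repeats for eo1..eo3 below */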
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+                /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+ }
+
+ }
+ }
+
+ {
+ WORD32 sample_half_index = i << 3;
+ WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ }
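+
+        /* m_temp_reg_70..77 now hold the odd input rows 1,3,5,...,15 of the
+           current 8-column half (start at pi2_src + src_strd, step 2*src_strd) */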
+
+ /* o & stage 1 out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = trans_size << 1;
+
+ if(zero_last12_rows_stg1)
+ {
+ for(j = 0; j < 2; j++)
+ {
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
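+
+                    /* Each o-block below repeats this butterfly: load the even
+                       part e, form (e +/- o + rnd) >> shift and pack back to
+                       16 bits before storing */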
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+ }
+ }
+ else if(zero_last8_rows_stg1)
+ {
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+ }
+
+ }
+ else
+ {
+
+ for(j = 0; j < 2; j++)
+ {
+                    if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
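+
+                    /* o[0] = 90*s1 + 87*s3 + 80*s5 + 70*s7 + 57*s9 + 43*s11
+                       + 25*s13 + 9*s15, accumulated from the four madd results
+                       above; the remaining o[k] use the coefficient rows noted
+                       in the comments */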
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+ }
+ }
+ }
+
+ /* Transpose */
+ {
+ WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
+ WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = (trans_size << 1);
+ WORD32 j;
+
+ for(j = 0; j < 2; j++)
+ {
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+ pi2_src_scratch += in_stride;
+ m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+ pi2_src_scratch += in_stride;
+ m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+ pi2_src_scratch += 8;
+ m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+ pi2_src_scratch += 8;
+
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+ m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+ m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+ m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+ m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+ m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+ m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+ m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+ m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+ m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+ m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+
+ m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+ m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+
+ m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+ m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+
+ m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+ m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
+ pi2_dst_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
+ pi2_dst_scratch += 8;
+ }
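+
+            /* The 16/32/64-bit unpack cascade above transposes the stage-1
+               output tile so stage 2 can again operate on columns */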
+ }
+ }
+
+ if(zero_last8_cols_stg1)
+ {
+ WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 j;
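+
+        /* The last 8 columns of the input are zero, so the second half of the
+           transposed buffer is simply cleared instead of being computed */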
+
+ m_temp_reg_40 = _mm_setzero_si128();
+ for(j = 0; j < 2; j++)
+ {
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += 8;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch -= out_stride;
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
+ pi2_dst_scratch += 8;
+ }
+ }
+
+
+ /* Stage 2 */
+ for(i = 0; i < 2; i++)
+ {
+ WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
+ WORD32 stride = (trans_size);
+ MEM_ALIGN16 WORD16 temp_array[256];
+
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ if(zero_last12_rows_stg2)
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+
+                    if(!i)
+                    {
+                        pi2_src_temp += (stride * 6 + 8);
+                    }
+                    else
+                    {
+                        pi2_src_temp += (stride * 2 + 8);
+                    }
+
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+ m_temp_reg_20 = _mm_setzero_si128();
+ m_temp_reg_22 = _mm_setzero_si128();
+
+ m_temp_reg_21 = _mm_setzero_si128();
+ m_temp_reg_23 = _mm_setzero_si128();
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+                    /* Loading coeff and src for use in next block */
+                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70); //sign mask of row 0 (m_temp_reg_20 is zero)
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+ m_temp_reg_26 = m_temp_reg_24;
+ m_temp_reg_27 = m_temp_reg_25;
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
+ }
+
+ /* eo */
+ {
+ WORD16 *pi2_scratch = temp_array;
+ WORD32 out_stride = 8;
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+                    /* ee[0] and ee[3] equal eee[0] here (eeo is zero), held in m_temp_reg_24-25 */
+
+ /* e[0][0-3] stored in pu1_dst[0] */
+ /* e[7][0-3] stored in pu1_dst[1] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+                    /* ee[0] and ee[3] equal eee[0] here (eeo is zero), held in m_temp_reg_24-25 */
+
+ /* e[0][4-7] stored in pu1_dst[2] */
+ /* e[7][4-7] stored in pu1_dst[3] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+                    /* ee[1] and ee[2] equal eee[1] here (eeo is zero), held in m_temp_reg_26-27 */
+
+ /* e[1][0-3] stored in pu1_dst[4] */
+ /* e[6][0-3] stored in pu1_dst[5] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+                    /* ee[1] and ee[2] equal eee[1] here (eeo is zero), held in m_temp_reg_26-27 */
+
+ /* e[1][4-7] stored in pu1_dst[6]*/
+ /* e[6][4-7] stored in pu1_dst[7] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in pu1_dst[8]*/
+ /* e[5][0-3] stored in pu1_dst[9] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ /* e[2][4-7] stored in pu1_dst[10]*/
+ /* e[5][4-7] stored in pu1_dst[11] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pu1_dst[12]*/
+ /* e[4][0-3] stored in pu1_dst[13] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ /* e[3][4-7] stored in pu1_dst[14]*/
+ /* e[4][4-7] stored in pu1_dst[15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ }
+ }
+ else if(zero_last8_rows_stg2)
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+ pi2_src_temp += (stride * 8);
+
+ if(!i)
+ {
+ pi2_src_temp += (stride * 6 + 8);
+ }
+ else
+ {
+ pi2_src_temp += (stride * 2 + 8);
+ }
+
+ pi2_src_temp -= (stride * 8);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+
+ m_temp_reg_76 = _mm_setzero_si128();
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
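+ /* Each row of g_ai2_ihevc_trans_16_even packs a coefficient pair, so
+ * madd on the 16-bit interleaved rows yields a*c0 + b*c1 in every
+ * 32-bit lane, i.e. one butterfly term per output sample. */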
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 interleaved with zero LSB's (row 12 is zero in this path)
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 interleaved with zero MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+
+
+ m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
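+ /* m_temp_reg_77 is the per-lane sign mask of row 0 (0 > x), so
+ * interleaving row 0 with it sign-extends the 16-bit samples to 32
+ * bits; the shift by 6 then applies the DC weight of 64 without a
+ * multiply. */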
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
+
+ m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
+ m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+
+ m_temp_reg_26 = m_temp_reg_24;
+ m_temp_reg_27 = m_temp_reg_25;
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ }
+
+ /* eo */
+ {
+ WORD16 *pi2_scratch = temp_array;
+ WORD32 out_stride = 8;
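+ /* Each 128-bit store below writes four 32-bit partial sums, so the
+ * WORD16 scratch pointer advances by out_stride = 8 (16 bytes) per
+ * store. */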
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+ /* e[0][0-3] stored in scratch row 0 */
+ /* e[7][0-3] stored in scratch row 1 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in scratch row 2 */
+ /* e[7][4-7] stored in scratch row 3 */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in scratch row 4 */
+ /* e[6][0-3] stored in scratch row 5 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in scratch row 6 */
+ /* e[6][4-7] stored in scratch row 7 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ /* e[2][0-3] stored in scratch row 8 */
+ /* e[5][0-3] stored in scratch row 9 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ /* e[2][4-7] stored in scratch row 10 */
+ /* e[5][4-7] stored in scratch row 11 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in scratch row 12 */
+ /* e[4][0-3] stored in scratch row 13 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ /* e[3][4-7] stored in scratch row 14 */
+ /* e[4][4-7] stored in scratch row 15 */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+ }
+ }
+ else
+ {
+ /* eeo */
+ /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
+ /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
+ {
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
+ pi2_src_temp += (stride * 7);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
+ pi2_src_temp += (stride);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
+ if(!i)
+ {
+ pi2_src_temp += (stride * 6 + 8);
+ }
+ else
+ {
+ pi2_src_temp += (stride * 2 + 8);
+ }
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
+ pi2_src_temp -= (stride);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
+ pi2_src_temp -= (stride * 7);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ }
+
+ /* eee */
+ /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
+ /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
+ {
+ /* Loading coeff and src for use in next block */
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
+
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
+
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
+
+ }
+
+ /* eo */
+ {
+ WORD16 *pi2_scratch = temp_array;
+ WORD32 out_stride = 8;
+
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
+
+
+ /* e[0][0-3] stored in pi2_tmp[0][0-7] */
+ /* e[7][0-3] stored in pi2_tmp[0][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+
+ }
+
+ /* eo0[4-7] */
+ {
+
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
+
+ /* e[0][4-7] stored in pi2_tmp[1][0-7] */
+ /* e[7][4-7] stored in pi2_tmp[1][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
+
+ }
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
+
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
+
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += out_stride;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += out_stride;
+ }
+ }
+ }
+
+ if(zero_last12_rows_stg2)
+ {
+ /* o & stage 2 pre-transposed out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = temp_array;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+ WORD32 out_stride = (trans_size);
+ WORD32 in_stride = (8) * 4;
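+ /* The even-part sums above were stored as 128-bit vectors of four
+ * 32-bit values, 8 WORD16s apart; in_stride of 32 WORD16s therefore
+ * steps over four such vectors between the e[] terms read back here. */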
+
+ pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+
+ pi2_src_temp += (stride * 9);
+
+ if(0 == i)
+ {
+ pi2_src_temp -= (stride * 2 - 8);
+ }
+ else
+ {
+ pi2_src_temp -= (stride * 6 - 8);
+ }
+ pi2_src_temp -= (stride * 9);
+
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+
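+ /* Each o_k[0-3] block below computes one odd-part term via madd,
+ * reads back the matching even-part sum e from scratch, and packs the
+ * rounded (e + o) and (e - o) results into one pre-transposed store. */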
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
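+ /* Adding (1 << (i4_shift - 1)) before the arithmetic right shift
+ * makes the descale round to nearest. */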
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+
+ }
+ }
+ }
+ else if(zero_last8_rows_stg2)
+ {
+ /* o & stage 2 pre-transposed out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = temp_array;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+ WORD32 out_stride = (trans_size);
+ WORD32 in_stride = (8) * 4;
+
+ pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+ pi2_src_temp += (stride * 8);
+
+ if(0 == i)
+ {
+ pi2_src_temp -= (stride * 2 - 8);
+ }
+ else
+ {
+ pi2_src_temp -= (stride * 6 - 8);
+ }
+
+ pi2_src_temp -= (stride * 8);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+ }
+ }
+ }
+ else
+ {
+ /* o & stage 2 pre-transposed out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = temp_array;
+ WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
+ WORD32 out_stride = (trans_size);
+ WORD32 in_stride = (8) * 4;
+
+ pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
+ pi2_src_temp += (stride);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
+ pi2_src_temp += (stride * 7);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
+ pi2_src_temp += (stride);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
+ if(0 == i)
+ {
+ pi2_src_temp -= (stride * 2 - 8);
+ }
+ else
+ {
+ pi2_src_temp -= (stride * 6 - 8);
+ }
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
+ pi2_src_temp -= (stride);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
+ pi2_src_temp -= (stride * 7);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
+ pi2_src_temp -= (stride);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
+
+
+ for(j = 0; j < 2; j++)
+ {
+
+ if(j) //H8B = higher 8 bytes, L8B = lower 8 bytes
+ {
+ m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
+ m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
+ m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
+ m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
+ }
+ else
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
+
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += ((!i) * out_stride + 8);
+ }
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
+
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
+
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += (i * out_stride + 8);
+ }
+
+ }
+ }
+ }
+ }
+
+ /* Transpose */
+ {
+ WORD16 *pi2_src_scratch;
+ UWORD8 *pu1_pred_temp = pu1_pred;
+ WORD32 out_stride = dst_strd;
+ WORD32 in_stride = trans_size;
+ WORD32 j;
+ m_temp_reg_1 = _mm_setzero_si128();
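+ /* m_temp_reg_1 stays zero so that unpacking the 8-bit prediction
+ * against it widens it to 16 bits; the 16-/32-/64-bit unpack cascade
+ * below transposes the 16-bit results, after which the prediction is
+ * added and the sums are saturated back to 8 bits with packus. */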
+ for(i = 0; i < 2; i++)
+ {
+ pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
+
+ for(j = 0; j < 2; j++)
+ {
+ m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
+ pi2_src_scratch += ((!i) * in_stride + 8);
+ m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
+ pi2_src_scratch += (in_stride);
+ m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
+ pi2_src_scratch += (i * in_stride + 8);
+ m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
+ pi2_src_scratch += in_stride;
+ m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
+ pi2_src_scratch += ((!i) * in_stride + 8);
+ m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
+ pi2_src_scratch += in_stride;
+ m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
+ pi2_src_scratch += (i * in_stride + 8);
+
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
+
+
+ m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
+ m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
+
+ m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
+ m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
+
+ m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
+ m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
+
+ m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
+ m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
+
+
+ m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+ m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+
+ m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+ m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
+ m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
+ m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+
+ m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+ m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
+ m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
+ m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+
+ m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
+
+ m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
+ m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
+ m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
+
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred_temp += pred_strd;
+ }
+ }
+ }
+}
diff --git a/common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c b/common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
new file mode 100644
index 0000000..1883758
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
@@ -0,0 +1,6628 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_32x32_ssse3_intr.c
+ *
+ * @brief
+ * Contains function definitions for inverse
+ * transform and reconstruction
+ *
+ * @author
+ * 100470
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_32x32_ssse3()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse transform and
+ * reconstruction for a 32x32 input block
+ *
+ * @par Description:
+ * Performs inverse transform and adds the
+ * prediction data and clips output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 32x32 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 32x32 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ * Output 32x32 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ * Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
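+/* Usage sketch (illustrative; buffer names are hypothetical): with a
+ * 16-byte aligned coefficient block, a sufficiently large 16-byte aligned
+ * scratch buffer, and prediction/destination blocks, a call could look like
+ *
+ *     ihevc_itrans_recon_32x32_ssse3(pi2_coeffs, pi2_scratch, pu1_pred_blk,
+ *                                    pu1_out_blk, 32, pred_strd, dst_strd,
+ *                                    zero_cols, zero_rows);
+ *
+ * zero_cols and zero_rows carry one bit per column/row of pi2_coeffs, set
+ * when that column/row is entirely zero. */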
+
+void ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ /* Inverse Transform */
+
+ WORD32 j;
+
+
+ WORD16 *pi2_tmp_orig;
+
+
+ /*MEM_ALIGN16 WORD32 temp_array[1024];
+ MEM_ALIGN16 WORD16 temp1_array[1024];*/
+ WORD16 *o_temp_ptr;
+ WORD16 *temp_ptr;
+
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_16;
+ __m128i m_temp_reg_17;
+ __m128i m_temp_reg_18;
+ __m128i m_temp_reg_19;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+
+ __m128i m_temp_reg_80;
+ __m128i m_temp_reg_81;
+ __m128i m_temp_reg_82;
+ __m128i m_temp_reg_83;
+ __m128i m_temp_reg_84;
+ __m128i m_temp_reg_85;
+ __m128i m_temp_reg_86;
+ __m128i m_temp_reg_87;
+
+ __m128i m_temp_reg_90;
+ __m128i m_temp_reg_91;
+ __m128i m_temp_reg_92;
+ __m128i m_temp_reg_93;
+ __m128i m_temp_reg_94;
+ __m128i m_temp_reg_95;
+ __m128i m_temp_reg_96;
+ __m128i m_temp_reg_97;
+
+ __m128i m_rdng_factor;
+ __m128i m_count;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+ __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
+
+ __m128i temp1, temp2, temp3, temp4;
+ __m128i temp5, temp6, temp7, temp8;
+
+ __m128i all_zero_reg;
+ WORD32 i;
+
+ WORD32 zero_last24_cols_stg1;
+ WORD32 zero_last24_rows_stg1;
+ WORD32 zero_last28_rows_stg1;
+
+ WORD32 zero_last28_rows_stg2;
+ WORD32 zero_last24_rows_stg2;
+
+ WORD32 trans_size_stg1;
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ WORD32 trans_size = TRANS_SIZE_32;
+
+
+    /* Flags to skip processing of the all-zero trailing rows / columns of the 32x32 block */
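+    /* Bit i of zero_cols / zero_rows indicates that column / row i of
+       pi2_src is all zero; e.g. a mask of 0xFFFFFF00 means only the first
+       8 columns can hold non-zero coefficients */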
+ zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+ zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
+ zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+
+ zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
+ zero_last24_rows_stg2 = zero_last24_cols_stg1;
+
+    if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
+    {
+        trans_size_stg1 = 8;
+    }
+    else
+    {
+        trans_size_stg1 = 32;
+    }
+
+ all_zero_reg = _mm_setzero_si128();
+
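+    /* Scratch layout in pi2_tmp (assumed): [0..1023] holds 32-bit even-part
+       partial sums for the current column group, [1024..2047] the packed
+       16-bit stage-1 rows, and [2048..] the stage-1 output consumed by the
+       second stage */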
+ o_temp_ptr = pi2_tmp;
+ temp_ptr = (pi2_tmp + 1024);
+
+ pi2_tmp += 2048;
+ pi2_tmp_orig = pi2_tmp;
+
+ for(i = 0; i < trans_size_stg1; i += 8)
+ {
+
+
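+        /* Load 8 coefficients of each even input row (rows 0, 2, ..., 30);
+           the odd rows are loaded after the even part is done */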
+ {
+ WORD16 *pi2_tmp_src = pi2_src;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+
+ m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ }
+
+ if(zero_last28_rows_stg1)
+ {
+            /* eeo: the eeo and eeeo inputs are all zero in this path; only
+               the row-0 (eeee) term of the even part survives */
+ {
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
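+
+                /* interleaving with zeros makes _mm_madd_epi16 act as a
+                   16x16 -> 32 multiply: each 32-bit lane receives
+                   64 * src_row0 */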
+
+                /* With rows 4..31 zero, the eeo and eeeo terms vanish and
+                   eeee[1] loses its row-16 contribution, so all four eee
+                   values reduce to 64 * src_row0 (held in m_temp_reg_14) */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = m_temp_reg_14;
+
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+                /* Same reduction for the next four columns of row 0 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = m_temp_reg_14;
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = m_temp_reg_14;
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = m_temp_reg_14;
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = m_temp_reg_14;
+
+
+ }
+ /* eo */
+ {
+ WORD16 *pi2_scratch = o_temp_ptr;
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /**************************************************************************/
+
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo4[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /***********************************************************************/
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo5[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo6[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ }
+ }
+ else if(zero_last24_rows_stg1)
+ {
+ {
+                /* eeo: with rows 8..31 zero the eeeo terms (rows 8, 24)
+                   vanish; rows 0 and 4 feed the ee part, rows 2 and 6 the
+                   eo part */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+            /* With rows 8..31 zero, the eeeo terms vanish and eeee loses
+               its row-16 contribution, so all four eee values reduce to
+               64 * src_row0 (held in m_temp_reg_14) */
+
+            /* eee[0] = eeee[0] + eeeo[0]; */
+            m_temp_reg_40 = m_temp_reg_14;
+
+            /* eee[3] = eeee[0] - eeeo[0]; */
+            m_temp_reg_43 = m_temp_reg_14;
+
+            /* eee[2] = eeee[1] - eeeo[1]; */
+            m_temp_reg_42 = m_temp_reg_14;
+
+            /* eee[1] = eeee[1] + eeeo[1]; */
+            m_temp_reg_41 = m_temp_reg_14;
+
+ /* for row 4 to 7 */
+
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+            /* Same reduction for the next four columns of row 0 */
+
+            /* eee[0] = eeee[0] + eeeo[0]; */
+            m_temp_reg_44 = m_temp_reg_14;
+
+            /* eee[3] = eeee[0] - eeeo[0]; */
+            m_temp_reg_47 = m_temp_reg_14;
+
+            /* eee[2] = eeee[1] - eeeo[1]; */
+            m_temp_reg_46 = m_temp_reg_14;
+
+            /* eee[1] = eeee[1] + eeeo[1]; */
+            m_temp_reg_45 = m_temp_reg_14;
+
+
+ /* eeo[] */
+ /* for(k = 0; k < 4; k++) */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+            m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
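+
+            /* row-4 samples interleaved with zeros: _mm_madd_epi16 then
+               yields coeff * sample directly in each 32-bit lane */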
+
+ m_temp_reg_33 = _mm_setzero_si128();
+
+ /* eeo */
+ {
+ /* eeo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_90 = m_temp_reg_34;
+ m_temp_reg_97 = m_temp_reg_35;
+ }
+ /* eeo0[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ m_temp_reg_91 = m_temp_reg_34;
+ m_temp_reg_96 = m_temp_reg_35;
+
+ }
+
+ /* eeo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ /* e[1][0-3] stored in pi2_tmp[2][0-7] */
+ /* e[6][0-3] stored in pi2_tmp[2][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ m_temp_reg_92 = m_temp_reg_34;
+ m_temp_reg_95 = m_temp_reg_35;
+
+ }
+
+            /* eeo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
+
+ /* e[1][4-7] stored in pi2_tmp[3][0-7] */
+ /* e[6][4-7] stored in pi2_tmp[3][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ m_temp_reg_93 = m_temp_reg_34;
+ m_temp_reg_94 = m_temp_reg_35;
+
+
+ }
+
+            /* eeo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ temp1 = m_temp_reg_34;
+ temp7 = m_temp_reg_35;
+
+ }
+
+            /* eeo2[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ temp2 = m_temp_reg_34;
+ temp6 = m_temp_reg_35;
+
+ }
+
+            /* eeo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ temp3 = m_temp_reg_34;
+ temp5 = m_temp_reg_35;
+
+ }
+
+
+            /* eeo3[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+
+ temp4 = m_temp_reg_34;
+ temp8 = m_temp_reg_35;
+
+
+ }
+            /* All ee[] values computed; they are held in registers
+               (m_temp_reg_90..97 and temp1..temp8) rather than pi2_tmp */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+
+ }
+ }
+ /* eo */
+ {
+ WORD16 *pi2_scratch = o_temp_ptr;
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo0[4-7] */
+ {
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+
+ /* eo2[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo2[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /**************************************************************************/
+
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+
+ /* eo3[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo3[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo4[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /***********************************************************************/
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+
+ /* eo5[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo5[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo6[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+
+ /* eo7[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ }
+
+ }
+ else
+ {
+
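+        /* General path: all 16 even input rows may carry coefficients, so
+           the full eeee/eeeo, eeo and eo butterflies are computed with
+           packed coefficient pairs */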
+ {
+ /* eeo */
+            /* eeeo[0] computed into m_temp_reg_20, eeeo[1] into m_temp_reg_22 */
+            /* eeee[0] computed into m_temp_reg_21, eeee[1] into m_temp_reg_23 */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
+
+
+                /* eeeo[0] = m_temp_reg_20 */
+                /* eeeo[1] = m_temp_reg_22 */
+                /* eeee[0] = m_temp_reg_21 */
+                /* eeee[1] = m_temp_reg_23 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+ /* for row 4 to 7 */
+
+ m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+ m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+
+ /* Interleaving row 8 and row 24*/
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
+
+
+                /* eeeo[0] = m_temp_reg_20 */
+                /* eeeo[1] = m_temp_reg_22 */
+                /* eeee[0] = m_temp_reg_21 */
+                /* eeee[1] = m_temp_reg_23 */
+
+                /* eee[0] = eeee[0] + eeeo[0]; */
+                m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[3] = eeee[0] - eeeo[0]; */
+                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+                /* eee[2] = eeee[1] - eeeo[1]; */
+                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);
+
+                /* eee[1] = eeee[1] + eeeo[1]; */
+                m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);
+
+
+ // eeo[]
+ /* for(k = 0; k < 4; k++) */
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+ /* eeo */
+ {
+ /* eeo0[0-3] */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ }
+
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+ m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+ m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+ m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+
+ /* eeo0[4-7] */
+ {
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
+ m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
+
+ }
+
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
+
+ /* eeo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
+
+ m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+ m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+ /* eeo1[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
+
+ m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+ }
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
+
+ /* eeo2[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ /* e[2][0-3] stored in pi2_tmp[4][0-7] */
+ /* e[5][0-3] stored in pi2_tmp[4][8-15] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
+
+ temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+ /* eeo2[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+ /* e[2][4-7] stored in pi2_tmp[5][0-7] */
+ /* e[5][4-7] stored in pi2_tmp[5][8-15] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
+
+ temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
+
+ /* eeo3[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ /* e[3][0-3] stored in pi2_tmp[6][0-7] */
+ /* e[4][0-3] stored in pi2_tmp[6][8-15] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
+
+ temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+
+ }
+
+ /* eeo3[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
+
+ /* e[3][4-7] stored in pi2_tmp[7][0-7] */
+ /* e[4][4-7] stored in pi2_tmp[7][8-15] */
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
+ temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
+ temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
+
+ }
+
+
+                /* All ee[] values computed; held in registers
+                   (m_temp_reg_90..97 and temp1..temp8) */
+
+ /* for(k = 0; k < 8; k++) */
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+ }
+ }
+ /* eo */
+ {
+ WORD16 *pi2_scratch = o_temp_ptr;
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+ m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+ m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+ m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+ m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+ m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+ m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /* eo0[4-7] */
+ {
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
+ m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
+
+ /* eo1[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /* eo1[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo2[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+ /**************************************************************************/
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo3[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
+
+ /* eo4[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo4[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ /***********************************************************************/
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo5[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
+
+ /* eo6[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo6[4-7] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
+
+ /* eo7[0-3] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+
+ /* eo7[4-7] */
+ {
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
+
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
+ pi2_scratch += 8;
+ _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
+ pi2_scratch += 8;
+
+ }
+
+ }
+
+ }
+ /* All e[] are done */
+ /****************************/
+
+
+ {
+
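+            /* Load 8 coefficients of each odd input row (rows 1, 3, ..., 31)
+               for the o[] stage */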
+ WORD16 *pi2_tmp_src = pi2_src + src_strd;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+
+ m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ pi2_tmp_src += (src_strd << 1);
+ m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
+ }
+
+ if(zero_last28_rows_stg1)
+ {
+ /* o & stage 1 out */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = o_temp_ptr;
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = trans_size;
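+
+            /* e[] scratch is read back in the interleaved (add, sub) vector
+               order it was stored in: in_stride steps over one complete
+               e[k] group, out_stride spaces the packed stage-1 rows */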
+
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ }
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
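+                    /* round-to-nearest right shift:
+                       (x + (1 << (shift - 1))) >> shift, replicated across
+                       all four 32-bit lanes; _mm_packs_epi32 then saturates
+                       the stage-1 result to 16 bits */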
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+ /* o1[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+ /* o2[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+ /* o5[0-3] */
+ {
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
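+ /*
+ * Scratch addressing zig-zags: the pointers advance for o0..o6,
+ * step over by 8 at o7, then walk back for o8..o14 and step over
+ * again at o15, matching the order in which the even-part values
+ * were laid out.
+ */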
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ }
+ else if(zero_last24_rows_stg1)
+ {
+ /* odd (o) butterfly with even-part scratch: stage 1 output */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = o_temp_ptr;
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = trans_size;
+
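+ /*
+ * Two passes over the same coefficient registers: pass 0 consumes
+ * the low 64 bits (first four result columns); pass 1 shifts each
+ * register down by 8 bytes so the same unpacklo/madd path handles
+ * the next four.
+ */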
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+ }
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+
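+ /*
+ * zero_last24_rows_stg1 case: only the first eight input rows can
+ * be non-zero, so the odd sum needs just rows 1,3 and 5,7 - two
+ * madds per output instead of the eight used in the general path.
+ */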
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ }
+ else
+ {
+ /* odd (o) butterfly with even-part scratch: stage 1 output */
+ {
+ WORD32 j;
+ WORD16 *pi2_src_scratch = o_temp_ptr;
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = (trans_size << 1);
+ WORD32 in_stride = trans_size;
+
+
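+ /*
+ * General case: all sixteen odd rows (1,3,...,31) may be non-zero;
+ * each output therefore accumulates eight madd results, with the
+ * per-output sign pattern folded into the add/sub ordering.
+ */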
+ for(j = 0; j < 2; j++)
+ {
+ if(j)
+ {
+ m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
+ m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
+ m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
+ m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
+ m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
+ m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
+ m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
+ m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
+
+ m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
+ m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
+ m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
+ m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
+ m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
+ m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
+ m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
+ m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+ temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+ temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+ temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+ temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
+
+
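+ /*
+ * o0 sums its eight madd terms pairwise (20+21, 22+23, then the
+ * 40..43 tree) before the butterfly; subsequent outputs differ
+ * only in coefficient rows and in where a subtract replaces an
+ * add to carry the coefficient sign.
+ */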
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch -= out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
+ }
+ /* Transpose */
+ {
+ WORD16 *pi2_src_scratch = temp_ptr;
+ WORD16 *pi2_dst_scratch = pi2_tmp;
+ WORD32 in_stride = (trans_size << 1);
+
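+        /* 16x16 transpose of 16-bit samples built from three unpack stages
+         * (epi16 -> epi32 -> epi64). The operand order is swapped in the
+         * unpackhi steps and in some later stages: the high halves of each
+         * scratch row hold the "subtract" outputs of the butterfly, which
+         * appear to belong to the mirrored column positions, so the swaps
+         * restore ascending column order on store. */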
+ for(j = 0; j < 2; j++)
+ {
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch -= in_stride;
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+ m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+ m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+ m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+ m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+ m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+ m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+ m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+ m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+ /****************/
+
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+ m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+ m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+ m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+ m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+ m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+ m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+ m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+ m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+ /******************/
+
+ m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
+ m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
+
+ m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
+ m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
+
+ m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
+ m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
+
+ m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
+ m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
+
+ m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
+ m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
+
+ m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
+ m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
+
+ m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
+ m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
+
+ m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
+ m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
+
+ pi2_dst_scratch += 4 * trans_size;
+ }
+ }
+ pi2_src += 8;
+ pi2_tmp += 8 * trans_size;
+ zero_cols = zero_cols >> 1;
+ }
+
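+    /* When the first stage operated on fewer than 32 rows
+     * (trans_size_stg1 < TRANS_SIZE_32), the remaining rows of the
+     * intermediate buffer are known to be zero, so they are cleared here
+     * instead of being computed. */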
+ if(trans_size_stg1 != TRANS_SIZE_32)
+ {
+ m_temp_reg_10 = _mm_setzero_si128();
+
+ for(i = trans_size_stg1; i < 32; i += 8)
+ {
+ WORD16 *pi2_dst_scratch = pi2_tmp;
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
+
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
+ _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
+
+ pi2_tmp += 8 * trans_size;
+ }
+ }
+
+ pi2_tmp = pi2_tmp_orig;
+
+ /* Inverse Transform 2nd stage */
+
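+    /* Each iteration transforms four columns of the intermediate buffer
+     * (j += 4 below) with the same 32-point butterfly, using the
+     * second-stage down-shift IT_SHIFT_STAGE_2. */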
+ for(j = 0; j < trans_size; j += 4)
+ {
+ i4_shift = IT_SHIFT_STAGE_2;
+
+        /* Utilize the symmetry of the transform matrix as far as possible to minimize the number of multiplications */
+ if(zero_last28_rows_stg2)
+ {
+ {
+
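+                /* Only the first four rows of this column group are nonzero
+                 * (zero_last28_rows_stg2), so the even part reduces to rows 0
+                 * and 2 (row 2 is unpacked against zero and multiplied by the
+                 * eo coefficients on its own) and the odd part to rows 1 and 3. */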
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
+
+ m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+ /* eo1[0-3] */
+ {
+ m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ }
+ /* eo2[0-3] */
+ {
+ m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ }
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+ }
+ /* eo4[0-3] */
+ {
+ m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
+ }
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
+ }
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
+ }
+ /* eo7[0-3] */
+ {
+ m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
+ }
+ }
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
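+            /* Butterfly combining the even-even and even-odd parts:
+             * e[k] = ee[k] + eo[k], e[15 - k] = ee[k] - eo[k], for k = 0..7. */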
+            /* e[] */
+
+            temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* e[8] */
+
+ /*o[k]*/
+ {
+
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = 8;
+
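+            /* Odd part o[k]: with only rows 1 and 3 nonzero, each o[k] is a
+             * single madd over the interleaved row-1/row-3 samples. The results
+             * are buffered in the temp_ptr scratch area (stride 8) and merged
+             * with e[k] via add/sub, round and shift. */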
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+
+ }
+ else if(zero_last24_rows_stg2)
+ {
+ /* eo */
+ {
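+            /* With only the first eight rows nonzero (zero_last24_rows_stg2),
+             * the even-odd part takes contributions from rows 2 and 6 only,
+             * interleaved so that each eo[k] is a single madd. */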
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+
+ m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+ m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+
+ /* eo3[0-3] */
+ {
+
+ m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+ /* eo6[0-3] */
+ {
+ m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+ /* eo7[0-3] */
+ {
+ m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ }
+
+ /* eeo */
+ {
+
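+                /* Even-even-odd part: of the eeo rows only row 4 is nonzero
+                 * here, so it is unpacked against zero and multiplied by the
+                 * packed 16-point coefficients (89/75/50/18). */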
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
+
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
+
+ /* eeo0[0-3] */
+ {
+ temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+
+ }
+
+ /* eeo1[0-3] */
+ {
+ temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
+
+ }
+
+                /* eeo2[0-3] */
+ {
+ temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
+
+ }
+
+
+                /* eeo3[0-3] */
+ {
+ temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+
+ }
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
+
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */
+ m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */
+
+ m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */
+ m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */
+
+ m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */
+ m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */
+
+ m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */
+ m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4); /* ee[4] */
+
+            /* e[] */
+
+            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* e[0] */
+            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* e[15] */
+
+            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* e[1] */
+            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* e[14] */
+
+            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* e[2] */
+            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* e[13] */
+
+            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* e[3] */
+            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* e[12] */
+
+            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* e[4] */
+            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* e[11] */
+
+            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* e[5] */
+            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* e[10] */
+
+            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* e[6] */
+            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* e[9] */
+
+            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* e[7] */
+            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* e[8] */
+
+ /*o[k] */
+ {
+
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = 8;
+
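+            /* Odd part o[k]: rows 1, 3, 5 and 7 contribute, interleaved into
+             * two madd operands (rows 1/3 and rows 5/7) whose products are
+             * summed per output. */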
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
+
+ m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+ }
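+        /* Full-length path: all 32 rows of pi2_tmp contribute, so every stage
+         * (eo, eeo, eee/eeee, o) is computed with the complete coefficient set. */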
+ else
+ {
+ /* eo */
+ {
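+            /* eo[0..7]: dot products of the even-odd rows (2, 6, ..., 30) with
+             * rows of the packed coefficient table g_ai2_ihevc_trans_32_intr_packed. */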
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
+
+
+ m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
+ m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
+ m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
+ m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
+ m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
+ m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
+ m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
+
+ /* eo0[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
+
+ /* eo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
+
+ /* eo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
+
+ /* eo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
+
+
+ /* eo4[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
+
+ /* eo5[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
+
+ /* eo6[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
+
+ /* eo7[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
+
+ m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
+
+
+ }
+
+ }
+
+ /* eeo */
+ {
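+            /* eeo[0..3] from rows 4, 12, 20 and 28, two madds per term. */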
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
+
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
+ m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
+ m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
+
+ /* eeo0[0-3] */
+ {
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
+
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+
+ temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
+
+ /* eeo1[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
+
+            /* eeo2[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
+
+            /* eeo3[0-3] */
+ {
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
+
+ temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ }
+
+
+ }
+
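+        /* Final even stage: eeee[0..1] from rows 0 and 16, eeeo[0..1] from
+         * rows 8 and 24; successive butterflies then give eee[0..3] and ee[0..7]. */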
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
+
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
+ m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
+ m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
+
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
+
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
+
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
+
+/* eeeo[0] = m_temp_reg_20 */
+/* eeeo[1] = m_temp_reg_22 */
+/* eeee[0] = m_temp_reg_21 */
+/* eeee[1] = m_temp_reg_23 */
+
+        /* eee[0] = eeee[0] + eeeo[0]; */
+        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eee[0] */
+
+        /* eee[3] = eeee[0] - eeeo[0]; */
+        m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eee[3] */
+
+        /* eee[2] = eeee[1] - eeeo[1]; */
+        m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eee[2] */
+
+        /* eee[1] = eeee[1] + eeeo[1]; */
+        m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eee[1] */
+
+ m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */
+ m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */
+
+ m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */
+ m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */
+
+ m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */
+ m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */
+
+ m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */
+ m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */
+
+/* e[k] = ee[k] + eo[k] ; e[15 - k] = ee[k] - eo[k] */
+
+        temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* e[0] */
+        temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* e[15] */
+
+        temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* e[1] */
+        temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* e[14] */
+
+        temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* e[2] */
+        temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* e[13] */
+
+        temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* e[3] */
+        temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* e[12] */
+
+        m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* e[4] */
+        m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* e[11] */
+
+        m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* e[5] */
+        m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* e[10] */
+
+        m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* e[6] */
+        m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* e[9] */
+
+        m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* e[7] */
+        m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* e[8] */
+
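+        /* e[] register map at this point: e[0..3] in temp1/temp3/temp5/temp7,
+         * e[12..15] in temp8/temp6/temp4/temp2, e[4..7] in m_temp_reg_90/92/94/96
+         * and e[8..11] in m_temp_reg_97/95/93/91. */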
+/*o[k] */
+ {
+
+ WORD16 *pi2_dst_scratch = temp_ptr;
+ WORD32 out_stride = 8;
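+            /* Each o[k] block stores one 8x16-bit vector to the scratch buffer:
+             * four rounded (e + o) and four rounded (e - o) results packed by
+             * _mm_packs_epi32. */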
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
+
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
+
+ m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
+ m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
+ m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
+ m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
+ m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
+ m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
+ m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
+ m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
+ m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
+ m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
+ m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
+ m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
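+            /* o[k], k = 0..15: eight madds accumulate the sixteen odd rows
+             * (1, 3, ..., 31) against the k-th group of the odd coefficient table,
+             * then the usual butterfly/round/shift/pack against e[k]. */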
+
+ /* o0[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
+ m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
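+            /* o1..o15 below repeat the o0 sequence; only the coefficient rows,
+             * the e[] operand and the add/sub mix used to combine the partial
+             * sums change from block to block. */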
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
+
+ /* o1[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
+
+ /* o2[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
+
+ /* o3[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
+
+ /* o4[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
+
+ /* o5[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
+
+ /* o6[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
+
+ /* o7[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+                pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
+
+ /* o8[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
+
+ /* o9[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
+
+ /* o10[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
+
+ /* o11[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
+
+ /* o12[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
+
+ /* o13[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
+
+ /* o14[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += out_stride;
+
+ }
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
+ m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
+ m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
+ m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
+ m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
+
+ /* o15[0-3] */
+ {
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
+
+ m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
+ m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
+ m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
+ m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
+
+ m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
+ m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_count = _mm_cvtsi32_si128(i4_shift);
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
+ m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
+
+ m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
+
+ _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
+ pi2_dst_scratch += 8;
+ }
+
+ }
+
+ }
+
+ /* Transpose */
+ {
+
+ WORD16 *pi2_src_scratch = temp_ptr;
+ WORD32 out_stride = dst_strd;
+ WORD32 in_stride = 8;
+
+ m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += in_stride;
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
+ pi2_src_scratch += 8;
+
+
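+        /* 8x8 16-bit transpose via the usual unpack ladder: interleave epi16
+           lanes, then epi32, then epi64. Note the swapped operand order in
+           the unpackhi pairs: it compensates for the mirrored column order
+           of the packed (e - o) butterfly halves, so that the epi64 stage
+           yields the ascending ranges noted in the row comments below
+           (0-7, 8-15, 16-23, 24-31). */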
+ m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
+
+ m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
+ m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
+
+ m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
+
+ m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
+ m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
+
+ m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
+ m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
+
+ m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
+ m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
+
+ m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
+ m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
+
+ m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
+ m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
+
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
+
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
+
+ m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
+ m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
+
+ m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
+ m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
+
+ m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
+ m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
+
+ m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
+ m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
+
+
+ m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7
+ m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7
+
+ m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31
+ m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31
+
+ m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15
+ m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15
+
+ m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23
+ m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23
+
+ m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7
+ m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7
+
+ m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31
+ m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31
+
+ m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15
+ m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15
+
+ m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23
+ m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23
+
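+        /* Reconstruction: widen the 8-bit prediction to 16 bits by
+           interleaving with all_zero_reg, add the inverse-transform
+           residue, then _mm_packus_epi16 saturates the sums back to
+           unsigned 8 bits (the clip-to-8-bit step) before each 32-pixel
+           row is stored. */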
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
+
+ m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
+
+ m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
+ m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
+
+ m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
+ m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
+
+ _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
+ pu1_dst += out_stride;
+ pu1_pred += pred_strd;
+
+ }
+ pi2_tmp += 4;
+ }
+}
+
+
diff --git a/common/x86/ihevc_itrans_recon_sse42_intr.c b/common/x86/ihevc_itrans_recon_sse42_intr.c
new file mode 100644
index 0000000..b472486
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_sse42_intr.c
@@ -0,0 +1,2503 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_sse42_intr.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * 100470
+ *  Edited by 100592
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_4x4_ttype1_sse42()
+ * - ihevc_itrans_recon_4x4_sse42()
+ * - ihevc_itrans_recon_8x8_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_macros.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform
+ * type 1 (DST) and reconstruction for 4x4 input block
+ *
+ * @par Description:
+ * Performs inverse quantization, inverse transform type 1, adds the
+ * prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
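+/* A scalar reference sketch of the 4-point inverse DST butterfly that the
+ * intrinsics below vectorize four columns at a time (derived from the code;
+ * variable names are illustrative):
+ *
+ *     c0 = src[0] + src[2];   c1 = src[2] + src[3];
+ *     c2 = src[0] - src[3];   c3 = 74 * src[1];
+ *     c4 = src[0] - src[2] + src[3];
+ *
+ *     dst[0] = (29*c0 + 55*c1 + c3 + rnd) >> shift;
+ *     dst[1] = (55*c2 - 29*c1 + c3 + rnd) >> shift;
+ *     dst[2] = (74*c4          + rnd) >> shift;
+ *     dst[3] = (55*c0 + 29*c2 - c3 + rnd) >> shift;
+ *
+ * with rnd = 1 << (shift - 1). Both stages run the same butterfly; only the
+ * shift differs (IT_SHIFT_STAGE_1 for stage 1, IT_SHIFT_STAGE_2 for stage 2).
+ */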
+void ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_coeff1, m_coeff2, m_coeff3;
+ __m128i m_rdng_factor;
+ __m128i m_count;
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ UNUSED(zero_rows);
+ UNUSED(zero_cols);
+ UNUSED(pi2_tmp);
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[2][0]); //74
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+ m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+ m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+ m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+ m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
+
+ /* c[4] partial term in m_temp_reg_14 */
+ /* c[4] = src[0] - src[2] + src[3]; src[3] is added further below */
+ {
+ m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
+ }
+
+ /* c[3] in m_temp_reg_13 */
+ {
+ m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
+ }
+
+ /* c[0] in m_temp_reg_10 */
+ {
+ m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
+ }
+
+ /* c[1] in m_temp_reg_11 */
+ {
+ m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
+ }
+
+ /* c[2] in m_temp_reg_12 */
+ {
+ m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
+ }
+
+ /* c[4] completed in m_temp_reg_14 */
+ /* c[4] = src[0] - src[2] + src[3] */
+ {
+ m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
+ }
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[1][0]); //29
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[0][0]); //55
+
+ /* Stage 1 outputs stored in m_temp_reg_20-23 */
+ {
+ m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1); //29*c0
+ m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2); //55*c1
+
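+        /* Broadcast the rounding constant 1 << (i4_shift - 1) to all four
+           32-bit lanes: cvtsi32 places it in lane 0, and the two unpacks
+           (epi32 then epi64) replicate it; the broadcast is interleaved
+           with the multiplies above and below. */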
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1); //29*c1
+ m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2); //55*c2
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2); //55*c0
+ m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1); //29*c2
+ m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3); //74*c4
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+ m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+
+ }
+
+ /* Stage 2 */
+ {
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+ m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+ m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+ m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
+
+ /* c[4] partial term stored in m_temp_reg_4 */
+ {
+ m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ }
+
+ /* c[3] stored in m_temp_reg_3 */
+ {
+ m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
+ }
+
+ /* c[0] stored in m_temp_reg_0 */
+ {
+ m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ }
+
+ /* c[1] stored in m_temp_reg_1 */
+ {
+ m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
+ }
+
+ /* c[2] stored in m_temp_reg_2 */
+ {
+ m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
+ }
+
+ /* c[4] completed in m_temp_reg_4 */
+ {
+ m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
+ }
+
+ /* Stage 2 output generation */
+ {
+ m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); //29*c0
+ m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2); //55*c1
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //29*c1
+ m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); //55*c2
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); //55*c0
+ m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); //29*c2
+ m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3); //74*c4
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+ m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+ m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+ }
+
+ /* Recon and store */
+ {
+ WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+ m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+ m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
+ m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+ m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+ m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform
+ * (DCT) and reconstruction for 4x4 input block
+ *
+ * @par Description:
+ * Performs inverse quantization, inverse transform, adds the
+ * prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
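+/* A scalar reference sketch of the 4-point partial-butterfly inverse DCT
+ * implemented below (derived from the intrinsics; names are illustrative):
+ *
+ *     e0 = 64*src[0] + 64*src[2];   e1 = 64*src[0] - 64*src[2];
+ *     o0 = 83*src[1] + 36*src[3];   o1 = 36*src[1] - 83*src[3];
+ *
+ *     dst[0] = (e0 + o0 + rnd) >> shift;   dst[1] = (e1 + o1 + rnd) >> shift;
+ *     dst[2] = (e1 - o1 + rnd) >> shift;   dst[3] = (e0 - o0 + rnd) >> shift;
+ *
+ * with rnd = 1 << (shift - 1); the multiply by 64 is done as a left shift
+ * by 6 (_mm_slli_epi32) since 64 = 1 << 6.
+ */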
+void ihevc_itrans_recon_4x4_sse42(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+
+
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_coeff1, m_coeff3;
+ __m128i m_rdng_factor;
+ __m128i m_count;
+
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ UNUSED(zero_rows);
+ UNUSED(zero_cols);
+ UNUSED(pi2_tmp);
+
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+ m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+ m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+ m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+ m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[0][0]); //36
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[2][0]); //83
+
+ /* e */
+ {
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
+ }
+
+ /* o */
+ {
+ m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //src[1]*36
+ m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3); //src[3]*83
+ m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); //src[1]*83
+ m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1); //src[3]*36
+ }
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ /* e1 stored in m_temp_reg_31 */
+ {
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ /* e0 stored in m_temp_reg_30 */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ /* o1 stored in m_temp_reg_33 */
+ {
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+ }
+
+ /* e1 + add */
+ {
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ }
+
+ /* e0 + add */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ }
+
+ /* o0 stored in m_temp_reg_34 */
+ {
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+ }
+
+ /* Stage 1 outputs */
+ {
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+ }
+
+ /* Stage 2 */
+ {
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+ m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+
+ m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+ m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
+
+ /* e */
+ {
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
+ }
+
+ /* o */
+ {
+ m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1); //src[1]*36
+ m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); //src[1]*83
+ m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3); //src[3]*83
+ m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1); //src[3]*36
+ }
+
+ /* e */
+ {
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
+ }
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ /* e1 stored in m_temp_reg_31 */
+ {
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ /* e0 stored in m_temp_reg_30 */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ /* o1 stored in m_temp_reg_33 */
+ {
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+ }
+
+ /* e1 + add */
+ {
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ }
+
+ /* e0 + add */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ }
+
+ /* o0 stored in m_temp_reg_34 */
+ {
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+ }
+
+ /* Stage 2 outputs */
+ {
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+ }
+
+ /* Recon and store */
+ {
+ UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+ m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+ m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
+ m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+ m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+ m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+ pu1_dst += dst_strd;
+ pu4_dst = (UWORD32 *)(pu1_dst);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pu4_dst = (UWORD32 *)(pu1_dst);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pu4_dst = (UWORD32 *)(pu1_dst);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ }
+}
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform and
+ * reconstruction for 8x8 input block
+ *
+ * @par Description:
+ * Performs inverse quantization, inverse transform, adds the
+ * prediction data and clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 8x8 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
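+/* The 8-point inverse DCT below uses the standard even/odd split: rows
+ * 0/2/4/6 feed a 4-point even half (the ee/eo terms, built with
+ * _mm_madd_epi16 against g_ai2_ihevc_trans_intr_even_8), while rows
+ * 1/3/5/7 feed the odd half against g_ai2_ihevc_trans_intr_odd_8; the
+ * outputs come from the e +/- o butterflies. zero_rows / zero_cols select
+ * shorter paths when coefficient rows or columns 4-7 are all zero. */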
+void ihevc_itrans_recon_8x8_sse42(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_16;
+ __m128i m_temp_reg_17;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_26;
+ __m128i m_temp_reg_27;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+ __m128i m_temp_reg_50;
+ __m128i m_temp_reg_51;
+ __m128i m_temp_reg_52;
+ __m128i m_temp_reg_53;
+ __m128i m_temp_reg_54;
+ __m128i m_temp_reg_55;
+ __m128i m_temp_reg_56;
+ __m128i m_temp_reg_57;
+ __m128i m_temp_reg_60;
+ __m128i m_temp_reg_61;
+ __m128i m_temp_reg_62;
+ __m128i m_temp_reg_63;
+ __m128i m_temp_reg_64;
+ __m128i m_temp_reg_65;
+ __m128i m_temp_reg_66;
+ __m128i m_temp_reg_67;
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+
+ WORD32 check_row_stage_1; /* Lokesh */
+ WORD32 check_row_stage_2; /* Lokesh */
+
+ __m128i m_rdng_factor;
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ UNUSED(pi2_tmp);
+ check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
+ check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
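+    /* Bits 4-7 of zero_rows / zero_cols flag source rows / columns 4-7 as
+       all-zero; when all four bits are set, the contribution of rows 5 and
+       7 to the odd half is zero, so the shorter paths below skip those
+       madd products (and stage 2 likewise skips the upper-half columns). */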
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
+
+ if(!check_row_stage_2)
+ {
+ if(!check_row_stage_1)
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleaving rows 0 and 4 into regs 0, 1 (Rishab)
+ /* coeff2 for m_temp_reg_12 and m_temp_reg_13, coeff1 for m_temp_reg_10 and m_temp_reg_11 */
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ /* as the upper 8 bytes are zero, m_temp_reg_15 and m_temp_reg_17 are not used */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Instructions combined so that some can be eliminated based on zero_rows (Lokesh) */
+ //Interleaving rows 2 and 6 into regs 4, 5 (Rishab)
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ //o0:1B*89+3B*75,5B*50+7B*18
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ /* Upper 8 bytes of both registers are zero due to zero_cols */
+
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_setzero_si128();
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o1:1B*75-3B*18,5B*89+7B*50
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o2:1B*50-3B*89,5B*18+7B*75
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o3:1B*18-3B*50,5B*75-7B*89
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_setzero_si128();
+ m_temp_reg_55 = _mm_setzero_si128();
+ m_temp_reg_56 = _mm_setzero_si128();
+ m_temp_reg_57 = _mm_setzero_si128();
+ }
+ }
+ else
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleaving rows 0 and 4 into regs 0, 1 (Rishab)
+ /* coeff2 for m_temp_reg_12 and m_temp_reg_13, coeff1 for m_temp_reg_10 and m_temp_reg_11 */
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ /* as the upper 8 bytes are zero, m_temp_reg_15 and m_temp_reg_17 are not used */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Instructions combined so that some can be eliminated based on zero_rows (Lokesh) */
+ //Interleaving rows 2 and 6 into regs 4, 5 (Rishab)
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ //o0:1B*89+3B*75,5B*50+7B*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ /* Upper 8 bytes of both registers are zero due to zero_cols */
+
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_setzero_si128();
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o1:1B*75-3B*18,5B*89+7B*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o2:1B*50-3B*89,5B*18+7B*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o3:1B*18-3B*50,5B*75-7B*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_setzero_si128();
+ m_temp_reg_55 = _mm_setzero_si128();
+ m_temp_reg_56 = _mm_setzero_si128();
+ m_temp_reg_57 = _mm_setzero_si128();
+ }
+ }
+
+ /* Stage 2 */
+ i4_shift = IT_SHIFT_STAGE_2;
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ /* Loading coeff for computing o0 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+ //o0:1B*89+3B*75,1T*89+3T*75
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+ /* Loading coeff for computing o1 in the next block */
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o1:1B*75-3B*18,1T*75-3T*18
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+ /* Loading coeff for computing o2 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o2:1B*50-3B*89,1T*50-3T*89
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o3:1B*18-3B*50,1T*18-3T*50
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+ m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+ m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+ m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
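+ /* The block above is the standard three-round 8x8 16-bit transpose:
+  * round 1 interleaves 16-bit lanes of adjacent rows, round 2
+  * interleaves 32-bit pairs, round 3 interleaves 64-bit halves.
+  * A minimal sketch of one output column pair (names illustrative only):
+  *   t01 = _mm_unpacklo_epi16(r0, r1);   // a0 b0 a1 b1 a2 b2 a3 b3
+  *   t23 = _mm_unpacklo_epi16(r2, r3);   // c0 d0 c1 d1 c2 d2 c3 d3
+  *   t45 = _mm_unpacklo_epi16(r4, r5);
+  *   t67 = _mm_unpacklo_epi16(r6, r7);
+  *   q0  = _mm_unpacklo_epi32(t01, t23); // a0 b0 c0 d0 a1 b1 c1 d1
+  *   q1  = _mm_unpacklo_epi32(t45, t67); // e0 f0 g0 h0 e1 f1 g1 h1
+  *   c0  = _mm_unpacklo_epi64(q0, q1);   // a0 b0 c0 d0 e0 f0 g0 h0
+  *   c1  = _mm_unpackhi_epi64(q0, q1);   // a1 b1 c1 d1 e1 f1 g1 h1
+  */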
+
+ /* Recon and store */
+ {
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ m_temp_reg_50 = _mm_setzero_si128();
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+ m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+ m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+ m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+ m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+ m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+ m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+ m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+ m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+ m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+ m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+ m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+ m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+ m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+ m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+ m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+ pu1_dst += dst_strd;
+ }
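+ /* Reconstruction above is, per pixel, dst = clip_u8(pred + residue):
+  * the 8-bit prediction is zero-extended with _mm_unpacklo_epi8, the
+  * 16-bit inverse-transform output is added, and _mm_packus_epi16
+  * saturates the sums back to [0, 255]. A scalar sketch (hypothetical
+  * plain-C form, not part of the build):
+  *   for(r = 0; r < 8; r++)
+  *       for(c = 0; c < 8; c++)
+  *           pu1_dst[r * dst_strd + c] = CLIP3(0, 255,
+  *               pu1_pred[r * pred_strd + c] + pi2_res[r * 8 + c]);
+  */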
+ }
+ }
+ else
+ {
+
+ if(!check_row_stage_1)
+ {
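+ /* Fast path: check_row_stage_1 == 0 appears to mean the bottom
+  * coefficient rows are zero (an inference from the register usage),
+  * so rows 5 and 7 (m_temp_reg_75/77) are skipped and the odd terms
+  * need only the row 1/3 products. */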
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleave rows 0 and 4 into registers 0 and 1 (Rishab)
+ /* coef2 for m_temp_reg_12 and m_temp_reg_13, coef1 for m_temp_reg_10 and m_temp_reg_11 */
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Instructions combined here so they can be eliminated based on zero_rows (Lokesh) */
+ //Interleave rows 2 and 6 into registers 4 and 5 (Rishab)
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ //o0:1B*89+3B*75,1T*89+3T*75
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+ }
+ else
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleave rows 0 and 4 into registers 0 and 1 (Rishab)
+ /* coef2 for m_temp_reg_12 and m_temp_reg_13, coef1 for m_temp_reg_10 and m_temp_reg_11 */
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Instructions combined here so they can be eliminated based on zero_rows (Lokesh) */
+ //Interleave rows 2 and 6 into registers 4 and 5 (Rishab)
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+ //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+ }
+ /* Stage 2 */
+
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ /* Loading coeff for computing o0 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+ //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+ /* Loading coeff for computing o1 in the next block */
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+ /* Loading coeff for computing o2 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+ m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+ m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+ m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+
+ /* Recon and store */
+ {
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+
+ m_temp_reg_50 = _mm_setzero_si128();
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+ m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+ m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+ m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+ m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+ m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+ m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+ m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+ m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+ m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+ m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+ m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+ m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+ m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+ m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+ m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+ pu1_dst += dst_strd;
+
+ }
+
+
+ }
+
+
+ }
+}
diff --git a/common/x86/ihevc_itrans_recon_ssse3_intr.c b/common/x86/ihevc_itrans_recon_ssse3_intr.c
new file mode 100644
index 0000000..960ecdf
--- /dev/null
+++ b/common/x86/ihevc_itrans_recon_ssse3_intr.c
@@ -0,0 +1,2744 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_itrans_recon_ssse3_intr.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * 100470
+ * edited by 100592
+ *
+ * @par List of Functions:
+ * - ihevc_itrans_recon_4x4_ttype1_ssse3()
+ * - ihevc_itrans_recon_4x4_ssse3()
+ * - ihevc_itrans_recon_8x8_ssse3()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_defs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_trans_tables.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_trans_macros.h"
+
+
+#include <immintrin.h>
+#include <emmintrin.h>
+
+#include <tmmintrin.h>
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse transform type 1 (DST) and
+ * reconstruction for a 4x4 input block
+ *
+ * @par Description:
+ * Performs inverse transform type 1, adds prediction data and
+ * clips the output to 8 bits
+ *
+ * @param[in] pi2_src
+ * Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 4x4 buffer for storing the inverse
+ * transform first-stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ * Output 4x4 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ * Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
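+/* Reference: a scalar sketch of the 4-point inverse DST arithmetic
+ * actually performed below by the shift-and-add code (names are
+ * illustrative only, not part of the build):
+ *
+ *   c0 = s0 + s2;  c1 = s2 + s3;  c2 = s0 - s3;
+ *   c3 = 74 * s1;  c4 = s0 - s2 + s3;
+ *   o0 = (29*c0 + 55*c1 + c3 + rnd) >> shift;
+ *   o1 = (55*c2 - 29*c1 + c3 + rnd) >> shift;
+ *   o2 = (74*c4              + rnd) >> shift;
+ *   o3 = (55*c0 + 29*c2 - c3 + rnd) >> shift;
+ */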
+void ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_rdng_factor;
+ __m128i m_count;
+
+ __m128i m_ge_zero16b_flag_row0;
+ __m128i m_ge_zero16b_flag_row1;
+ __m128i m_ge_zero16b_flag_row2;
+ __m128i m_ge_zero16b_flag_row3;
+
+ __m128i m_zero = _mm_setzero_si128();
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ UNUSED(zero_cols);
+ UNUSED(zero_rows);
+ UNUSED(pi2_tmp);
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
+ m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
+ m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
+ m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
+ m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
+ m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
+ m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
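+ /* SSSE3 lacks _mm_cvtepi16_epi32 (SSE4.1; cf. the commented block
+  * below), so 16->32 sign extension is emulated: compare against zero
+  * to build the sign mask, then interleave value and mask:
+  *   sign = _mm_cmpgt_epi16(zero, x);     // 0xFFFF where x < 0
+  *   lo   = _mm_unpacklo_epi16(x, sign);  // four sign-extended WORD32
+  */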
+
+ /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+ m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+ m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+ m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
+
+ /* c[4] in m_temp_reg_14 */
+ /* c[4] = src[0] - src[2] + src[3] */
+ {
+ m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
+ }
+
+ /* c[3] in m_temp_reg_13 */
+ {
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3);
+ m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+ //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
+ }
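+ /* Constant multiplies are decomposed into shifts since _mm_mullo_epi32
+  * needs SSE4.1: 74 = 64 + 8 + 2 gives 74*x = (x<<6)+(x<<3)+(x<<1);
+  * below, 29*x = (x<<5)-x-(x<<1) and 55*x = (x<<6)-x-(x<<3). */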
+
+ /* c[0] in m_temp_reg_10 */
+ {
+ m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
+ }
+
+ /* c[1] in m_temp_reg_11 */
+ {
+ m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
+ }
+
+ /* c[2] in m_temp_reg_12 */
+ {
+ m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
+ }
+
+ /* c[4] = src[0] - src[2] + src[3] completed here by adding src[3] */
+ {
+ m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
+ }
+
+ /* Stage 1 outputs stored in m_temp_reg_20-23 */
+ {
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 1);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+ //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+ //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+ //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+ //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+ //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
+ //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3);
+ m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
+ //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+ m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+
+ }
+
+ /* Stage 2 */
+ {
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+ m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+ m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+ m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
+
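+                /* SSSE3 substitute for the disabled _mm_cvtepi16_epi32 above: _mm_cmpgt_epi16(0, x) yields each lane's sign mask, and interleaving x with that mask sign-extends 16-bit lanes to 32 bits */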
+ m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
+ m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
+
+ m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+ m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+ m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+ m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+
+
+ /* c[4] stored in m_temp_reg_4 */
+ {
+ m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
+ }
+
+ /* c[3] stored in m_temp_reg_3 */
+ {
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3);
+ m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1);
+ m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
+ //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
+ }
+
+ /* c[0] stored in m_temp_reg_0 */
+ {
+ m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ }
+
+ /* c[1] stored in m_temp_reg_1 */
+ {
+ m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
+ }
+
+ /* c[2] stored in m_temp_reg_2 */
+ {
+ m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
+ }
+
+ /* c[4] stored in m_temp_reg_4 */
+ {
+ m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
+ }
+
+ /* Stage 2 output generation */
+ {
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1);
+ m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+ //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0
+
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3);
+ m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+ //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1);
+ m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+ //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1
+
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3);
+ m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+ //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3);
+ m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+ //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0
+
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1);
+ m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
+ m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
+ //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2
+
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3);
+ m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1);
+ m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
+ //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
+
+ m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
+
+ m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
+
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+ }
+
+ /* Recon and store */
+ {
+ WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
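+                /* interleaving with zero widens the 8-bit prediction samples to 16 bits, standing in for the SSE4.1 _mm_cvtepu8_epi16 shown disabled below */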
+
+ /*m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+ m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+ m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/
+
+ m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+ m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+ m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ }
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform (DCT) and reconstruction
+ *  for a 4x4 input block
+ *
+ * @par Description:
+ *  Performs inverse transform, adds the prediction data and clips the
+ *  output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 4x4 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 4x4 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 4x4 block
+ *
+ * @param[out] pu1_dst
+ *  Output 4x4 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
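+/* Illustrative call for a dense 4x4 block (argument values here are
+ * assumptions for illustration, not taken from this file):
+ *
+ *   ihevc_itrans_recon_4x4_ssse3(pi2_coeffs, pi2_tmp, pu1_pred, pu1_dst,
+ *                                4, pred_strd, dst_strd, 0, 0);
+ */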
+void ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_rdng_factor;
+ __m128i m_count;
+
+ __m128i m_ge_zero16b_flag_row0;
+ __m128i m_ge_zero16b_flag_row1;
+ __m128i m_ge_zero16b_flag_row2;
+ __m128i m_ge_zero16b_flag_row3;
+
+ __m128i m_zero = _mm_setzero_si128();
+
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ UNUSED(zero_rows);
+ UNUSED(zero_cols);
+ UNUSED(pi2_tmp);
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
+
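+    /* emulate _mm_cvtepi16_epi32 on SSSE3: build per-lane sign masks, then interleave to sign-extend */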
+ m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
+ m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
+ m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
+ m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
+ m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
+ m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
+ m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
+
+ /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
+ m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
+
+ m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
+ m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
+
+ /* e */
+ {
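+        /* even part of the 4x4 IDCT: both even basis coefficients are 64, realized as << 6 */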
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
+ }
+
+ /* o */
+ {
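+        /* odd coefficients by shift-add: 36 = 32 + 4 -> (x<<5) + (x<<2); 83 = 64 + 16 + 2 + 1 -> (x<<6) + (x<<4) + (x<<1) + x */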
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2);
+ m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4);
+ m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
+ m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
+ //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 4);
+ m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1);
+ m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
+ m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
+ //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
+
+ m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5);
+ m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2);
+ m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
+ //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
+ }
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ /* e1 stored in m_temp_reg_31 */
+ {
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ /* e0 stored in m_temp_reg_30 */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ /* o1 stored in m_temp_reg_33 */
+ {
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+ }
+
+ /* e1 + add */
+ {
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ }
+
+ /* e0 + add */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ }
+
+ /* o0 stored in m_temp_reg_34 */
+ {
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+ }
+
+ /* Stage 1 outputs */
+ {
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+ }
+
+ /* Stage 2 */
+ {
+ i4_shift = IT_SHIFT_STAGE_2;
+
+ /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/
+
+ m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
+ m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
+
+ m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+ m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+ m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
+ m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
+
+ /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
+ m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
+
+ m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
+ m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
+
+ /* e */
+ {
+ m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
+ }
+
+ /* o */
+ /*{
+ m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36
+ m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83
+ m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3);//src[3]*83
+ m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36
+ }*/
+ {
+ m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5);
+ m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2);
+ m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
+ //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
+
+ m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6);
+ m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4);
+ m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23);
+ m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
+ m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
+ //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
+
+ m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6);
+ m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4);
+ m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22);
+ m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
+ m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
+ //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
+
+ m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5);
+ m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2);
+ m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
+ //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
+ }
+
+ /* e */
+ {
+ m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
+ }
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+
+ /* e1 stored in m_temp_reg_31 */
+ {
+ m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
+
+ /* e0 stored in m_temp_reg_30 */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
+ }
+
+ m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
+ m_count = _mm_cvtsi32_si128(i4_shift);
+
+ /* o1 stored in m_temp_reg_33 */
+ {
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
+ }
+
+ /* e1 + add */
+ {
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
+ }
+
+ /* e0 + add */
+ {
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
+ }
+
+ /* o0 stored in m_temp_reg_34 */
+ {
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
+ }
+
+ /* Stage 2 outputs */
+ {
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
+
+ m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
+ m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
+ m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
+ m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
+
+ m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
+ m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
+
+ m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
+ m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
+
+ m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
+ m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
+ }
+
+ /* Recon and store */
+ {
+ UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
+ //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
+
+ //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
+ //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
+
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
+
+ m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
+ m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
+
+ m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
+ pu1_dst += dst_strd;
+ pu4_dst = (UWORD32 *)(pu1_dst);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pu4_dst = (UWORD32 *)(pu1_dst);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pu4_dst = (UWORD32 *)(pu1_dst);
+
+ *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ }
+}
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs inverse transform (DCT) and reconstruction
+ *  for an 8x8 input block
+ *
+ * @par Description:
+ *  Performs inverse transform, adds the prediction data and clips the
+ *  output to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse
+ *  transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output stride
+ *
+ * @param[in] zero_cols
+ *  Zero columns in pi2_src
+ *
+ * @param[in] zero_rows
+ *  Zero rows in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_16;
+ __m128i m_temp_reg_17;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_26;
+ __m128i m_temp_reg_27;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+ __m128i m_temp_reg_50;
+ __m128i m_temp_reg_51;
+ __m128i m_temp_reg_52;
+ __m128i m_temp_reg_53;
+ __m128i m_temp_reg_54;
+ __m128i m_temp_reg_55;
+ __m128i m_temp_reg_56;
+ __m128i m_temp_reg_57;
+ __m128i m_temp_reg_60;
+ __m128i m_temp_reg_61;
+ __m128i m_temp_reg_62;
+ __m128i m_temp_reg_63;
+ __m128i m_temp_reg_64;
+ __m128i m_temp_reg_65;
+ __m128i m_temp_reg_66;
+ __m128i m_temp_reg_67;
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+
+    WORD32 check_row_stage_1; /* set when rows 4-7 of the input hold nonzero coefficients */
+    WORD32 check_row_stage_2; /* set when columns 4-7 (the stage-2 rows after transpose) hold nonzero coefficients */
+
+ __m128i m_rdng_factor;
+ //__m128i m_count;
+ WORD32 i4_shift = IT_SHIFT_STAGE_1;
+ UNUSED(pi2_tmp);
+
+ check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
+ check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
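+    /* zero_rows/zero_cols carry one bit per all-zero row/column; (mask & 0xF0) == 0xF0 means rows/columns 4-7 are all zero, so the upper-half products can be skipped in that stage */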
+
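+    /* load all 8 rows of 8 16-bit coefficients; _mm_load_si128 requires pi2_src to be 16-byte aligned */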
+ m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+
+ m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src);
+
+ if(!check_row_stage_2)
+ {
+ if(!check_row_stage_1)
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+                //Interleave rows 0 and 4
+                /* coeff1 drives m_temp_reg_10/11, coeff2 drives m_temp_reg_12/13 */
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
+ {
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions grouped so the zero_rows paths can skip them */
+                //Interleave rows 2 and 6
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+                    //o0:1B*89+3B*75 (rows 5 and 7 are zero in this path)
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ /* Upper 8 bytes of both registers are zero due to zero_cols*/
+
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_setzero_si128();
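+                    /* m_temp_reg_63 stays zero: it supplies the all-zero upper half when packing each destination column */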
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o1:1B*75-3B*18 (rows 5 and 7 are zero in this path)
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o2:1B*50-3B*89 (rows 5 and 7 are zero in this path)
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+                    //o3:1B*18-3B*50 (rows 5 and 7 are zero in this path)
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ */
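+                /* columns 4-7 of the stage-1 output are known zero in this path, so the registers are cleared instead of transposed */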
+ m_temp_reg_54 = _mm_setzero_si128();
+ m_temp_reg_55 = _mm_setzero_si128();
+ m_temp_reg_56 = _mm_setzero_si128();
+ m_temp_reg_57 = _mm_setzero_si128();
+ }
+ }
+ else
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+                //Interleave rows 0 and 4
+                /* coeff1 drives m_temp_reg_10/11, coeff2 drives m_temp_reg_12/13 */
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
+ {
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions grouped so the zero_rows paths can skip them */
+                //Interleave rows 2 and 6
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ //o0:1B*89+3B*75,5B*50+7B*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ /* Upper 8 bytes of both registers are zero due to zero_cols*/
+
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_setzero_si128();
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o1:1B*75-3B*18,5B*89+7B*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o2:1B*50-3B*89,5B*18+7B*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o3:1B*18-3B*50,5B*75-7B*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ */
+ m_temp_reg_54 = _mm_setzero_si128();
+ m_temp_reg_55 = _mm_setzero_si128();
+ m_temp_reg_56 = _mm_setzero_si128();
+ m_temp_reg_57 = _mm_setzero_si128();
+ }
+ }
+
+ /* Stage 2 */
+ i4_shift = IT_SHIFT_STAGE_2;
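+        /* the stage-2 down-shift (IT_SHIFT_STAGE_2, larger than the stage-1 shift in HEVC) removes the remaining transform gain before reconstruction */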
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ /* Loading coeff for computing o0 in the next block */
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57);
+ //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57);
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+ //o0:1B*89+3B*75,1T*89+3T*75
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+ /* Loading coeff for computing o1 in the next block */
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o1:1B*75-3B*18,1T*75-3T*18
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+ /* Loading coeff for computing o2 in the next block */
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+                    //o2:1B*50-3B*89,1T*50-3T*89
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o3:1B*18-3B*50,1T*18-3T*50
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+ m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+ m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+ m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+
+ /* Recon and store */
+ {
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ m_temp_reg_50 = _mm_setzero_si128();
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
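+            /* predictions are now zero-extended to 16 bits; add the residue rows and saturate back to 8 bits */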
+ m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+ m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+ m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+ m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+ m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+ m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+ m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+ m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+ m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+ m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+ m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+ m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+ m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+ m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+ m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+ pu1_dst += dst_strd;
+ }
+ }
+ }
+    else
+    {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+#if 1
+ if(!check_row_stage_1)
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+            //Interleave rows 0 and 4
+            /* coeff1 drives m_temp_reg_10/11, coeff2 drives m_temp_reg_12/13 */
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+                /* Instructions grouped so the zero_rows paths can skip them */
+                //Interleave rows 2 and 6
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+                    /* Loading coeff for computing o0 and o1 in the next block */
+                    /* (coeffs for rows 5 and 7 are not needed on this path) */
+
+                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+
+                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+                        //o0:1B*89+3B*75,1T*89+3T*75 (rows 5 and 7 contribute nothing here)
+                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+                        //o1:1B*75-3B*18,1T*75-3T*18 (rows 5 and 7 contribute nothing here)
+                        m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+                        m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+ }
+ else
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+            //Interleave rows 0 and 4 into registers 0 and 1
+            /* coeff2 produces m_temp_reg_12/m_temp_reg_13, coeff1 produces m_temp_reg_10/m_temp_reg_11 */
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+            /* Instructions are combined so that they can be eliminated based on zero_rows */
+            //Interleave rows 2 and 6 into registers 4 and 5
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+ //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+ }
+ /* Stage 2 */
+
+ i4_shift = IT_SHIFT_STAGE_2;
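+
+    /* Stage 2 rounding: (1 << (i4_shift - 1)) is added before the arithmetic
+       right shift by IT_SHIFT_STAGE_2, i.e. round-half-up on each output */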
+
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ /* Loading coeff for computing o0 in the next block */
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+ //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+ /* Loading coeff for computing o1 in the next block */
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+ /* Loading coeff for computing o2 in the next block */
+ m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
+ m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
+ m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+ m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+ m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+ m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+
+ /* Recon and store */
+ {
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+
+ m_temp_reg_50 = _mm_setzero_si128();
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+ m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+ m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+ m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+ m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+ m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+ m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+ m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
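+            /* _mm_packus_epi16 saturates each 16-bit sum to [0, 255], giving
+               the final reconstruction clip for 8-bit output */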
+ m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+ m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+ m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+ m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+ m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+ m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+ m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+ m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+ pu1_dst += dst_strd;
+
+ }
+
+
+ }
+
+
+ }
+}
+
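Note: the kernel above implements the HEVC 8x8 inverse transform as two passes of an even/odd "partial butterfly": the even half combines rows 0/4 (weight 64) and rows 2/6 (weights 83/36), the odd half combines rows 1/3/5/7 (weights 89/75/50/18), and outputs k and 7-k are formed as e[k]+o[k] and e[k]-o[k] with rounding. A minimal scalar sketch of one 8-point pass, matching the constants visible in the comments above (the function name and layout are illustrative, not part of the patch):

    static void itrans8_1d_sketch(const WORD16 *src, WORD32 stride,
                                  WORD32 *dst, WORD32 shift)
    {
        WORD32 add = 1 << (shift - 1);
        WORD32 e[4], o[4];
        WORD32 ee0, ee1, eo0, eo1;
        WORD32 j, k;

        for(j = 0; j < 8; j++)
        {
            const WORD16 *s = src + j;

            /* even half: rows 0,4 (weight 64) and rows 2,6 (weights 83,36) */
            ee0 = 64 * (s[0 * stride] + s[4 * stride]);
            ee1 = 64 * (s[0 * stride] - s[4 * stride]);
            eo0 = 83 * s[2 * stride] + 36 * s[6 * stride];
            eo1 = 36 * s[2 * stride] - 83 * s[6 * stride];
            e[0] = ee0 + eo0;
            e[1] = ee1 + eo1;
            e[2] = ee1 - eo1;
            e[3] = ee0 - eo0;

            /* odd half: rows 1,3,5,7 (weights 89,75,50,18) */
            o[0] = 89 * s[1 * stride] + 75 * s[3 * stride] + 50 * s[5 * stride] + 18 * s[7 * stride];
            o[1] = 75 * s[1 * stride] - 18 * s[3 * stride] - 89 * s[5 * stride] - 50 * s[7 * stride];
            o[2] = 50 * s[1 * stride] - 89 * s[3 * stride] + 18 * s[5 * stride] + 75 * s[7 * stride];
            o[3] = 18 * s[1 * stride] - 50 * s[3 * stride] + 75 * s[5 * stride] - 89 * s[7 * stride];

            for(k = 0; k < 4; k++)
            {
                dst[j * 8 + k]     = (e[k] + o[k] + add) >> shift;
                dst[j * 8 + 7 - k] = (e[k] - o[k] + add) >> shift;
            }
        }
    }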
diff --git a/common/x86/ihevc_mem_fns_ssse3_intr.c b/common/x86/ihevc_mem_fns_ssse3_intr.c
new file mode 100644
index 0000000..ca0b77a
--- /dev/null
+++ b/common/x86/ihevc_mem_fns_ssse3_intr.c
@@ -0,0 +1,168 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevc_mem_fns_ssse3_intr.c
+ *
+ * @brief
+ * Functions used for memory operations
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_mem_fns.h"
+
+#include <immintrin.h>
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memcpy of a multiple of 8 bytes (8, 16 or 32)
+ *
+ * @par Description:
+ * Copies 8-bit data from source to destination; num_bytes must be a multiple of 8 (8, 16 or 32)
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] pu1_src
+ * UWORD8 pointer to the source
+ *
+ * @param[in] num_bytes
+ * number of bytes to copy
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+
+
+void ihevc_memcpy_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
+{
+ int col;
+ for(col = num_bytes; col >= 8; col -= 8)
+ {
+ __m128i src_temp16x8b;
+ src_temp16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
+ pu1_src += 8;
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
+ pu1_dst += 8;
+ }
+}
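+
+/* Equivalent to memcpy(pu1_dst, pu1_src, num_bytes) when num_bytes is a
+ * multiple of 8; a remainder below 8 bytes is not copied, so callers must
+ * honour the multiple-of-8 contract implied by the _mul_8 suffix. */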
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * memset of a multiple of 8 bytes (8, 16 or 32)
+ *
+ * @par Description:
+ * Sets 8-bit data for num_bytes bytes; num_bytes must be a multiple of 8 (8, 16 or 32)
+ *
+ * @param[in] pu1_dst
+ * UWORD8 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD8 value used for memset
+ *
+ * @param[in] num_bytes
+ * number of bytes to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_memset_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
+{
+ int col;
+ __m128i src_temp16x8b;
+ src_temp16x8b = _mm_set1_epi8(value);
+ for(col = num_bytes; col >= 8; col -= 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
+ pu1_dst += 8;
+ }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * 16-bit memset of a multiple of 8 words (8, 16 or 32)
+ *
+ * @par Description:
+ * Sets 16-bit data for num_words words; num_words must be a multiple of 8 (8, 16 or 32)
+ *
+ * @param[in] pu2_dst
+ * UWORD16 pointer to the destination
+ *
+ * @param[in] value
+ * UWORD16 value used for memset
+ *
+ * @param[in] num_words
+ * number of words to set
+ * @returns
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_memset_16bit_mul_8_ssse3(UWORD16 *pu2_dst, UWORD16 value, UWORD32 num_words)
+{
+ int col;
+ __m128i src_temp16x8b;
+ src_temp16x8b = _mm_set1_epi16(value);
+ for(col = num_words; col >= 8; col -= 8)
+ {
+ _mm_storeu_si128((__m128i *)(pu2_dst), src_temp16x8b);
+ pu2_dst += 8;
+ }
+}
+
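Note: a brief usage sketch for the three helpers above (illustrative only; the buffers and values are invented for the example). Each count must be a multiple of 8 elements: bytes for the 8-bit variants, 16-bit words for the last.

    UWORD8  ref[64];                                    /* assumed initialized  */
    UWORD8  dst8[64];
    UWORD16 dst16[32];

    ihevc_memcpy_mul_8_ssse3(dst8, ref, 64);            /* copy 64 bytes        */
    ihevc_memset_mul_8_ssse3(dst8, 0x80, 32);           /* set first 32 bytes   */
    ihevc_memset_16bit_mul_8_ssse3(dst16, 512, 32);     /* set 32 16-bit words  */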
diff --git a/common/x86/ihevc_padding_ssse3_intr.c b/common/x86/ihevc_padding_ssse3_intr.c
new file mode 100644
index 0000000..42ee5ac
--- /dev/null
+++ b/common/x86/ihevc_padding_ssse3_intr.c
@@ -0,0 +1,334 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_padding_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for Padding
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevc_pad_left_luma_ssse3()
+* - ihevc_pad_left_chroma_ssse3()
+* - ihevc_pad_right_luma_ssse3()
+* - ihevc_pad_right_chroma_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include <string.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_debug.h"
+
+#include <immintrin.h>
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated pad_size times to the left
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_luma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 i;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b;
+
+ const0_16x8b = _mm_setzero_si128();
+
+ ASSERT(pad_size % 8 == 0);
+
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_dst = pu1_src - pad_size;
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+ for(i = 0; i < pad_size; i += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + i), src_temp0_16x8b);
+ }
+ pu1_src += src_strd;
+ }
+
+}
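+
+/* Scalar reference for ihevc_pad_left_luma_ssse3 (illustrative only): each
+ * row's leftmost pixel is smeared pad_size bytes to the left of the row start:
+ *     memset(pu1_src + row * src_strd - pad_size,
+ *            pu1_src[row * src_strd], pad_size);
+ */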
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the left of a 2d array
+*
+* @par Description:
+* The left column of a 2d array is replicated pad_size times to the left
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_left_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b, const1_16x8b;
+ const0_16x8b = _mm_setzero_si128();
+ const1_16x8b = _mm_set1_epi8(1);
+ const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
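+    /* const0_16x8b now holds the byte pattern {0,1,0,1,...}; as a shuffle
+       control it broadcasts the first interleaved U,V pair across the register */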
+
+ ASSERT(pad_size % 8 == 0);
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
+ pu1_dst = pu1_src - pad_size;
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+
+ for(col = 0; col < pad_size; col += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+ }
+ pu1_src += src_strd;
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (luma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated pad_size times to the right
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_luma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD8 *pu1_dst;
+    __m128i const0_16x8b;
+
+    const0_16x8b = _mm_setzero_si128();
+
+    ASSERT(pad_size % 8 == 0);
+
+    for(row = 0; row < ht; row++)
+    {
+        __m128i src_temp0_16x8b;
+
+        src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 1));
+ pu1_dst = pu1_src;
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+ for(col = 0; col < pad_size; col += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+ }
+ pu1_src += src_strd;
+ }
+
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Padding (chroma block) at the right of a 2d array
+*
+* @par Description:
+* The right column of a 2d array is replicated pad_size times to the right
+*
+*
+* @param[in] pu1_src
+* UWORD8 pointer to the source
+*
+* @param[in] src_strd
+* integer source stride
+*
+* @param[in] ht
+* integer height of the array
+*
+* @param[in] pad_size
+* integer padding size of the array
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_pad_right_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 ht,
+ WORD32 pad_size)
+{
+ WORD32 row;
+ WORD32 col;
+ UWORD8 *pu1_dst;
+ __m128i const0_16x8b, const1_16x8b;
+ const0_16x8b = _mm_setzero_si128();
+ const1_16x8b = _mm_set1_epi8(1);
+ const0_16x8b = _mm_unpacklo_epi8(const0_16x8b, const1_16x8b);
+
+ ASSERT(pad_size % 8 == 0);
+
+ for(row = 0; row < ht; row++)
+ {
+ __m128i src_temp0_16x8b;
+
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2));
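+        /* pu1_src points to the first padding position; pu1_src - 2 holds the
+           last valid interleaved U,V pair, which the shuffle below replicates */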
+ pu1_dst = pu1_src;
+ src_temp0_16x8b = _mm_shuffle_epi8(src_temp0_16x8b, const0_16x8b);
+ for(col = 0; col < pad_size; col += 8)
+ {
+ _mm_storel_epi64((__m128i *)(pu1_dst + col), src_temp0_16x8b);
+ }
+
+ pu1_src += src_strd;
+ }
+}
+
diff --git a/common/x86/ihevc_platform_macros.h b/common/x86/ihevc_platform_macros.h
new file mode 100644
index 0000000..ae688e6
--- /dev/null
+++ b/common/x86/ihevc_platform_macros.h
@@ -0,0 +1,118 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_platform_macros.h
+*
+* @brief
+* Platform specific Macro definitions used in the codec
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IHEVC_PLATFORM_MACROS_H_
+#define _IHEVC_PLATFORM_MACROS_H_
+
+//#include <immintrin.h>
+
+
+#define CLIP_U8(x) CLIP3((x), 0, 255)
+#define CLIP_S8(x) CLIP3((x), -128, 127)
+
+#define CLIP_U10(x) CLIP3((x), 0, 1023)
+#define CLIP_S10(x) CLIP3((x), -512, 511)
+
+#define CLIP_U12(x) CLIP3((x), 0, 4095)
+#define CLIP_S12(x) CLIP3((x), -2048, 2047)
+
+#define CLIP_U16(x) CLIP3((x), 0, 65535)
+#define CLIP_S16(x) CLIP3((x), -32768, 32767)
+
+
+
+#define SHL(x,y) (((y) < 32) ? ((x) << (y)) : 0)
+#define SHR(x,y) (((y) < 32) ? ((x) >> (y)) : 0)
+
+#define SHR_NEG(val,shift) ((shift>0)?(val>>shift):(val<<(-shift)))
+#define SHL_NEG(val,shift) ((shift<0)?(val>>(-shift)):(val<<shift))
+
+
+#define ITT_BIG_ENDIAN(x) ((((x) << 24)) | \
+                    (((x) & 0x0000ff00) << 8) | \
+                    (((x) & 0x00ff0000) >> 8) | \
+                    ((UWORD32)(x) >> 24))
+
+
+#define NOP(nop_cnt) {UWORD32 nop_i; for (nop_i = 0; nop_i < nop_cnt; nop_i++);}
+
+#define PLD(a)
+#define INLINE inline
+
+static INLINE UWORD32 CLZ(UWORD32 u4_word)
+{
+ if(u4_word)
+ return (__builtin_clz(u4_word));
+ else
+ return 32;
+}
+
+static INLINE UWORD32 CTZ(UWORD32 u4_word)
+{
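+    /* note: returns 31 (not 32) for an input of 0, unlike CLZ which returns 32 */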
+ if(0 == u4_word)
+ return 31;
+ else
+ {
+ unsigned int index;
+ index = __builtin_ctz(u4_word);
+ return (UWORD32)index;
+ }
+}
+
+#define GCC_ENABLE 1
+
+#if GCC_ENABLE
+#define _mm256_loadu2_m128i(X,Y) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((Y))), _mm_loadu_si128((X)),1)
+
+#define _mm256_storeu2_m128i(X,Y,Z) {_mm_storeu_si128 ((Y), _mm256_castsi256_si128((Z)));_mm_storeu_si128 ((X), _mm256_extracti128_si256((Z),1));}
+
+#define _mm256_set_m128i(X,Y) _mm256_insertf128_si256(_mm256_castsi128_si256((Y)),(X),1)
+
+#endif
+
+
+#define PREFETCH_ENABLE 1
+
+#if PREFETCH_ENABLE
+#define PREFETCH(ptr, type) _mm_prefetch(ptr, type)
+#else
+#define PREFETCH(ptr, type)
+#endif
+
+#define MEM_ALIGN8 __attribute__ ((aligned (8)))
+#define MEM_ALIGN16 __attribute__ ((aligned (16)))
+#define MEM_ALIGN32 __attribute__ ((aligned (32)))
+
+#endif /* _IHEVC_PLATFORM_MACROS_H_ */
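Note: SHR_NEG and SHL_NEG fold the shift direction into the sign of the shift argument, for call sites where a computed shift count can go negative. A quick worked illustration (values invented for the example):

    WORD32 a = SHR_NEG(64, 2);   /* shift > 0: 64 >> 2 == 16  */
    WORD32 b = SHR_NEG(64, -2);  /* shift < 0: 64 << 2 == 256 */
    WORD32 c = SHL_NEG(64, -2);  /* shift < 0: 64 >> 2 == 16  */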
diff --git a/common/x86/ihevc_sao_ssse3_intr.c b/common/x86/ihevc_sao_ssse3_intr.c
new file mode 100644
index 0000000..cffd2a9
--- /dev/null
+++ b/common/x86/ihevc_sao_ssse3_intr.c
@@ -0,0 +1,5653 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_sao_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for sample adaptive offset (SAO) in-loop
+* filtering
+*
+* @author
+* 100592
+*
+* @par List of Functions:
+* - ihevc_sao_band_offset_luma_ssse3()
+* - ihevc_sao_band_offset_chroma_ssse3()
+* - ihevc_sao_edge_offset_class0_ssse3()
+* - ihevc_sao_edge_offset_class0_chroma_ssse3()
+* - ihevc_sao_edge_offset_class1_ssse3()
+* - ihevc_sao_edge_offset_class1_chroma_ssse3()
+* - ihevc_sao_edge_offset_class2_ssse3()
+* - ihevc_sao_edge_offset_class2_chroma_ssse3()
+* - ihevc_sao_edge_offset_class3_ssse3()
+* - ihevc_sao_edge_offset_class3_chroma_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_tables_x86_intr.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_sao.h"
+
+#include <immintrin.h>
+
+#define NUM_BAND_TABLE 32
+/**
+*******************************************************************************
+*
+* @brief
+* Has two sets of functions: band offset and edge offset, each for luma and
+* chroma. Edge offset supports horizontal, vertical, 135 degree and 45 degree
+* classes.
+*
+* @par Description:
+*
+*
+* @param[in-out] pu1_src
+* Pointer to the source
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in-out] pu1_src_left
+* source left boundary
+*
+* @param[in-out] pu1_src_top
+* Source top boundary
+*
+* @param[in-out] pu1_src_top_left
+* Source top left boundary
+*
+* @param[in] pu1_src_top_right
+* Source top right boundary
+*
+* @param[in] pu1_src_bot_left
+* Source bottom left boundary
+*
+* @param[in] pu1_avail
+* boundary availability flags
+*
+* @param[in] pi1_sao_offset_u
+* Chroma U sao offset values
+*
+* @param[in] pi1_sao_offset_v
+* Chroma V sao offset values
+*
+* @param[in] pi1_sao_offset
+* Luma sao offset values
+*
+* @param[in] wd
+* width of the source
+*
+* @param[in] ht
+* height of the source
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+ WORD8 offset = 0;
+
+ __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+ __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
+ __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+ __m128i band_pos_16x8b;
+ __m128i sao_offset;
+ __m128i cmp_mask, cmp_store;
+
+ /* Updating left and top-left and top */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 1];
+ for(col = 0; col < wd; col += 8)
+ {
+ tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+ offset += 8;
+ }
+
+    //replicate (sao_band_pos << 3) as a 16-bit value 8 times
+ band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
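+    /* each of the 32 bands spans 8 pixel values, so (sao_band_pos << 3) is the
+       lowest pixel value covered by the first offset band */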
+ //value set for sao_offset extraction
+ tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+ tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+ tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+ tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+ //loaded sao offset values
+ sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
+ band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+ band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+ band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+ band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+ //band_position addition
+ band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
+ band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
+ band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
+ band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
+ //sao_offset duplication
+ tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+    //setting for comparison
+ cmp_mask = _mm_set1_epi16(16);
+ cmp_store = _mm_set1_epi16(0x00ff);
+
+ //sao_offset addition
+ band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
+ band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
+ band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
+ band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+ band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+ band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+ band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
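+    //For band positions at the ends of the 32-band range the four offset
+    //bands touch the table boundary; the compare-and-mask below appears to
+    //patch the boundary sub-table so no stray indices survive once the
+    //16-bit tables are packed to bytes further down.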
+ switch(sao_band_pos)
+ {
+ case 0:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+ band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
+ break;
+ case 28:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+ band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
+ break;
+ case 29:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+ band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
+ band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
+ break;
+ case 30:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+ band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
+ band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
+ break;
+ case 31:
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
+ band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
+ tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
+ band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
+ break;
+ default:
+ break;
+ }
+ //sao_offset is reused for zero cmp mask.
+ sao_offset = _mm_setzero_si128();
+ tmp_set_128i_1 = _mm_set1_epi8(1);
+ //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+ cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
+ band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
+ band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
+ band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
+
+    //the four 8x16 band-table registers are packed into two 16x8 registers: band_table0_8x16b and band_table2_8x16b
+ band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
+ band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
+
+ band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+ band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
+ band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
+
+ cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+ // band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
+
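+    //The four offset bands span 32 consecutive sample values, so
+    //(pixel - 8*sao_band_pos) lies in [0,31] exactly when the pixel is
+    //inside the window. Out-of-window lanes are driven to 0xFF below, and
+    //since pshufb returns 0 for control bytes with the MSB set, such lanes
+    //fetch nothing from the band tables and keep the original pixel.
+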
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 2)
+ {
+
+
+            //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+
+            //subtract band position (8 bit)
+ tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+ tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the value is less than 0, force the lane to 0xFF
+ tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the value is greater than 31, force the lane to 0xFF
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+
+            //row 0
+            //if the value is >15, set the mask lane to 0xFF (cmp_mask = dup16(15))
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+            //indices 16 to 31 kept for row 0; others become 0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+            //indices 0 to 15 kept for row 0; indices >15 become 0xFF
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+            //zero lanes of the 16-31 copy are forced to 0xFF
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+            //row 1
+            //if the value is >15, set the mask lane to 0xFF (cmp_mask = dup16(15))
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+            //indices 16 to 31 kept for row 1; others become 0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+            //indices 0 to 15 kept for row 1; indices >15 become 0xFF
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+            //zero lanes of the 16-31 copy are forced to 0xFF
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+            //row 0
+            //to preserve pixel values to which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+            //row 1
+            //to preserve pixel values to which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            // combining results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+
+            //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
+
+ pu1_src_cpy += (src_strd << 1);
+ }
+ pu1_src += 16;
+ }
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+    {
+        pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 4)
+ {
+
+
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ //row0 and row1 packed and row2 and row3 packed
+
+ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+
+            //subtract band position (8 bit)
+ tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
+ tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
+            //if the value is less than 0, force the lane to 0xFF
+ tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            //if the value is greater than 31, force the lane to 0xFF
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+
+
+
+ //row 0 and row1
+            //if the value is >15, set the mask lane to 0xFF (cmp_mask = dup16(15))
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+            //indices 16 to 31 kept for rows 0 & 1; others become 0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
+            //indices 0 to 15 kept for rows 0 & 1; indices >15 become 0xFF
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
+            //zero lanes of the 16-31 copy (rows 0 & 1) are forced to 0xFF
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
+ //row 2 and row 3
+            //if the value is >15, set the mask lane to 0xFF (cmp_mask = dup16(15))
+ cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+            //indices 16 to 31 kept for rows 2 & 3; others become 0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
+            //indices 0 to 15 kept for rows 2 & 3; indices >15 become 0xFF
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
+            //zero lanes of the 16-31 copy (rows 2 & 3) are forced to 0xFF
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
+
+ //row 0 and row 1
+            //to preserve pixel values to which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
+
+ //row 2 and row 3
+            //to preserve pixel values to which no offset needs to be added.
+ cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+            // combining results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+
+ //Getting row1 separately
+ src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
+ //Getting row3 separately
+ src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
+
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ pu1_src += 8;
+ }
+
+
+}
+
+void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ WORD32 sao_band_pos_u,
+ WORD32 sao_band_pos_v,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ WORD8 offset = 0;
+
+
+ __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+ __m128i cmp_msk2;
+ __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
+ __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
+ __m128i band_pos_u_16x8b, band_pos_v_16x8b;
+ __m128i sao_offset;
+ __m128i cmp_mask;
+
+
+    /* Update left, top and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+ pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ for(col = 0; col < wd; col += 8)
+ {
+ tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
+ offset += 8;
+ }
+
+    { // band table creation
+ __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
+ // Band table for U component : band_table0_16x8b and band_table2_16x8b
+ //replicating sao_band_pos as 8 bit value 16 times
+ band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
+ //value set for sao_offset extraction
+ tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
+ tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
+ tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
+ tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
+
+ //loaded sao offset values
+ sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+
+        //load the 32 16-bit values of gu2_table_band_idx into 4 registers
+ band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+ band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+ band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+ band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+ //band_position addition
+ band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
+ band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
+ //sao_offset duplication
+ temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+ temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+ temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+ temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+ //sao_offset addition
+ band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
+ band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
+ //reuse for clipping
+ temp1_8x16b = _mm_set1_epi16(0x00ff);
+        //setting for comparison
+ cmp_mask = _mm_set1_epi16(16);
+
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+
+ //temp1_8x16b reuse for compare storage
+ switch(sao_band_pos_u)
+ {
+ case 0:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+ band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
+ break;
+ case 28:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 29:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+ band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 30:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
+ band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
+ break;
+ case 31:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
+ band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+ break;
+ default:
+ break;
+ }
+ //masking upper 8bit values of each 16 bit band table value
+ band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //the four 8x16 band-table registers are packed into two 16x8 registers: band_table0_16x8b and band_table2_16x8b
+ band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
+ band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
+ // Band table for U component over
+
+ // Band table for V component : band_table1_16x8b and band_table3_16x8b
+ // replicating sao_band_pos as 8 bit value 16 times
+ band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
+
+ //loaded sao offset values
+ sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+
+        //load the 32 16-bit values of gu2_table_band_idx into 4 registers
+ temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
+ band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
+ temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
+ band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
+
+ //band_position addition
+ temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
+ temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
+ //sao_offset duplication
+ tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
+
+ //sao_offset addition
+ temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
+ band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
+ temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
+ band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
+
+ //masking upper 8bit values of 16 bit band table value
+ temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+ //temp1_8x16b reuse for compare storage
+
+ switch(sao_band_pos_v)
+ {
+ case 0:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+ temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
+ break;
+ case 28:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 29:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+ temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
+ break;
+ case 30:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
+ temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
+ break;
+ case 31:
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
+ temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
+ temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
+ break;
+ default:
+ break;
+ }
+ //masking upper 8bit values of each 16 bit band table value
+ temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
+ band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
+ temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
+ band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
+        //the four 8x16 band-table registers are packed into two 16x8 registers: band_table1_16x8b and band_table3_16x8b
+ band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
+ band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
+ //band table for u and v created
+ }
+ {
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+
+
+ //sao_offset is reused for zero cmp mask.
+ sao_offset = _mm_setzero_si128();
+ tmp_set_128i_1 = _mm_set1_epi8(1);
+ //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
+ cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
+        //values must be masked to 0x00ff beforehand: packus saturates 0xffff (seen as negative) to 0, whereas it should become 0xff
+
+ cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
+ band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
+ band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
+ cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
+
+ cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 2)
+ {
+                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
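+                //The chroma block is interleaved Cb/Cr: the shift-and-pack
+                //sequence below splits even bytes (U) and odd bytes (V) of
+                //both rows into separate registers so each plane is
+                //classified against its own band table.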
+ //odd values
+ src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //even values
+ src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+ src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //combining odd values
+ src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ //combining even values
+ src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //subtract the per-plane band position (8 bit)
+ tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+ tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, force the lane to 0xFF
+ tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ //if the values greater than 31 put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ // registers reused to increase performance
+                //if the value is >15, set the U mask lane to 0xFF (cmp_mask = dup16(15))
+ src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15, set the V mask lane to 0xFF (cmp_mask = dup16(15))
+ src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+                //indices 16 to 31 kept for U; others become 0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //indices 0 to 15 kept for U; indices >15 become 0xFF
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //indices 16 to 31 kept for V; others become 0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+                //indices 0 to 15 kept for V; indices >15 become 0xFF
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+                //zero lanes of the U 16-31 copy are forced to 0xFF
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+                //zero lanes of the V 16-31 copy are forced to 0xFF
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+
+                //to choose which pixel values to preserve in the U plane
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+                //to choose which pixel values to preserve in the V plane
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+                //pixel values to which no offset is added are preserved.
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+ //indexing 16 -31 bandtable indexes
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                // combining results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+ //reorganising even and odd values
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+
+
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
+
+
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ pu1_src += 16;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ for(row = ht; row > 0; row -= 4)
+ {
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ //row0 and row1 packed and row2 and row3 packed
+
+ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
+ //odd values
+ src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //even values
+ src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
+ src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
+ src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
+ //combining odd values
+ src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ //combining even values
+ src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
+
+                //subtract the per-plane band position (8 bit)
+ tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
+ tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
+                //if the value is less than 0, force the lane to 0xFF
+ tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ //if the values greater than 31 put ff
+ tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
+ tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
+ // registers reused to increase performance
+                //if the value is >15, set the U mask lane to 0xFF (cmp_mask = dup16(15))
+ src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
+                //if the value is >15, set the V mask lane to 0xFF (cmp_mask = dup16(15))
+ src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
+
+                //indices 16 to 31 kept for U; others become 0
+ tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //indices 0 to 15 kept for U; indices >15 become 0xFF
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
+                //indices 16 to 31 kept for V; others become 0
+ tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
+                //indices 0 to 15 kept for V; indices >15 become 0xFF
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
+
+                //zero lanes of the U 16-31 copy are forced to 0xFF
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
+                //zero lanes of the V 16-31 copy are forced to 0xFF
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
+ tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
+ tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
+
+
+                //to choose which pixel values to preserve in the U plane
+ src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
+                //to choose which pixel values to preserve in the V plane
+ src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
+                //pixel values to which no offset is added are preserved.
+ src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
+ src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
+
+ //indexing 0 - 15 bandtable indexes
+ tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
+ tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
+ //indexing 16 -31 bandtable indexes
+ tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
+ tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
+ // combining all offsets results
+ tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
+ tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
+                // combining results with the pixel values
+ src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
+ src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
+ //reorganising even and odd values
+ src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
+ //Getting row1 separately
+ src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
+ //Getting row3 separately
+ src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
+
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ pu1_src += 16;
+ }
+
+
+ }
+}
+
+
+
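+/* Rough scalar equivalent of horizontal (class 0) edge offset (a sketch
+ * assuming the usual HEVC edge-offset definition; the CLIP3 usage and loop
+ * form are illustrative, not this file's API):
+ *
+ *     WORD32 sign(WORD32 x) { return (x > 0) - (x < 0); }
+ *
+ *     for(each pixel p with left neighbour l and right neighbour r)
+ *     {
+ *         WORD32 edge_idx = 2 + sign(p - l) + sign(p - r);   // 0..4
+ *         p = CLIP3(p + pi1_sao_offset[gi1_table_edge_idx[edge_idx]], 0, 255);
+ *     }
+ *
+ * The SIMD version below computes both signs branchlessly for 16 pixels at
+ * a time and resolves the two table lookups with byte shuffles.
+ */
+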
+void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+ UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+ UWORD8 u1_avail0, u1_avail1;
+ WORD32 wd_rem;
+ WORD32 offset = 0;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i left0_16x8b, left1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+ /* Update top and top-left arrays */
+
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+ _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+ offset += 16;
+ }
+
+    //set all MAX_CTB_SIZE entries of the availability mask to 0xFF
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str = au1_src_left_tmp1;
+ {
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //pu1_src_left_cpy =au1_src_left_tmp;
+ for(row = ht; row > 0; row -= 2)
+ {
+
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
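+                //left_store_16x8b carries the saved left-column samples;
+                //each _mm_alignr_epi8 pair below builds a row's
+                //left-neighbour vector (byte i = pixel i-1 of that row) and
+                //at the same time pushes the row's last pixel into
+                //left_store_16x8b for the pu1_src_left update.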
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
+ //row 1 left
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+ //row 0 left
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+
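+                //branchless per-byte sign: subs_epu8 saturates at 0, so the
+                //two cmpeq-with-zero masks are 0xFF where src <= nbr and
+                //0xFF where src >= nbr; their signed difference is exactly
+                //sign(src - nbr) in {-1, 0, 1}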
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+ // row = 1 right
+ edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //sign-extend offsets to 16 bit, add to zero-extended pixels, then saturate-pack to 8 bit
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ au1_mask_cpy += 16;
+ pu1_src += 16;
+ pu1_src_left_cpy -= ht;
+ pu1_src_left_str -= ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+
+ cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
+
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //pu1_src_left_cpy =au1_src_left_tmp;
+ for(row = ht; row > 0; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+ //row 3 left
+ edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
+ cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+ //row 2 left
+ edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+ //row 1 left
+ edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+ //row 0 left
+ edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
+
+ // packing rows together for 16 SIMD operations
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
+ // packing rows together for 16 SIMD operations
+ left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
+
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
+ // row = 1 right
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
+ // row = 2 right
+ edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+ // row = 3 right
+ cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
+ // packing rows together for 16 SIMD operations
+ edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+ edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
+
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
+ cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //sign-extend offsets to 16 bit, add to zero-extended pixels, then saturate-pack to 8 bit
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+                //separating row 1 and row 3
+ cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
+
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ pu1_src += wd;
+ pu1_src_left_cpy -= ht;
+ pu1_src_left_str -= ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+}
+
+
+void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 u1_avail0, u1_avail1;
+ WORD32 wd_rem;
+ WORD32 offset = 0;
+
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i left0_16x8b, left1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ __m128i chroma_offset_8x16b;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+ /* Update top and top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+    pu1_src_top_left[1] = pu1_src_top[wd - 1];
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
+ _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
+ offset += 16;
+ }
+ for(row = 0; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+    //set all MAX_CTB_SIZE entries of the availability mask to 0xFF
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
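+    //as bytes, 0x0800 is the pattern 0x00,0x08: U lanes (even bytes) get +0
+    //and V lanes (odd bytes) get +8, steering the offset shuffle into the V
+    //half of the combined U|V offset table assembled below. The samples stay
+    //interleaved here, which is also why neighbours are fetched 2 bytes away.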
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+ {
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str = au1_src_left_tmp1;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+
+ for(row = ht; row > 0; row -= 2)
+ {
+
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ // row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
+ //row 1 left
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //row 0 left
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+
+                //separating +ve and -ve values, row 0 left
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values, row 1 left
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+ // row = 1 right
+ edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+                //separating +ve and -ve values, row 0 right
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //separating +ve and -ve values, row 1 right
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //sign-extend offsets to 16 bit, add to zero-extended pixels, then saturate-pack to 8 bit
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ au1_mask_cpy += 16;
+ pu1_src += 16;
+ pu1_src_left_cpy -= 2 * ht;
+ pu1_src_left_str -= 2 * ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+
+ cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
+ _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
+
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
+ pu1_src_cpy = pu1_src;
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+
+ for(row = ht; row > 0; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ // row = 1
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 3
+ cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
+ //row 3 left
+ edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
+ left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+ //row 2 left
+ edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+
+
+ // packing rows together for 16 SIMD operations
+ src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
+ left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
+
+ //row 1 left
+ edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
+ edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+ //row 0 left
+ edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
+ // packing rows together for 16 SIMD operations
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
+ left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
+
+                //separating +ve and -ve values for row 2 and row 3
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+
+
+
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ //row = 0 right
+ edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
+ // row = 1 right
+ cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
+ // row = 2 right
+ edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
+ // row = 3 right
+ cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
+ // packing rows together for 16 SIMD operations
+ edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
+ edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
+
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+            //combining sign_left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
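+            //(pshufb acts as a 16-entry LUT here: each byte of edge holds
+            // sign_left + sign_right + 2, a value in 0..4 that indexes
+            // gi1_table_edge_idx)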
+            //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+            //shuffle to get sao offset
+            edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+            //convert to 16 bit, add, then saturated pack
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
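+            //(the pixels are zero-extended to 16 bit against const0, the
+            // signed offsets are sign-extended against their cmpgt mask,
+            // added in 16 bit, and packus clips the sums back to 0..255)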
+
+ left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+            //separating row 1 and row 3
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ // row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ pu1_src_left_str += 8;
+ }
+ pu1_src += wd;
+ pu1_src_left_cpy -= 2 * ht;
+ pu1_src_left_str -= 2 * ht;
+
+ pu1_left_tmp = pu1_src_left_cpy;
+ pu1_src_left_cpy = pu1_src_left_str;
+ pu1_src_left_str = pu1_left_tmp;
+ }
+ for(row = 0; row < 2 * ht; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+
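+/* 90 degree filtering */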
+void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy;
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+
+ /* Updating left and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+
+
+
+ pu1_src_top_cpy = pu1_src_top;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+ {
+ WORD32 ht_rem;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
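+                //(since SIGN(b-a) = -SIGN(a-b), the down sign of this row
+                // pair is negated and reused as the up sign of the next
+                // iteration, saving one subs/cmpeq sequence per loop)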
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //current row -next row
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+            //updating the top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
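+            //(this remainder path is at most 8 pixels wide, so two rows are
+            // packed per xmm register; the per-row signs are shifted into the
+            // two 64-bit halves before the 16-lane arithmetic below)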
+ for(row = ht; row >= 4; row -= 4)
+ {
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //packing row 0 and row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row = 3
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ // row = 4
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+                //separating +ve and -ve values (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+                //separating +ve and -ve values (3,4)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+                //combining sign_up and sign_down
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+ edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+                //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //the next top already in src_top_16x8b
+ //src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+ src_top_16x8b = src_temp1_16x8b;
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
+ }
+}
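+/*
+ * Scalar reference for the vertical (class1) edge offset above - an
+ * illustrative sketch, not part of the decoder (boundary rows follow the
+ * pu1_avail handling, which is omitted here):
+ *
+ *     for(row = 0; row < ht; row++)
+ *     {
+ *         for(col = 0; col < wd; col++)
+ *         {
+ *             WORD32 edge_idx = 2 + SIGN(pu1_src[col] - pu1_src[col - src_strd])
+ *                                 + SIGN(pu1_src[col] - pu1_src[col + src_strd]);
+ *             edge_idx = gi1_table_edge_idx[edge_idx];
+ *             if(0 != edge_idx)
+ *                 pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
+ *                                      0, (1 << BIT_DEPTH_LUMA) - 1);
+ *         }
+ *         pu1_src += src_strd;
+ *     }
+ */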
+
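+/* 90 degree filtering for interleaved chroma */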
+void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy;
+ UWORD8 *pu1_src_cpy;
+ WORD32 wd_rem;
+
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ /* Updating left and top and top-left */
+ for(row = 0; row < ht; row++)
+ {
+ pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
+ pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
+ }
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+
+
+
+ pu1_src_top_cpy = pu1_src_top;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
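+    //(layout note: sao_offset_8x16b now holds the U offsets in bytes 0..7
+    // and the V offsets in bytes 8..15; chroma_offset_8x16b is 0x00,0x08
+    // repeated, so adding it to the interleaved edge indices makes every V
+    // lane index the upper half of the table during the pshufb lookup)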
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+
+ {
+ WORD32 ht_rem;
+
+
+
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+                //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //current row -next row
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+            //updating the top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ for(row = ht; row >= 4; row -= 4)
+ {
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //packing row 0 and row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row = 3
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
+ // row = 4
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
+                //separating +ve and -ve values (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down)
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
+                //separating +ve and -ve values (3,4)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
+                //combining sign_up and sign_down
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
+
+ edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+
+                //packing row 2 and row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+                //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //for the next iteration signup0_16x8b = -signdwn1_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
+ src_top_16x8b = src_temp1_16x8b;
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
+ }
+}
+
+/* 135 degree filtering */
+void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+ UWORD8 *pu1_firstleft;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+ UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+ WORD32 wd_rem;
+ UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
+ WORD32 ht_tmp, ht_0;
+
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ ht_0 = ht; ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+    //setting the availability mask to 0xFF for the full MAX_CTB_SIZE width
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ for(row = 0; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str2 = au1_src_left_tmp1;
+ pu1_src_left_str = au1_src_left_tmp1;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+
+ /* If top-left is available, process separately */
+ if(0 != pu1_avail[4])
+ {
+ WORD8 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+ SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp = pu1_src[0];
+ }
+ }
+ else
+ {
+ u1_pos_0_0_tmp = pu1_src[0];
+ }
+
+ /* If bottom-right is available, process separately */
+ if(0 != pu1_avail[7])
+ {
+ WORD8 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
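+    //(for the 135 degree class the two neighbours of a sample are its
+    // top-left and bottom-right samples, which fall outside the block for
+    // positions (0,0) and (wd-1,ht-1); those corners are therefore
+    // evaluated here as scalars with the same edge-index table)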
+ pu1_firstleft = pu1_src_top_left;
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_firstleft = pu1_src_left_cpy2;
+ pu1_src_left_cpy2++;
+ pu1_src_left_str2++;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ ht_0--;
+ }
+    //storing top left in an xmm register
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+    //update top-left
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
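+    //(interior bytes of au1_mask stay 0xFF; the first and last columns take
+    // pu1_avail[0]/pu1_avail[1], so ANDing the edge indices with the mask
+    // forces index 0 - and hence a zero offset, assuming pi1_sao_offset[0]
+    // is 0 as the scalar corner code implies - when the left/right
+    // neighbour is unavailable)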
+ {
+ WORD32 ht_rem;
+
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
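+            //(the alignr pulls the saved top-left byte in front of the top
+            // row, so each lane of src_top_16x8b holds the diagonal top-left
+            // neighbour of the corresponding pixel of row 0)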
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 1 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+ //to insert left in row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+ //row 0 -row1
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //row1-row0
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ // row = 2 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+                //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //storing the row 1 left for next row.
+ signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+
+                //combining sign_up and sign_down
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+ //manipulation for bottom - row 1
+ signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //row1: saving the rightmost pixel as left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+                //row0: saving the rightmost pixel as left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+ //current row -next row
+                //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+                //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+                //row0: saving the rightmost pixel as left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+                //convert to 16 bit, add, then saturated pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 1;
+ pu1_src_left_str += 1;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[0] = pu1_src_cpy[15];
+ }
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
+ }
+
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+            //updating the top buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
+            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the mask
+            //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //right row1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 0 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //right row2
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for row 2 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values.(2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //manipulation for row 3 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //right row3
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //right row 4
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+ //manipulation for bottom -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row 1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -bottom
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom -row1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //manipulation for bottom- row 1
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //adding the top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //for storing right of row 1
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ //row 0 -row1
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding the top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 1;
+ pu1_src_left_str += 1;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[0] = pu1_src_cpy[7];
+ }
+
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+ pu1_src_org[0] = u1_pos_0_0_tmp;
+ pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+ for(row = 0; row < ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+/* 135 degree filtering */
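+/* SAO class2 works on the 135-degree diagonal: each pixel is compared with
+ * its top-left and bottom-right neighbours. Chroma is interleaved (U,V,...),
+ * so those neighbours sit at -(src_strd + 2) and +(src_strd + 2) and all
+ * lateral shifts below move in 2-byte steps. A scalar sketch of the
+ * per-pixel rule this SIMD code implements (U shown; V is analogous):
+ *
+ * edge_idx = 2 + SIGN(p - p_topleft) + SIGN(p - p_botright);
+ * edge_idx = gi1_table_edge_idx[edge_idx];
+ * if(edge_idx)
+ * p = CLIP3(p + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ */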
+void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+ UWORD8 *pu1_firstleft;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
+ WORD32 wd_rem;
+ UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
+ WORD32 ht_tmp;
+ WORD32 ht_0;
+
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ UNUSED(pu1_src_top_right);
+ UNUSED(pu1_src_bot_left);
+
+ ht_0 = ht; ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+ /* Copy the interleaved U/V left boundary into a scratch buffer */
+ for(row = 0; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ //setting availability mask to ff for the full MAX_CTB_SIZE
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str2 = au1_src_left_tmp1;
+ pu1_src_left_str = au1_src_left_tmp1;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
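+ //sao_offset_8x16b will hold the U offsets in bytes 0..7 and, after the
+ //unpacklo_epi64 below, the V offsets in bytes 8..15. chroma_offset_8x16b is
+ //the byte pattern 0x00,0x08 repeating, so adding it to the edge indices
+ //makes every odd (V) lane index the upper half of that table in the pshufb.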
+
+ /* If top-left is available, process separately */
+ if(0 != pu1_avail[4])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
+ SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp_u = pu1_src[0];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
+ SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_0_tmp_v = pu1_src[1];
+ }
+ }
+ else
+ {
+ u1_pos_0_0_tmp_u = pu1_src[0];
+ u1_pos_0_0_tmp_v = pu1_src[1];
+ }
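+ //the corner pixels are handled with scalar code because their diagonal
+ //neighbours live in other blocks; the results are parked in the u1_pos_*
+ //temporaries and patched back in only after the SIMD passes finish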
+
+ /* If bottom-right is available, process separately */
+ if(0 != pu1_avail[7])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
+ SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
+ u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
+ }
+ pu1_firstleft = pu1_src_top_left;
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_firstleft = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 += 2;
+ pu1_src_left_str2 += 2;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ ht_0--;
+ }
+ //storing top left in an xmm register
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+ left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
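+ //when the left/right neighbour block is unavailable the corresponding edge
+ //columns are masked off; two bytes per column since U and V are interleaved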
+
+ /* top-left arrays */
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ {
+ WORD32 ht_rem;
+ au1_mask_cpy = au1_mask;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
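+ //branchless per-byte sign(cur - neighbour): subs_epu8 saturates at zero, so
+ //at most one of the two differences is non-zero; cmpeq against zero turns
+ //them into 0x00/0xFF masks whose difference is +1, 0 or -1 per byte,
+ //i.e. sign = (a > b) ? 1 : (a < b) ? -1 : 0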
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 1 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ //to insert left in row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ // row = 2 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //storing the row 1 left for next row.
+ signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+
+ //combining sign-left and sign_right
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+ //manipulation for bottom - row 1
+ signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1: save its right edge as the left for the next iteration
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+ //row 0: save its right edge as the left for the next iteration
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ //current row -next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+ //row 0: save its right edge as the left for the next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[1] = pu1_src_cpy[15];
+ pu1_src_left_str[0] = pu1_src_cpy[14];
+ }
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
+ pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
+ }
+
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating the top row buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
+ //row = 0
+ src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
+ src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load the 8-byte availability mask
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
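+ //same two-rows-per-register scheme as the luma remainder path, except that
+ //every lateral shift is 2 bytes to step over the interleaved U/V samples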
+
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //right row1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 0 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //right row2
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for row 2 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values.(2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //manipulation for row 3 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values.(3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //right row3
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+ //separating +ve and -ve values.(2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //right row 4
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values.(3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+ //manipulation for bottom -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values.(bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ pu1_src_left_str += 8;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //row 0 -row 1
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //manipulation for row 1 -row 0
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -bottom
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom -row1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //manipulation for bottom- row 1
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //adding the top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //shifting row 1
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //the next top in src_top_16x8b
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+ //row 0 -row1
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding the top and bottom sign differences
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ src_top_16x8b = src_temp0_16x8b;
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, add, then pack with saturation
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ pu1_src_left_str[1] = pu1_src_cpy[7];
+ pu1_src_left_str[0] = pu1_src_cpy[6];
+ }
+
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
+ pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
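+ //patch the separately computed corner samples back in and publish the
+ //final left column from the scratch buffer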
+ pu1_src_org[0] = u1_pos_0_0_tmp_u;
+ pu1_src_org[1] = u1_pos_0_0_tmp_v;
+ pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
+ pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
+ for(row = 0; row < 2 * ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
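+/* 45 degree filtering: SAO class3 compares each pixel with its top-right and
+ * bottom-left neighbours. A scalar sketch of the per-pixel rule the SIMD
+ * below implements:
+ *
+ * edge_idx = 2 + SIGN(p - p_topright) + SIGN(p - p_botleft);
+ * edge_idx = gi1_table_edge_idx[edge_idx];
+ * if(edge_idx)
+ * p = CLIP3(p + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ */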
+void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
+ UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ WORD32 wd_rem;
+ UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
+ WORD32 ht_tmp;
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i const2_16x8b, const0_16x8b;
+ __m128i left_store_16x8b;
+
+ ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+ au1_src_left_tmp[0] = pu1_src[(wd - 1)];
+ //manipulation for bottom left
+ for(row = 1; row < ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ au1_src_left_tmp[ht] = pu1_src_bot_left[0];
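+ //the scratch left column carries one extra entry at each end: index 0 takes
+ //the block's top-right sample and index ht takes pu1_src_bot_left, because
+ //the 45-degree pattern reads the left column one row below the current row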
+
+ *pu1_src_top_left = pu1_src_top[wd - 1];
+ //setting availability mask to ff for the full MAX_CTB_SIZE
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ pu1_src_left_str2 = au1_src_left_tmp1;
+ pu1_src_left_str = au1_src_left_tmp1;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
+
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
+ }
+
+
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_cpy2++;
+ pu1_src_left_str2++;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
+
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ {
+ WORD32 ht_rem;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ // row = 0 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
+
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //combining sign-left and sign_right
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 1 right
+ signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign_up and sign_down
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+
+ //row1 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ //current row -next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy++;
+ pu1_src_left_str++;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating the top row buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ }
+
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the 8-byte availability mask
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+
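+ //width remainder (8 pixels): four rows per iteration, two rows packed per 128-bit register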
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulating for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //manipulation for row 1 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //row 1 right
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values. (2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 right
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values. (3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 2 -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+ //separating +ve and -ve values. (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //manipulation for row 3 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 11);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values. (3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 right
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values. (bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
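+ //move the (bottom - row 3) sign to the upper half to match the packed two-row layout of the next iteration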
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 4;
+ pu1_src_left_str += 4;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //manipulation for row 1 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+ //manipulation for bottom- row 1 (row 1 right)
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
+ //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+ //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 2;
+ pu1_src_left_str += 2;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //manipulation for row 0 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
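+ //keep only the low 8 bytes of the edge indices: this strip holds 8 valid pixels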
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ src_top_16x8b = src_temp0_16x8b;
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy++;
+ pu1_src_left_str++;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
+ src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+
+ pu1_left_tmp = pu1_src_left_cpy2;
+ pu1_src_left_cpy2 = pu1_src_left_str2;
+ pu1_src_left_str2 = pu1_left_tmp;
+
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ pu1_src_left_str = pu1_src_left_str2;
+
+ }
+ pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
+ pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
+ pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
+ pu1_src_left[0] = au1_src_left_tmp[0];
+ for(row = 1; row < ht_tmp; row++)
+ {
+ pu1_src_left[row] = pu1_src_left_cpy[row];
+ }
+ }
+
+}
+
+void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ UWORD8 *pu1_src_left,
+ UWORD8 *pu1_src_top,
+ UWORD8 *pu1_src_top_left,
+ UWORD8 *pu1_src_top_right,
+ UWORD8 *pu1_src_bot_left,
+ UWORD8 *pu1_avail,
+ WORD8 *pi1_sao_offset_u,
+ WORD8 *pi1_sao_offset_v,
+ WORD32 wd,
+ WORD32 ht)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
+ UWORD8 *pu1_src_cpy, *pu1_src_org;
+ UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
+ UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
+ WORD32 wd_rem;
+ UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
+ WORD32 ht_tmp;
+ WORD32 bit_depth;
+ UWORD8 u1_avail0, u1_avail1;
+
+ __m128i src_top_16x8b, src_bottom_16x8b;
+ __m128i src_temp0_16x8b, src_temp1_16x8b;
+ __m128i signup0_16x8b, signdwn1_16x8b;
+ __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
+ __m128i edge0_16x8b, edge1_16x8b;
+ __m128i au1_mask8x16b;
+ __m128i edge_idx_8x16b, sao_offset_8x16b;
+ __m128i left_store_16x8b;
+ __m128i const0_16x8b, const2_16x8b;
+ __m128i chroma_offset_8x16b;
+
+ ht_tmp = ht;
+ au1_mask8x16b = _mm_set1_epi8(0xff);
+
+
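+ //chroma is interleaved UV, so the left-column buffer holds two bytes (U, V) per row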
+ au1_src_left_tmp[0] = pu1_src[(wd - 2)];
+ au1_src_left_tmp[1] = pu1_src[(wd - 1)];
+ //manipulation for bottom left
+ for(row = 2; row < 2 * ht; row++)
+ {
+ au1_src_left_tmp[row] = pu1_src_left[row];
+ }
+ au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
+ au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
+
+ pu1_src_top_left[0] = pu1_src_top[wd - 2];
+ pu1_src_top_left[1] = pu1_src_top[wd - 1];
+ //setting availability mask to 0xFF over the full MAX_CTB_SIZE width
+ for(col = 0; col < MAX_CTB_SIZE; col += 16)
+ _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
+ bit_depth = BIT_DEPTH_LUMA;
+ pu1_src_org = pu1_src;
+ pu1_src_top_cpy = pu1_src_top;
+ pu1_src_left_cpy2 = au1_src_left_tmp;
+ pu1_src_left_cpy = au1_src_left_tmp;
+ edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
+ sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
+ const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
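+ //V offsets sit in const0_16x8b until both tables are packed into sao_offset_8x16b below;
+ //0x0800 per 16-bit lane gives bytes {0x00, 0x08}, adding 0 to U lanes and 8 to V lanes of the shuffle index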
+ chroma_offset_8x16b = _mm_set1_epi16(0x0800);
+ /* If top-right is available, process separately */
+ if(0 != pu1_avail[5])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
+ SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
+ SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+ }
+ else
+ {
+ u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
+ u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
+ }
+
+ /* If bottom-left is available, process separately */
+ if(0 != pu1_avail[6])
+ {
+ WORD32 edge_idx;
+
+ /* U */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ }
+
+ /* V */
+ edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
+ SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
+
+ edge_idx = gi1_table_edge_idx[edge_idx];
+
+ if(0 != edge_idx)
+ {
+ u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+ }
+ else
+ {
+ u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
+ u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
+ }
+
+
+
+ /* Update height and source pointers based on the availability flags */
+ if(0 == pu1_avail[2])
+ {
+ pu1_src_left_cpy2 += 2;
+ pu1_src_top_cpy = pu1_src;
+ pu1_src += src_strd;
+ ht--;
+ }
+ if(0 == pu1_avail[3])
+ {
+ ht--;
+ }
+
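+ //pack U offsets into the low 8 bytes and V offsets into the high 8 bytes of one register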
+ sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
+ const2_16x8b = _mm_set1_epi8(2);
+ const0_16x8b = _mm_setzero_si128();
+
+
+ //availability mask creation
+ u1_avail0 = pu1_avail[0];
+ u1_avail1 = pu1_avail[1];
+ au1_mask[0] = u1_avail0;
+ au1_mask[1] = u1_avail0;
+ au1_mask[wd - 1] = u1_avail1;
+ au1_mask[wd - 2] = u1_avail1;
+ {
+ WORD32 ht_rem;
+ au1_mask_cpy = au1_mask;
+ for(col = wd; col >= 16; col -= 16)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+
+ //loading the mask
+ au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+
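+ //two rows per iteration; chroma byte shifts step by 2 (one UV pair) rather than 1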
+ for(row = ht; row >= 2; row -= 2)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
+ //row = 1
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ // row = 0 right
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
+
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
+ //combining sign_up and sign_down
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ // row = 1 right
+ signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
+
+ //bottom - row1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration bottom -row1
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //manipulation for row 1 - bottom
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //combining sign_up and sign_down
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
+
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ //row1 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp1_16x8b;
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
+ src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
+
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ }
+ ht_rem = ht & 0x1;
+
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ //to insert left in row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ //current row -next row
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom and constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ //eliminating old left for row 0 and row 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //row0 getting it right for left of next block
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //copying the next top
+ src_top_16x8b = src_temp0_16x8b;
+
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+
+
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ //store left boundary
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+
+ _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy += 2;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+ //for the top left of next part of the block
+ left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
+ //updating the top row buffer
+ _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 16;
+ au1_mask_cpy += 16;
+ }
+ pu1_src_left_cpy = pu1_src_left_cpy2;
+ wd_rem = wd & 0xF;
+ if(wd_rem)
+ {
+ pu1_src_cpy = pu1_src;
+ src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
+ //row = 0
+ src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
+ au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the 8-byte availability mask
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //preparing au1_mask
+ au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_left_cpy = pu1_src_left_cpy2;
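+ //width remainder (8 bytes = 4 UV pairs): four rows per iteration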
+ for(row = ht; row >= 4; row -= 4)
+ {
+ left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //row 1 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //row 0 -row1
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulating for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //row 1 -row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row1-row0
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //manipulation for row 1 -row 2
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //row 2 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //packing row 0 n row 1
+ src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
+ //row1 -row2
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //row 1 right
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //row = 3
+ src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
+
+ // row = 4
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
+
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+
+ //separating +ve and -ve values. (2,1)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //row 2 right
+ signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
+ //combining the appropriate sign change
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
+
+ //separating +ve and -ve values. (3,2)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 2 -row 3
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
+ //row 3 left
+ signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
+
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
+
+ //separating +ve and -ve values. (2,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //manipulation for row 3 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 6);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
+
+ //separating +ve and -ve values. (3,bottom)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
+ edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
+
+
+ //eliminating old left for row 0,1,2,3
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
+ //packing row 2 n row 3
+ src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
+ //row 3 right
+ signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
+ //loading row 3 right into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
+ //adding bottom and top values of row 2 and row 3
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
+ //separating +ve and -ve values. (bottom,3)
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+ //to store right of row 2
+ signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
+
+ //storing right of row 2 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //to store right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
+
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
+
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
+ cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
+ src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
+ edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
+ cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
+ src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+ cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ //row = 2
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
+ // row = 3
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
+
+ src_temp0_16x8b = src_temp1_16x8b;
+ signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
+ pu1_src_cpy += (src_strd << 2);
+ pu1_src_left_cpy += 8;
+ }
+ ht_rem = ht & 0x2;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+ // row = 2
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
+
+ //manipulation for row 0 -row 1
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //manipulation for row 1 - row 0
+ signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //row1-row0
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+
+ //manipulation for row 1 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
+ signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
+ //row1 -bottom
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
+
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
+ edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
+
+ //manipulation for bottom- row 1 (row 1 right)
+ signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
+ //adding top and down subtraction
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
+ //bottom - row 1
+ cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
+
+ //eliminating old left for row 0,1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
+ signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //for the next iteration signup0_16x8b
+ signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
+
+ //storing right of row 1 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ //for storing right of row 0
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+
+ src_top_16x8b = src_temp1_16x8b;
+ //storing right of row 0 into left
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+ //the next top already in src_top_16x8b
+ //convert to 16 bit, add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
+ edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
+
+ cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ // row = 1
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_cpy += (src_strd << 1);
+ pu1_src_left_cpy += 4;
+ }
+ ht_rem = ht & 0x1;
+ if(ht_rem)
+ {
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
+ src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
+
+
+ //manipulation for row 0 -bottom
+ signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
+ //bottom left
+ signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
+ //separating +ve and -ve values.
+ cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
+ cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
+ //creating mask 00 for +ve and -ve values and FF for zero.
+ cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
+ //combining the appropriate sign change
+ edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
+ //adding top and bottom subtractions
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
+ //for row 0 right to put into left store
+ signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ //adding constant 2
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
+ edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
+ edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
+ //left store manipulation 1
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ //filling the left boundary value
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
+ src_top_16x8b = src_temp0_16x8b;
+
+ //shuffle to get sao index
+ edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
+ //using availability mask
+ edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
+ //adding chroma offset to access U and V
+ edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
+ //shuffle to get sao offset
+ edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
+
+ //convert to 16 bit, then add, then saturating pack
+ signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
+ src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
+ cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
+ src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
+ src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
+
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
+ _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
+ pu1_src_cpy += (src_strd);
+ src_temp0_16x8b = src_bottom_16x8b;
+ pu1_src_left_cpy += 2;
+ }
+ { //for bottom right
+ left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
+ left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
+ src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
+ left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
+ _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
+ }
+ if(0 == pu1_avail[3])
+ {
+ src_top_16x8b = src_bottom_16x8b;
+ }
+
+ _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
+ pu1_src += 8;
+ }
+ pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
+ pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
+ pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
+ pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
+ for(row = 0; row < 2 * ht_tmp; row++)
+ {
+ pu1_src_left[row] = au1_src_left_tmp[row];
+ }
+ }
+
+}
diff --git a/common/x86/ihevc_tables_x86_intr.c b/common/x86/ihevc_tables_x86_intr.c
new file mode 100644
index 0000000..0fc3de2
--- /dev/null
+++ b/common/x86/ihevc_tables_x86_intr.c
@@ -0,0 +1,120 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_tables_x86_intr.c
+*
+* @brief
+* Contains table definitions used by the x86 intrinsic functions for
+* intra prediction and deblocking
+*
+*
+* @author
+* Rishab
+*
+* @par List of Functions:
+*  None (this file defines only tables)
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_tables_x86_intr.h"
+
+// LUMA INTRA PRED
+const UWORD8 IHEVCE_SHUFFLEMASKY1[16] = { 0x03, 0x02, 0x01, 0x00,
+ 0x02, 0x03, 0x03, 0x04,
+ 0x08, 0x08, 0x08, 0x08,
+ 0x08, 0x08, 0x08, 0x08 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY2[16] = { 0x07, 0x06, 0x05, 0x04,
+ 0x03, 0x02, 0x01, 0x00,
+ 0x08, 0x08, 0x08, 0x08,
+ 0x08, 0x08, 0x08, 0x08 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY3[16] = { 0x0f, 0x0e, 0x0d, 0x0c,
+ 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04,
+ 0x03, 0x02, 0x01, 0x00 };
+
+const UWORD8 IHEVCE_SHUFFLEMASK4[16] = { 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00 };
+
+const UWORD8 IHEVCE_SHUFFLEMASK5[16] = { 0x00, 0x01, 0x08, 0x09,
+ 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f };
+// CHROMA INTRA PRED
+const UWORD8 IHEVCE_SHUFFLEMASKY7[16] = { 0x06, 0x07, 0x04, 0x05,
+ 0x02, 0x03, 0x00, 0x01,
+ 0x08, 0x08, 0x08, 0x08,
+ 0x08, 0x08, 0x08, 0x08 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY8[16] = { 0x0e, 0x0f, 0x0c, 0x0d,
+ 0x0a, 0x0b, 0x08, 0x09,
+ 0x06, 0x07, 0x04, 0x05,
+ 0x02, 0x03, 0x00, 0x01 };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY9[16] = { 0x00, 0x01, 0x04, 0x05,
+ 0x08, 0x09, 0x0c, 0x0d,
+ 0x02, 0x03, 0x06, 0x07,
+ 0x0a, 0x0b, 0x0e, 0x0f };
+
+const UWORD8 IHEVCE_SHUFFLEMASKY11[16] = { 0x01, 0x00, 0x02, 0x01,
+ 0x03, 0x02, 0x04, 0x03,
+ 0x05, 0x04, 0x06, 0x05,
+ 0x07, 0x06, 0x08, 0x07 };
+//INTRAPRED
+const UWORD8 inv_angle_shuffle[7][32] =
+{
+ { 3, 0x80, 0x80, 0x80, 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0x80, 0x80, 0x80, 0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15 },
+ { 6, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 2, 4, 5, 7, 8, 10, 11, 13, 14 },
+ { 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 4, 6, 8, 9, 11, 13, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 3, 5, 7, 8, 10, 12, 14 },
+ { 10, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 5, 7, 10, 12, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 4, 6, 9, 11, 14 },
+ { 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 4, 7, 11, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2, 5, 9, 12 },
+ { 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 6, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 3, 10 },
+ { 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0 }
+};
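+//note: entries of 0x80 have the most significant bit set, which makes
+//_mm_shuffle_epi8 write zero to that output byte; the masks above rely on
+//this to blank the lanes that have no source pixel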
+
+// DEBLOCK TABLES
+const WORD8 coef_d[16] = { 0, 1, -2, 1, 1, -2, 1, 0, 0, 1, -2, 1, 1, -2, 1, 0 };
+const WORD8 coef_de1[16] = { 3, -9, 9, -3, 3, -9, 9, -3, 3, -9, 9, -3, 3, -9, 9, -3 };
+const WORD8 coef_dep1[16] = { -2, 1, 1, -2, -2, 1, 1, -2, -2, 1, 1, -2, -2, 1, 1, -2 };
+const WORD32 shuffle_d[4] = { 0x80800403, 0x80800c0b, 0x03000704, 0x0b080f0c };
+const WORD32 shuffle0[2] = { 0x80098001, 0x800e8006 };
+const WORD32 shuffle1[4] = { 0x05040100, 0x0d0c0908, 0x07060302, 0x0f0e0b0a };
+const WORD32 shuffle2[4] = { 0x80808080, 0x03020100, 0x07060504, 0x80808080 };
+const WORD32 shuffle3[4] = { 0x80808080, 0x0b0a0908, 0x0f0e0d0c, 0x80808080 };
+
+const WORD8 delta0[16] = { 1, -4, 1, -4, 1, -4, 1, -4, 1, -4, 1, -4, 1, -4, 1, -4 };
+const WORD8 delta1[16] = { 4, -1, 4, -1, 4, -1, 4, -1, 4, -1, 4, -1, 4, -1, 4, -1 };
+const WORD32 shuffle_uv[4] = { 0x03010200, 0x0b090a08, 0x07050604, 0x0f0d0e0c };
diff --git a/common/x86/ihevc_weighted_pred_sse42_intr.c b/common/x86/ihevc_weighted_pred_sse42_intr.c
new file mode 100644
index 0000000..94a3f6d
--- /dev/null
+++ b/common/x86/ihevc_weighted_pred_sse42_intr.c
@@ -0,0 +1,2115 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_weighted_pred_sse42_intr.c
+*
+* @brief
+* Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+*
+*
+* @par List of Functions:
+* - ihevc_weighted_pred_uni_sse42()
+* - ihevc_weighted_pred_bi_sse42()
+* - ihevc_weighted_pred_bi_default_sse42()
+* - ihevc_weighted_pred_chroma_uni_sse42()
+* - ihevc_weighted_pred_chroma_bi_sse42()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_weighted_pred.h"
+#include "ihevc_inter_pred.h"
+
+#include <immintrin.h>
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does uni-weighted prediction on the array pointed by pi2_src and stores
+* it at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
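+/* For reference, a scalar sketch of the per-pixel operation that the
+   intrinsics below vectorize (CLIP_U8 clips to [0, 255]; this is an
+   illustrative restatement of the comments in the code, not a second
+   implementation):
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < wd; col++)
+           {
+               WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0;
+               i4_tmp += 1 << (shift - 1);
+               pu1_dst[row * dst_strd + col] = CLIP_U8((i4_tmp >> shift) + off0);
+           }
+*/
+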
+void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp;
+ WORD32 dst0, dst1, dst2, dst3;
+
+ /* all 128 bit registers are named with a suffix mxnb, where m is the */
+ /* number of n bits packed in the register */
+ __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
+ __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 4 == 0); /* checking assumption*/
+
+ temp = 1 << (shift - 1);
+
+ // setting values in registers
+ const_temp_4x32b = _mm_set1_epi32(temp);
+ lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
+ wgt0_4x32b = _mm_set1_epi32(wgt0);
+ off0_4x32b = _mm_set1_epi32(off0);
+
+ if(0 == (wd & 7)) /* wd multiple of 8 case */
+ {
+ __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 8)
+ { /* for row = 0, 1, 2, 3 */
+
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+ /* row = 2 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
+ /* row = 3 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
+
+ /* row = 0 */ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
+ /* row = 1 */
+ src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
+ /* row = 2 */
+ src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
+ /* row = 3 */
+ src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* First 4 pixels */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+
+ /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */
+ src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+ src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+ src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+
+ /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
+
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
+ src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+ src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
+ src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */ /* First 4 pixels */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
+ src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+ src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
+ src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
+ src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
+ src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+ src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
+ src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
+
+ /* store four 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src += 8;
+ pu1_dst += 8;
+
+ } /* inner loop ends here (8 output values per row per iteration) */
+
+ pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else /* wd multiple of 4 case */
+ {
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 4)
+ { /* for row = 0, 1, 2, 3 */
+
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+ /* row = 2 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
+ /* row = 3 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
+ src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
+ /* dst row = 1 to 3 */
+ src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
+ src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
+ src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
+ dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
+ dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
+
+ /* row = 1 to row = 3 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+ /* To update pointer */
+ pi2_src += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here (4 output values per row per iteration) */
+
+ pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on array pointed by pi2_src and stores
+* it at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
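+/* Illustrative scalar sketch (mirrors the comments in the code below):
+   the chroma source is Cb/Cr interleaved, so even columns use the Cb
+   weight/offset and odd columns the Cr weight/offset; the SIMD code gets
+   the same effect by packing {cb, cr, cb, cr} into wgt0_4x32b/off0_4x32b.
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < 2 * wd; col++)
+           {
+               WORD32 wgt = (col & 1) ? wgt0_cr : wgt0_cb;
+               WORD32 off = (col & 1) ? off0_cr : off0_cb;
+               WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt;
+               i4_tmp += 1 << (shift - 1);
+               pu1_dst[row * dst_strd + col] = CLIP_U8((i4_tmp >> shift) + off);
+           }
+*/
+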
+void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp, wdx2;
+ /* all 128 bit registers are named with a suffix mxnb, where m is the */
+ /* number of n bits packed in the register */
+
+ __m128i src_temp0_4x32b, src_temp1_4x32b;
+ __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ temp = 1 << (shift - 1);
+ wdx2 = 2 * wd;
+
+ // setting values in registers
+ const_temp_4x32b = _mm_set1_epi32(temp);
+ lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
+ wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+ off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
+
+#if 0 /* Enable this for the ht % 4 == 0 case: it degraded performance for smaller sizes while improving larger ones */
+ if( 0 == (ht & 3)) /* ht multiple of 4 case */
+ {
+ if( 0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+ {
+ __m128i src_temp2_4x32b, src_temp3_4x32b;
+ __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+ __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
+ __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row +=4)
+ {
+ for(col = 0; col < wdx2; col +=16)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
+ /* row = 0 */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
+ /* row = 1 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
+ /* row = 0 */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8));
+ /* row = 1 */
+ src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8));
+ /* row = 0 */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12));
+ /* row = 1 */
+ src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
+ src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
+ src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+ src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
+ src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
+ src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
+ src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
+ src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
+ src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
+ src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
+ src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
+ src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b);
+ src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b);
+ src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b);
+ src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b);
+ src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b);
+
+ /* store 16 8-bit output values */
+ _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
+ _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
+
+ /* row = 2 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
+ /* row = 3 */
+ src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
+ /* row = 2 */ /* Second 4 pixels */
+ src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
+ /* row = 3 */
+ src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
+ /* row = 2 */ /* Third 4 pixels */
+ src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8));
+ /* row = 3 */
+ src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8));
+ /* row = 2 */ /* Last 4 pixels */
+ src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12));
+ /* row = 3 */
+ src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
+ src_temp9_4x32b = _mm_cvtepi16_epi32(src_temp9_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b);
+ src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp8_4x32b = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b);
+ src_temp9_4x32b = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
+ src_temp10_4x32b = _mm_cvtepi16_epi32(src_temp10_4x32b);
+ src_temp11_4x32b = _mm_cvtepi16_epi32(src_temp11_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b);
+ src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp10_4x32b = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b);
+ src_temp11_4x32b = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
+ src_temp12_4x32b = _mm_cvtepi16_epi32(src_temp12_4x32b);
+ src_temp13_4x32b = _mm_cvtepi16_epi32(src_temp13_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b);
+ src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp12_4x32b = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b);
+ src_temp13_4x32b = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+ src_temp14_4x32b = _mm_cvtepi16_epi32(src_temp14_4x32b);
+ src_temp15_4x32b = _mm_cvtepi16_epi32(src_temp15_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b);
+ src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp14_4x32b = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b);
+ src_temp15_4x32b = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b);
+ src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b, shift);
+ src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
+ src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b);
+ src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b, shift);
+ src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
+ src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b);
+ src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b, shift);
+ src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+ src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b);
+ src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b, shift);
+ src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b);
+ src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+ src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b);
+ src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+ src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b);
+ src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b);
+ src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b);
+
+ src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b);
+ src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b);
+ src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b);
+ src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b);
+ src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b);
+
+ /* store 16 8-bit output values */
+ _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/
+ _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/
+
+ pi2_src += 16; /* Pointer update */
+ pu1_dst += 16; /* Pointer update */
+
+ } /* inner loop ends here (16 output values per row per iteration) */
+ pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
+ }
+ }
+ else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+ {
+ __m128i src_temp2_4x32b,src_temp3_4x32b;
+ __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row +=4)
+ {
+ for(col = 0; col < wdx2; col +=8)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
+ /* row = 2 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
+ /* row = 3 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
+
+ /* row = 0 */ /* Last 4 pixels */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
+ /* row = 1 */
+ src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
+ /* row = 2 */
+ src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
+ /* row = 3 */
+ src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
+ src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
+ src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+ src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
+ src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
+ src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
+ src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
+ src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
+ src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
+ src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
+ src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
+ src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b);
+ src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b);
+ src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b);
+ src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b);
+ src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b);
+ src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b);
+ src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b);
+
+ /* store four 8-bit output values */
+ _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
+ _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 2*/
+ _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 3*/
+
+ pi2_src += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+ } /* inner loop ends here (8 output values per row per iteration) */
+ pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
+ }
+ }
+ else /* 2*wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+ __m128i src_temp2_4x32b,src_temp3_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row +=4)
+ {
+ for(col = 0; col < wdx2; col +=4)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd));
+ /* row = 2 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
+ /* row = 3 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
+
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b);
+ src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
+ /* dst row = 1 to 3 */
+ src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1);
+ src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2);
+ src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3);
+
+ /* store four 8-bit output values */
+ *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
+ dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
+ dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
+ /* row = 1 */
+ *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
+ /* row = 2 */
+ *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
+ /* row = 3 */
+ *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;
+
+ pi2_src += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here (4 output values per row per iteration) */
+ pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
+ }
+ }
+ }
+ else /* ht multiple of 2 case */
+#endif
+
+ {
+ if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+ {
+ __m128i src_temp2_4x32b, src_temp3_4x32b;
+ __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 16)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+ /* row = 0 */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
+ /* row = 1 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
+ /* row = 0 */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
+ /* row = 1 */
+ src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
+ /* row = 0 */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
+ /* row = 1 */
+ src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
+ src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+ src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
+ src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
+ src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
+ src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+ src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
+ src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);
+
+ /* store 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
+
+ pi2_src += 16; /* Pointer update */
+ pu1_dst += 16; /* Pointer update */
+
+ } /* inner loop ends here (16 output values per row per iteration) */
+ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+ else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+ {
+ __m128i src_temp2_4x32b, src_temp3_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+ /* row = 0 */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
+ /* row = 1 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+
+ /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+
+ /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+
+ /* store four 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
+
+ pi2_src += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+ } /* inner loop ends here (8 output values per row per iteration) */
+ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+ else /* 2*wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+ /* sign-extend the low four 16-bit pixels (3:0) to 32 bit */
+ src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+
+ /* (pi2_src[col] + lvl_shift)*/
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
+
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
+ src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+
+ /* i4_tmp += 1 << (shift - 1) */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */
+ src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
+
+ src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
+ /* dst row = 1 */
+ src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ pi2_src += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here(4-output values in single iteration) */
+ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+ }
+}
+
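+/* Note: a minimal scalar sketch of the uni-weighted formula that the SIMD
+ * kernels above vectorize; the helper name and its form are illustrative
+ * only and are not part of the decoder interface. For chroma, wgt0/off0
+ * simply alternate between the Cb and Cr values per interleaved sample. */
+static void weighted_pred_uni_scalar_sketch(WORD16 *pi2_src, UWORD8 *pu1_dst,
+                                            WORD32 src_strd, WORD32 dst_strd,
+                                            WORD32 wgt0, WORD32 off0,
+                                            WORD32 shift, WORD32 lvl_shift,
+                                            WORD32 ht, WORD32 wd)
+{
+    WORD32 row, col;
+    for(row = 0; row < ht; row++)
+    {
+        for(col = 0; col < wd; col++)
+        {
+            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0; /* weight */
+            i4_tmp += 1 << (shift - 1);                        /* rounding */
+            i4_tmp = (i4_tmp >> shift) + off0;                 /* scale, offset */
+            /* CLIP_U8 */
+            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
+        }
+        pi2_src += src_strd;
+        pu1_dst += dst_strd;
+    }
+}
+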
+/**
+*******************************************************************************
+*
+* @brief
+* Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied with source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied with source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 wgt1,
+ WORD32 off1,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp;
+
+ __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
+ __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
+
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ temp = (off0 + off1 + 1) << (shift - 1);
+
+ // setting values in registers
+ const_temp_4x32b = _mm_set1_epi32(temp);
+ lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
+ lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
+ wgt0_4x32b = _mm_set1_epi32(wgt0);
+ wgt1_4x32b = _mm_set1_epi32(wgt1);
+
+ if(0 == (wd & 7)) /* wd multiple of 8 case */
+ {
+ __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+ /* Next 4 pixels */
+ src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
+ src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
+ src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
+ src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
+
+ /* sign-extend the low four 16-bit pixels (3:0) to 32 bit */
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ /* (pi2_src1[col] + lvl_shift1) */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+ /* (pi2_src2[col] + lvl_shift2) */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+ /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+ /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+ src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+ /* Next 4 Pixels */
+ src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+ src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
+ src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+ src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
+ src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+ src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
+ src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
+ src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+ src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /* Next 4 Pixels */
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+ src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+ src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
+ src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+ src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
+
+ /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
+
+ pi2_src1 += 8; /* Pointer update */
+ pi2_src2 += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+ } /* inner loop ends here(8 output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
+
+ } /* outer loop ends */
+ }
+ else /* wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+ /* sign-extend the low four 16-bit pixels (3:0) to 32 bit */
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ /* (pi2_src1[col] + lvl_shift1) */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+ /* (pi2_src2[col] + lvl_shift2) */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+ /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+ /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+ src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
+
+ /* dst row = 1 */
+ src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
+
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ pi2_src1 += 4; /* Pointer update */
+ pi2_src2 += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
+
+ } /* outer loop ends */
+ }
+
+}
+
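+/* Note: a minimal scalar sketch (illustrative name, not part of the library
+ * API) of the per-pixel bi-weighted blend that the function above computes
+ * four or eight lanes at a time. */
+static UWORD8 weighted_pred_bi_pixel_sketch(WORD16 src1, WORD16 src2,
+                                            WORD32 wgt0, WORD32 wgt1,
+                                            WORD32 off0, WORD32 off1,
+                                            WORD32 shift,
+                                            WORD32 lvl_shift1,
+                                            WORD32 lvl_shift2)
+{
+    WORD32 i4_tmp = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1;
+    i4_tmp += (off0 + off1 + 1) << (shift - 1); /* combined offsets + rounding */
+    i4_tmp >>= shift;
+    return (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp)); /* CLIP_U8 */
+}
+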
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
+* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied with source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied with source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 wgt1_cb,
+ WORD32 wgt1_cr,
+ WORD32 off1_cb,
+ WORD32 off1_cr,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp1, temp2;
+ WORD32 wdx2;
+
+ __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
+ __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
+
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
+ temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
+
+ // setting values in registers
+ const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
+ lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
+ lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
+ wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+ wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
+
+ wdx2 = wd * 2;
+
+ if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
+ {
+ __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+ /* Next 4 pixels */
+ src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
+ src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
+ src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
+ src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
+
+ /* sign-extend the low four 16-bit pixels (3:0) to 32 bit */
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ /* (pi2_src1[col] + lvl_shift1) */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+ /* (pi2_src2[col] + lvl_shift2) */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+ /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+ /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+ src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+ /* Next 4 Pixels */
+ src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
+ src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
+ src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
+ src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
+ src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
+ src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
+ src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
+ src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
+ src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
+ src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ /* Next 4 Pixels */
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
+ src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
+ src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
+ src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
+ src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
+
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
+ src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+ src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
+
+ /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
+
+ pi2_src1 += 8; /* Pointer update */
+ pi2_src2 += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+ } /* inner loop ends here(8 output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+ } /* outer loop ends */
+ }
+ else /* wdx2 multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+ /* sign-extend the low four 16-bit pixels (3:0) to 32 bit */
+ src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
+ src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
+ /* (pi2_src1[col] + lvl_shift1) */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
+ /* (pi2_src2[col] + lvl_shift2) */
+ src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
+ /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
+ src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
+ /*(pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
+
+ src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
+ src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
+ src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
+ src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
+ src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
+
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
+ src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
+ src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
+
+ src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
+
+ /* dst row = 1 */
+ src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
+
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ pi2_src1 += 4; /* Pointer update */
+ pi2_src2 += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+
+}
+
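+/* Note: an illustrative sketch (not part of the decoder) of why the chroma
+ * weight/offset vectors above are built with alternating Cb/Cr lanes: chroma
+ * samples are stored interleaved as Cb,Cr,Cb,Cr..., so one 4x32-bit multiply
+ * weights two pixels of each component at once. */
+static void chroma_weight_lane_layout_sketch(WORD32 wgt0_cb, WORD32 wgt0_cr)
+{
+    WORD32 lanes[4];
+    __m128i wgt = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+    _mm_storeu_si128((__m128i *)lanes, wgt);
+    /* lanes[] now holds { wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr }, matching the
+     * Cb0,Cr0,Cb1,Cr1 memory order of the interleaved source samples. */
+}
+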
+/**
+*******************************************************************************
+*
+* @brief
+* Does default bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift, where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+* Assumption : ht%2 == 0, wd%4 == 0
+* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
+* final result will match even if intermediate precision is in 16 bit.
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp;
+ WORD32 shift;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ temp = 1 << (shift - 1);
+
+ // setting values in registers
+ lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
+ lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
+ const_temp_8x16b = _mm_set1_epi16(temp);
+
+ lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
+ lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
+
+ if(0 == (ht & 3)) /* ht multiple of 4*/
+ {
+ if(0 == (wd & 15)) /* wd multiple of 16 case */
+ {
+ __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+ __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 16)
+ {
+ /*load 8 pixel values */ /* First 8 Values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+ src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+ /* row = 1 */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+ /* row = 2 */
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ /* row = 3 */
+ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
+ src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
+ src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ /* (i4_tmp >> shift) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+ src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+ src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
+ src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
+ src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8-bit values */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
+ src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
+
+ /* store sixteen 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src1 += 16;
+ pi2_src2 += 16;
+ pu1_dst += 16;
+
+ } /* inner loop ends here(16 output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else if(0 == (wd & 7)) /* multiple of 8 case */
+ {
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 8 pixel values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+ src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+ src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+ /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src1 += 8;
+ pi2_src2 += 8;
+ pu1_dst += 8;
+
+ } /* inner loop ends here(8-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else /* wd multiple of 4 case*/
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /* Pack two rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ /* dst row = 1 to 3 */
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+ src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
+ dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+ /* row = 1 to row = 3 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+ /* To update pointer */
+ pi2_src1 += 4;
+ pi2_src2 += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ }
+ else /* ht multiple of 2 case and wd multiple of 4 case*/
+ {
+
+ WORD32 dst0, dst1;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+
+ /* Pack two rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ /* dst row = 1 */
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ /* To update pointer */
+ pi2_src1 += 4;
+ pi2_src2 += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
+
+ }
+
+ }
+
+}
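+
+/* Note: an illustrative per-pixel sketch (not part of the decoder) of the
+ * 16-bit shortcut used above: with shift == 7 and (lvl_shift1 + lvl_shift2)
+ * restricted to {0, 8K, 16K}, the accumulation fits the saturating 16-bit
+ * adds, so the result matches a full 32-bit evaluation. */
+static UWORD8 bi_default_pixel_sketch(WORD16 src1, WORD16 src2,
+                                      WORD32 lvl_shift1, WORD32 lvl_shift2)
+{
+    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1; /* == 7 for 8-bit content */
+    WORD32 i4_tmp = src1 + src2;                 /* the _mm_adds_epi16 above */
+    i4_tmp += lvl_shift1 + lvl_shift2 + (1 << (shift - 1)); /* folded constant */
+    i4_tmp >>= shift;
+    return (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp)); /* CLIP_U8 */
+}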
diff --git a/common/x86/ihevc_weighted_pred_ssse3_intr.c b/common/x86/ihevc_weighted_pred_ssse3_intr.c
new file mode 100644
index 0000000..b8778a3
--- /dev/null
+++ b/common/x86/ihevc_weighted_pred_ssse3_intr.c
@@ -0,0 +1,2386 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevc_weighted_pred_ssse3_intr.c
+*
+* @brief
+* Contains function definitions for weighted prediction used in inter
+* prediction
+*
+* @author
+*
+*
+* @par List of Functions:
+* - ihevc_weighted_pred_uni_ssse3()
+* - ihevc_weighted_pred_bi_ssse3()
+* - ihevc_weighted_pred_bi_default_ssse3()
+* - ihevc_weighted_pred_chroma_uni_ssse3()
+* - ihevc_weighted_pred_chroma_bi_ssse3()
+* - ihevc_weighted_pred_chroma_bi_default_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <assert.h>
+
+#include "ihevc_debug.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_func_selector.h"
+#include "ihevc_defs.h"
+#include "ihevc_weighted_pred.h"
+#include "ihevc_inter_pred.h"
+
+
+#include <immintrin.h>
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does uni-weighted prediction on the array pointed to by pi2_src and stores
+* the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied with the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp;
+
+ /* all 128 bit registers are named with a suffix mxnb, where m is the */
+ /* number of n-bit values packed in the register */
+ __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
+ __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
+ __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 4 == 0); /* checking assumption*/
+
+ temp = 1 << (shift - 1);
+
+ // setting values in registers
+ lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
+ wgt0_8x16b = _mm_set1_epi16(wgt0);
+
+ /* lvl_shift * wgt0 */
+ res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
+ res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
+
+ const_temp_4x32b = _mm_set1_epi32(temp);
+ off0_4x32b = _mm_set1_epi32(off0);
+
+
+ /* lvl_shift * wgt0 */
+ lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
+ /* lvl_shift * wgt0 + (1 << (shift - 1)) */
+ lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
+
+ if(0 == (wd & 7)) /* wd multiple of 8 case */
+ {
+ __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 8)
+ { /* for row = 0, 1, 2, 3 */
+
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+ /* row = 2 */
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
+ /* row = 3 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
+
+ /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
+ res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+ /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+ src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
+ src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+ res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
+
+ res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+ res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
+
+ /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + (1 << (shift - 1)) */
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
+ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+
+ /* (i4_tmp >> shift) */ /* First 4 pixels */
+ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ /* (i4_tmp >> shift) */ /* Last 4 pixels */
+ res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
+ res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
+ res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
+ res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
+ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
+
+ res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
+ res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
+ res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+ res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
+ res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
+
+ /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src += 8;
+ pu1_dst += 8;
+
+ } /* inner loop ends here(8 output values in single iteration) */
+
+ pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else /* wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 4)
+ { /* for row = 0, 1, 2, 3 */
+
+ /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+ /* row = 2 */
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
+ /* row = 3 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));
+
+ /* 2 rows together */
+ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+
+ /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
+ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+ res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+ /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + (1 << (shift - 1)) */
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+ res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
+ res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
+ /* dst row = 1 to 3 */
+ res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
+ res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
+ res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
+ dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
+ dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);
+
+ /* row = 1 to row = 3 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+ /* To update pointer */
+ pi2_src += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+}
+
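+/* Note: SSSE3 has no packed 32-bit multiply, so the kernels in this file
+ * widen the 16x16-bit multiply with mullo/mulhi plus an unpack; a standalone
+ * sketch (illustrative name) producing the full 32-bit products of the low
+ * four 16-bit lanes: */
+static __m128i widen_mul_lo4_sketch(__m128i a_8x16b, __m128i b_8x16b)
+{
+    __m128i lo = _mm_mullo_epi16(a_8x16b, b_8x16b); /* low 16 bits of products */
+    __m128i hi = _mm_mulhi_epi16(a_8x16b, b_8x16b); /* high 16 bits (signed)  */
+    /* interleave low/high halves -> 32-bit products of lanes 3:0 */
+    return _mm_unpacklo_epi16(lo, hi);
+}
+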
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma uni-weighted prediction on the array pointed to by pi2_src and
+* stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
+* offset
+*
+* @param[in] pi2_src
+* Pointer to the source
+*
+* @param[out] pu1_dst
+* Pointer to the destination
+*
+* @param[in] src_strd
+* Source stride
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied with the source
+*
+* @param[in] off0
+* offset to be added after rounding and shifting
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 shift,
+ WORD32 lvl_shift,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp, wdx2;
+ /* all 128 bit registers are named with a suffix mxnb, where m is the */
+ /* number of n-bit values packed in the register */
+
+ __m128i src_temp0_8x16b, src_temp1_8x16b;
+ __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
+ __m128i res_temp0_4x32b, res_temp1_4x32b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ temp = 1 << (shift - 1);
+ wdx2 = 2 * wd;
+
+ // setting values in registers
+ lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
+ wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+
+ /* lvl_shift * wgt0 */
+ res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
+ res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
+
+ const_temp_4x32b = _mm_set1_epi32(temp);
+ off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
+
+ /* lvl_shift * wgt0 */
+ lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
+ /* lvl_shift * wgt0 + (1 << (shift - 1)) */
+ lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
+
+ {
+ if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+ {
+ __m128i src_temp2_8x16b, src_temp3_8x16b;
+ __m128i res_temp2_4x32b, res_temp3_4x32b;
+ __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 16)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+ /* row = 0 */ /* Next 8 pixels */
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
+
+ /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ res_temp4_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
+ res_temp5_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+ /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+ src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
+ src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
+ res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);
+
+ res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
+ res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);
+
+ /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + (1 << (shift - 1)) */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
+ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
+ res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
+ res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
+ res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
+
+ res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+ res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
+ res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);
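+                /* packs_epi32 saturates the signed 32-bit results to 16 bit and
+                 * packus_epi16 then saturates to unsigned 8 bit, so the two
+                 * packs together realize the CLIP_U8() of the reference code. */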
+
+ /* store 16 8-bit output values */
+ _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
+ _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
+
+ pi2_src += 16; /* Pointer update */
+ pu1_dst += 16; /* Pointer update */
+
+            } /* inner loop ends here (16 output values per row in a single iteration) */
+ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+ else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+ {
+ __m128i res_temp2_4x32b, res_temp3_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
+
+ /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+ res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+
+                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + (1 << (shift - 1)) */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+ /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
+
+ res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+
+                /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
+
+ pi2_src += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+            } /* inner loop ends here (8 output values per row in a single iteration) */
+ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+ else /* 2*wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+                /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
+ /* row = 1 */
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
+
+ /* 2 rows together */
+ src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
+
+ /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
+ res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
+ /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
+ src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
+ res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
+
+                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + (1 << (shift - 1)) */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+
+ /*i4_tmp = (i4_tmp >> shift) + off0; */
+ res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
+
+ res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp); */
+ res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
+                /* dst row = 1 */
+ res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ pi2_src += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here(4-output values in single iteration) */
+ pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+ }
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + ((off0 +
+* off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied to source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
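+/* A minimal scalar sketch of the computation above (illustrative only, not
+   part of the build; CLIP_U8 is the clipping macro used elsewhere in this
+   library):
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < wd; col++)
+           {
+               WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
+                             + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1
+                             + ((off0 + off1 + 1) << (shift - 1));
+               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
+           }
+*/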
+
+
+void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0,
+ WORD32 off0,
+ WORD32 wgt1,
+ WORD32 off1,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
+ __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
+
+    ASSERT(wd % 4 == 0); /* checking assumption*/
+    ASSERT(ht % 4 == 0); /* checking assumption*/
+
+ temp = (off0 + off1 + 1) << (shift - 1);
+
+    // setting values in registers
+ lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
+ wgt0_8x16b = _mm_set1_epi16(wgt0);
+ lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
+ wgt1_8x16b = _mm_set1_epi16(wgt1);
+
+ /* lvl_shift1 * wgt0 */
+ res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+ /* lvl_shift2 * wgt1 */
+ res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+ res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+
+ const_temp_4x32b = _mm_set1_epi32(temp);
+
+ /* lvl_shift1 * wgt0 */
+ lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
+ /* lvl_shift2 * wgt1 */
+ lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
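+    /* lvl_shift1 * wgt0 and lvl_shift2 * wgt1 are constant over the block, so
+     * they are widened to 32-bit lanes once here (mullo/mulhi + unpacklo) and
+     * simply added to every per-pixel product inside the loops below. */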
+
+ if(0 == (wd & 7)) /* wd multiple of 8 case */
+ {
+ __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+ /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+ res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+ res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
+ /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+ src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+ src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+ src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+ res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
+ res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+ res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
+ res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+ /* (pi2_src[col] + lvl_shift) * wgt */
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
+ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
+ res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ /* Next 4 Pixels */
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
+ res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
+ res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
+
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
+ res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+ res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
+
+                    /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
+
+ pi2_src1 += 8; /* Pointer update */
+ pi2_src2 += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+            } /* inner loop ends here (8 output values per row in a single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
+
+ } /* outer loop ends */
+ }
+ else /* wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+ /* 2 rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+ /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+ /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+ src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+ /* (pi2_src[col] + lvl_shift) * wgt */
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
+
+                    /* dst row = 1 */
+ res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
+
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ pi2_src1 += 4; /* Pointer update */
+ pi2_src2 += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
+
+ } /* outer loop ends */
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + ((off0 +
+* off1 + 1) << (shift - 1)) ) >> shift
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] wgt0
+* weight to be multiplied to source 1
+*
+* @param[in] off0
+* offset 0
+*
+* @param[in] wgt1
+* weight to be multiplied to source 2
+*
+* @param[in] off1
+* offset 1
+*
+* @param[in] shift
+* (14 - bit depth) + log2_weight_denominator
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
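+/* A minimal scalar sketch of the computation above for interleaved Cb/Cr
+   (illustrative only, not part of the build; CLIP_U8 as elsewhere in this
+   library):
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < 2 * wd; col++)   /* Cb and Cr samples alternate */
+           {
+               WORD32 w0  = (col & 1) ? wgt0_cr : wgt0_cb;
+               WORD32 w1  = (col & 1) ? wgt1_cr : wgt1_cb;
+               WORD32 off = (col & 1) ? (off0_cr + off1_cr) : (off0_cb + off1_cb);
+               WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * w0
+                             + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * w1
+                             + ((off + 1) << (shift - 1));
+               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
+           }
+*/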
+
+
+void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 wgt0_cb,
+ WORD32 wgt0_cr,
+ WORD32 off0_cb,
+ WORD32 off0_cr,
+ WORD32 wgt1_cb,
+ WORD32 wgt1_cr,
+ WORD32 off1_cb,
+ WORD32 off1_cr,
+ WORD32 shift,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp1, temp2;
+ WORD32 wdx2;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
+ __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
+ temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
+
+    // setting values in registers
+ lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
+ wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
+ lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
+ wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
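+    /* Chroma samples are stored interleaved (Cb0 Cr0 Cb1 Cr1 ...), so the Cb
+     * and Cr weights are replicated in alternating 16-bit lanes to line up
+     * with the data; the Cb/Cr rounding terms alternate the same way in
+     * const_temp_4x32b below. */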
+
+ /* lvl_shift1 * wgt0 */
+ res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
+ /* lvl_shift2 * wgt1 */
+ res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+ res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
+
+ const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
+ wdx2 = wd * 2;
+
+ /* lvl_shift1 * wgt0 */
+ lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
+ /* lvl_shift2 * wgt1 */
+ lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
+
+ if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
+ {
+ __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+ /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+ res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
+ res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
+ /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+ src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+ src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
+ src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+ res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
+ res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+ res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
+ res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
+
+ /* (pi2_src[col] + lvl_shift) * wgt */
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
+ res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
+ res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+ /* (i4_tmp >> shift) */
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ /* Next 4 Pixels */
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
+ res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
+ res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
+ res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
+ res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
+
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
+ res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+ res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
+
+                    /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
+
+ pi2_src1 += 8; /* Pointer update */
+ pi2_src2 += 8; /* Pointer update */
+ pu1_dst += 8; /* Pointer update */
+
+            } /* inner loop ends here (8 output values per row in a single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+ } /* outer loop ends */
+ }
+ else /* wdx2 multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
+
+ /* 2 rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+ /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
+ res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
+ res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
+ /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
+ src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
+ src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
+
+ /* Get 32 bit Result */
+ res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+ res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
+ res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
+
+ /* (pi2_src[col] + lvl_shift) * wgt */
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
+ res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
+ res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
+
+ /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
+
+ /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
+ res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
+ res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
+
+ /* (i4_tmp >> shift) */
+ res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
+ res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
+
+ res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
+
+ dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
+
+                    /* dst row = 1 */
+ res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
+
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ pi2_src1 += 4; /* Pointer update */
+ pi2_src2 += 4; /* Pointer update */
+ pu1_dst += 4; /* Pointer update */
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+ }
+ }
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does default bi-weighted prediction on the arrays pointed to by pi2_src1 and
+* pi2_src2 and stores the result at the location pointed to by pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source
+*
+* @returns
+*
+* @remarks
+* None
+*
+* Assumption : ht%2 == 0, wd%4 == 0
+* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
+* final result will match even if intermediate precision is in 16 bit.
+*
+*******************************************************************************
+*/
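+/* A minimal scalar sketch of the computation above (illustrative only, not
+   part of the build):
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < wd; col++)
+           {
+               WORD32 i4_tmp = pi2_src1[row * src_strd1 + col] + lvl_shift1
+                             + pi2_src2[row * src_strd2 + col] + lvl_shift2
+                             + (1 << (shift - 1));
+               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
+           }
+*/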
+void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ {
+ WORD32 row, col, temp;
+ WORD32 shift;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ ASSERT(wd % 4 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ temp = 1 << (shift - 1);
+
+        // setting values in registers
+ lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
+ lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
+ const_temp_8x16b = _mm_set1_epi16(temp);
+
+ lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
+ lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
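+        /* Both level shifts and the rounding constant are folded into one
+         * 16-bit addend; the saturating adds keep the 16-bit intermediates
+         * safe, which holds under the assumption stated above (shift == 7 and
+         * lvl_shift1 + lvl_shift2 in {0, 8K, 16K}). */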
+
+ if(0 == (ht & 3)) /* ht multiple of 4*/
+ {
+ if(0 == (wd & 15)) /* wd multiple of 16 case */
+ {
+ __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+ __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 16)
+ {
+ /*load 8 pixel values */ /* First 8 Values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+ src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+ /* row = 1 */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+ /* row = 2 */
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ /* row = 3 */
+ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
+ src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
+ src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ /* (i4_tmp >> shift) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+ src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+ src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
+ src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
+ src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);
+
+                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* all 16 values */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
+ src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
+
+                        /* store sixteen 8-bit output values */
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                        _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src1 += 16;
+ pi2_src2 += 16;
+ pu1_dst += 16;
+
+                    } /* inner loop ends here (16 output values per row in a single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else if(0 == (wd & 7)) /* multiple of 8 case */
+ {
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 8)
+ {
+ /*load 8 pixel values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+ src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+ src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+                    /* store eight 8-bit output values */
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src1 += 8;
+ pi2_src2 += 8;
+ pu1_dst += 8;
+
+ } /* inner loop ends here(8-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else /* wd multiple of 4 case*/
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /* Pack two rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                    /* dst rows 1 and 3 */
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+ src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
+ dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+ /* row = 1 to row = 3 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+ /* To update pointer */
+ pi2_src1 += 4;
+ pi2_src2 += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ }
+ else /* ht multiple of 2 case and wd multiple of 4 case*/
+ {
+
+ WORD32 dst0, dst1;
+
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wd; col += 4)
+ {
+                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
+                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+
+ /* Pack two rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+                /* dst row = 1 */
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+
+                /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ /* To update pointer */
+ pi2_src1 += 4;
+ pi2_src2 += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
+
+ }
+
+ }
+
+ }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Does chroma default bi-weighted prediction on the arrays pointed to by
+* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
+* pu1_dst
+*
+* @par Description:
+* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
+* >> shift where shift = 15 - BitDepth
+*
+* @param[in] pi2_src1
+* Pointer to source 1
+*
+* @param[in] pi2_src2
+* Pointer to source 2
+*
+* @param[out] pu1_dst
+* Pointer to destination
+*
+* @param[in] src_strd1
+* Source stride 1
+*
+* @param[in] src_strd2
+* Source stride 2
+*
+* @param[in] dst_strd
+* Destination stride
+*
+* @param[in] lvl_shift1
+* added before shift and offset
+*
+* @param[in] lvl_shift2
+* added before shift and offset
+*
+* @param[in] ht
+* height of the source
+*
+* @param[in] wd
+* width of the source (each colour component)
+*
+* @returns
+*
+* @remarks
+* None
+*
+* Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1 == 0, lvl_shift2 == 0,
+* shift == 7. Since both level shifts are zero, the final result matches
+* even though the intermediate precision is 16 bit.
+*******************************************************************************
+*/
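+/* A minimal scalar sketch of the computation above (illustrative only, not
+   part of the build; lvl_shift1 and lvl_shift2 are zero per the assumption
+   above):
+
+       for(row = 0; row < ht; row++)
+           for(col = 0; col < 2 * wd; col++)   /* Cb and Cr samples alternate */
+           {
+               WORD32 i4_tmp = pi2_src1[row * src_strd1 + col]
+                             + pi2_src2[row * src_strd2 + col]
+                             + (1 << (shift - 1));
+               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
+           }
+*/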
+
+void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
+ WORD16 *pi2_src2,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd1,
+ WORD32 src_strd2,
+ WORD32 dst_strd,
+ WORD32 lvl_shift1,
+ WORD32 lvl_shift2,
+ WORD32 ht,
+ WORD32 wd)
+{
+ WORD32 row, col, temp;
+ WORD32 shift, wdx2;
+
+ __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
+ __m128i lvl_shift1_8x16b;
+ __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
+
+ ASSERT(wd % 2 == 0); /* checking assumption*/
+ ASSERT(ht % 2 == 0); /* checking assumption*/
+ UNUSED(lvl_shift1);
+ UNUSED(lvl_shift2);
+ shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
+ temp = 1 << (shift - 1);
+ wdx2 = wd * 2;
+
+    // setting values in registers
+ lvl_shift1_8x16b = _mm_set1_epi16(temp);
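+    /* With both level shifts assumed zero, only the rounding constant needs
+     * to be added per pixel; it is replicated across all eight 16-bit lanes. */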
+
+ if(0 == (ht & 3)) /* ht multiple of 4 case */
+ {
+ if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+ {
+ __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+ __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 16)
+ {
+ /*load 8 pixel values */ /* First 8 Values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+ src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+ /* row = 1 */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+ /* row = 2 */
+ src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
+ src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ /* row = 3 */
+ src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
+ src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+ src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
+ src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
+
+ /* (i4_tmp >> shift) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+ src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+ src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
+ src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+ src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+ /* (i4_tmp >> shift) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
+ src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
+ src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);
+
+ /* store eight 8-bit output values */ /* First 8 Values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
+ src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
+ src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);
+
+ /* store eight 8-bit output values */ /* Second 8 Values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src1 += 16;
+ pi2_src2 += 16;
+ pu1_dst += 16;
+
+ } /* inner loop ends here (16 output values per row per iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+ {
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /*load 8 pixel values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+ src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+ src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+ src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
+
+ /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
+
+ /* To update pointer */
+ pi2_src1 += 8;
+ pi2_src2 += 8;
+ pu1_dst += 8;
+
+ } /* inner loop ends here(8-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else /* 2*wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1, dst2, dst3;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 4)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+ /*load 4 16-bit pixel values from source 1 */
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+ /*load 4 16-bit pixel values from source 2 */
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+ /* row = 2 */
+ src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
+ src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
+ /* row = 3 */
+ src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
+ src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
+
+ /* Pack two rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+ src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
+ src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ /* dst row = 1 to 3 */
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+ src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
+ dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
+
+ /* row = 1 to row = 3 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+ *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
+ *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
+
+ /* To update pointer */
+ pi2_src1 += 4;
+ pi2_src2 += 4;
+ pu1_dst += 4;
+
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
+
+ }
+ }
+ }
+ else /* ht multiple of 2 case */
+ {
+ if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
+ {
+ __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 16)
+ {
+ /*load 8 pixel values */ /* First 8 Values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+
+ /*load 8 pixel values */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
+ src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
+ /* row = 1 */
+ src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
+ src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
+
+ /* (i4_tmp >> shift) */ /* First 8 Values */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
+ src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+
+ /* (i4_tmp >> shift) */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
+ src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
+
+ /* store eight 8-bit output values */ /* First 8 Values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
+ src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
+ src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
+
+ /* store eight 8-bit output values */ /* Second 8 Values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
+
+ /* To update pointer */
+ pi2_src1 += 16;
+ pi2_src2 += 16;
+ pu1_dst += 16;
+
+ } /* inner loop ends here (16 output values per row per iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
+ {
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 8)
+ {
+ /*load 8 pixel values */
+ src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
+ src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
+
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+ src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
+
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+ src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
+
+ /* store eight 8-bit output values */
+ _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
+ _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
+
+ /* To update pointer */
+ pi2_src1 += 8;
+ pi2_src2 += 8;
+ pu1_dst += 8;
+
+ } /* inner loop ends here(8-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+ }
+ }
+ else /* 2*wd multiple of 4 case */
+ {
+ WORD32 dst0, dst1;
+ /* outer for loop starts from here */
+ for(row = 0; row < ht; row += 2)
+ {
+ for(col = 0; col < wdx2; col += 4)
+ {
+ /*load 4 16-bit pixel values from source 1 */
+ src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
+ /*load 4 16-bit pixel values from source 2 */
+ src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
+ /* row = 1 */
+ src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
+ src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
+
+ /* Pack two rows together */
+ src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
+ src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
+
+ /* (pi2_src1[col] + pi2_src2[col]) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
+ /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
+ src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
+
+ /* (i4_tmp >> shift) */
+ src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
+ /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
+ src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
+
+ dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
+ /* dst row = 1 */
+ src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
+
+ /* store four 8-bit output values */
+ *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
+
+ dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
+ /* row = 1 */
+ *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
+
+ /* To update pointer */
+ pi2_src1 += 4;
+ pi2_src2 += 4;
+ pu1_dst += 4;
+ } /* inner loop ends here(4-output values in single iteration) */
+
+ pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
+ pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
+ pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
+
+ }
+ }
+ }
+}
diff --git a/decoder.arm.mk b/decoder.arm.mk
new file mode 100644
index 0000000..903822d
--- /dev/null
+++ b/decoder.arm.mk
@@ -0,0 +1,88 @@
+libhevcd_inc_dir_arm += $(LOCAL_PATH)/decoder/arm
+libhevcd_inc_dir_arm += $(LOCAL_PATH)/common/arm
+
+libhevcd_srcs_c_arm += decoder/arm/ihevcd_function_selector.c
+libhevcd_srcs_c_arm += decoder/arm/ihevcd_function_selector_noneon.c
+libhevcd_cflags_arm += -DDISABLE_NEONINTR -DARM -DARMGCC
+
+LOCAL_ARM_MODE := arm
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+libhevcd_srcs_c_arm += decoder/arm/ihevcd_function_selector_a9q.c
+libhevcd_srcs_c_arm += common/arm/ihevc_intra_ref_substitution_a9q.c
+libhevcd_srcs_c_arm += common/arm/ihevc_intra_pred_filters_neon_intr.c
+libhevcd_srcs_c_arm += common/arm/ihevc_weighted_pred_neon_intr.c
+
+libhevcd_srcs_asm_arm += common/arm/ihevc_mem_fns.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_itrans_recon_32x32.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_weighted_pred_bi_default.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_weighted_pred_bi.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_weighted_pred_uni.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_deblk_luma_horz.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_deblk_luma_vert.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_deblk_chroma_vert.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_deblk_chroma_horz.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_band_offset_luma.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_band_offset_chroma.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class0.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class0_chroma.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class1.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class1_chroma.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class2.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class2_chroma.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class3.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_sao_edge_offset_class3_chroma.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_luma_horz_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_filters_luma_horz.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_filters_luma_vert.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_horz.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_horz_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_vert.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_vert_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_vert_w16inp.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_luma_copy_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_luma_copy.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_copy.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_inter_pred_chroma_copy_w16out.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_itrans_recon_4x4_ttype1.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_itrans_recon_4x4.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_itrans_recon_8x8.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_itrans_recon_16x16.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_planar.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_dc.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_horz.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_ver.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_mode2.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_mode_18_34.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_planar.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_horz.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_mode2.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_mode_27_to_33.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_mode_18_34.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_vert.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_dc.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_intra_pred_luma_mode_3_to_9.s
+libhevcd_srcs_asm_arm += common/arm/ihevc_padding.s
+
+libhevcd_srcs_asm_arm += decoder/arm/ihevcd_itrans_recon_dc_luma.s
+libhevcd_srcs_asm_arm += decoder/arm/ihevcd_itrans_recon_dc_chroma.s
+libhevcd_srcs_asm_arm += decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
+libhevcd_srcs_asm_arm += decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s
+libhevcd_srcs_asm_arm += decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
+libhevcd_cflags_arm += -DDEFAULT_ARCH=D_ARCH_ARM_A9Q
+else
+libhevcd_cflags_arm += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
+endif
+
+LOCAL_SRC_FILES_arm += $(libhevcd_srcs_c_arm) $(libhevcd_srcs_asm_arm)
+LOCAL_C_INCLUDES_arm += $(libhevcd_inc_dir_arm)
+LOCAL_CFLAGS_arm += $(libhevcd_cflags_arm)
diff --git a/decoder.arm64.mk b/decoder.arm64.mk
new file mode 100644
index 0000000..8714aaf
--- /dev/null
+++ b/decoder.arm64.mk
@@ -0,0 +1,97 @@
+libhevcd_cflags_arm64 += -DARMV8
+libhevcd_cflags_arm64 += -DDISABLE_NEONINTR -DARM -DARMGCC
+
+libhevcd_inc_dir_arm64 += $(LOCAL_PATH)/decoder/arm
+libhevcd_inc_dir_arm64 += $(LOCAL_PATH)/common/arm
+libhevcd_inc_dir_arm64 += $(LOCAL_PATH)/decoder/arm64
+libhevcd_inc_dir_arm64 += $(LOCAL_PATH)/common/arm64
+
+libhevcd_srcs_c_arm64 += decoder/arm/ihevcd_function_selector.c
+libhevcd_srcs_c_arm64 += decoder/arm/ihevcd_function_selector_noneon.c
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+libhevcd_srcs_c_arm64 += decoder/arm64/ihevcd_function_selector_av8.c
+
+libhevcd_srcs_c_arm64 += common/arm/ihevc_intra_pred_filters_neon_intr.c
+libhevcd_srcs_c_arm64 += common/arm/ihevc_weighted_pred_neon_intr.c
+
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_mem_fns.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_itrans_recon_32x32.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_weighted_pred_bi_default.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_weighted_pred_bi.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_weighted_pred_uni.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_deblk_luma_horz.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_deblk_luma_vert.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_deblk_chroma_vert.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_deblk_chroma_horz.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_band_offset_luma.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_band_offset_chroma.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class0.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class0_chroma.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class1.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class1_chroma.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class2.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class3.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_luma_horz_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_filters_luma_horz.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_filters_luma_vert.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_horz.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_vert.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_luma_copy_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_luma_copy.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_copy.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_itrans_recon_4x4_ttype1.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_itrans_recon_4x4.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_itrans_recon_8x8.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_itrans_recon_16x16.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_planar.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_dc.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_horz.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_ver.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_mode2.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_planar.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_horz.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_mode2.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_mode_18_34.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_vert.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_dc.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
+libhevcd_srcs_asm_arm64 += common/arm64/ihevc_padding.s
+
+
+
+libhevcd_srcs_asm_arm64 += decoder/arm64/ihevcd_itrans_recon_dc_luma.s
+libhevcd_srcs_asm_arm64 += decoder/arm64/ihevcd_itrans_recon_dc_chroma.s
+libhevcd_srcs_asm_arm64 += decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s
+libhevcd_srcs_asm_arm64 += decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s
+libhevcd_srcs_asm_arm64 += decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
+
+libhevcd_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC
+else
+libhevcd_cflags_arm64 += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON
+endif
+
+
+
+
+LOCAL_SRC_FILES_arm64 += $(libhevcd_srcs_c_arm64) $(libhevcd_srcs_asm_arm64)
+LOCAL_C_INCLUDES_arm64 += $(libhevcd_inc_dir_arm64)
+LOCAL_CFLAGS_arm64 += $(libhevcd_cflags_arm64)
diff --git a/decoder.mips.mk b/decoder.mips.mk
new file mode 100644
index 0000000..2aecc09
--- /dev/null
+++ b/decoder.mips.mk
@@ -0,0 +1,13 @@
+libhevcd_inc_dir_mips += $(LOCAL_PATH)/decoder/mips
+libhevcd_inc_dir_mips += $(LOCAL_PATH)/common/mips
+
+libhevcd_srcs_c_mips += decoder/mips/ihevcd_function_selector.c
+libhevcd_srcs_c_mips += decoder/mips/ihevcd_function_selector_mips_generic.c
+
+
+LOCAL_SRC_FILES_mips += $(libhevcd_srcs_c_mips) $(libhevcd_srcs_asm_mips)
+LOCAL_C_INCLUDES_mips += $(libhevcd_inc_dir_mips)
+LOCAL_CFLAGS_mips += $(libhevcd_cflags_mips)
+
+
+
diff --git a/decoder.mips64.mk b/decoder.mips64.mk
new file mode 100644
index 0000000..5ac515e
--- /dev/null
+++ b/decoder.mips64.mk
@@ -0,0 +1,10 @@
+libhevcd_inc_dir_mips64 += $(LOCAL_PATH)/decoder/mips
+libhevcd_inc_dir_mips64 += $(LOCAL_PATH)/common/mips
+
+libhevcd_srcs_c_mips64 += decoder/mips/ihevcd_function_selector.c
+libhevcd_srcs_c_mips64 += decoder/mips/ihevcd_function_selector_mips_generic.c
+
+LOCAL_SRC_FILES_mips64 += $(libhevcd_srcs_c_mips64) $(libhevcd_srcs_asm_mips64)
+LOCAL_C_INCLUDES_mips64 += $(libhevcd_inc_dir_mips64)
+LOCAL_CFLAGS_mips64 += $(libhevcd_cflags_mips64)
+
diff --git a/decoder.mk b/decoder.mk
new file mode 100644
index 0000000..38e3654
--- /dev/null
+++ b/decoder.mk
@@ -0,0 +1,96 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+libhevc_source_dir := $(LOCAL_PATH)
+
+## Arch-common settings
+LOCAL_MODULE := libhevcdec
+#LOCAL_32_BIT_ONLY := true
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+
+LOCAL_CFLAGS += -D_LIB -DMULTICORE -fPIC
+#TODO -O3 is throwing up an error in aarch64 while linking
+LOCAL_CFLAGS += -O2 -DHM_10DOT0 -DANDROID
+
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/decoder $(LOCAL_PATH)/common
+
+libhevcd_srcs_c += common/ihevc_quant_tables.c
+libhevcd_srcs_c += common/ihevc_inter_pred_filters.c
+libhevcd_srcs_c += common/ihevc_weighted_pred.c
+libhevcd_srcs_c += common/ihevc_padding.c
+libhevcd_srcs_c += common/ihevc_deblk_edge_filter.c
+libhevcd_srcs_c += common/ihevc_deblk_tables.c
+libhevcd_srcs_c += common/ihevc_cabac_tables.c
+libhevcd_srcs_c += common/ihevc_common_tables.c
+libhevcd_srcs_c += common/ihevc_intra_pred_filters.c
+libhevcd_srcs_c += common/ihevc_chroma_intra_pred_filters.c
+libhevcd_srcs_c += common/ihevc_mem_fns.c
+libhevcd_srcs_c += common/ihevc_sao.c
+libhevcd_srcs_c += common/ihevc_trans_tables.c
+libhevcd_srcs_c += common/ihevc_recon.c
+libhevcd_srcs_c += common/ihevc_itrans.c
+libhevcd_srcs_c += common/ihevc_itrans_recon.c
+libhevcd_srcs_c += common/ihevc_iquant_recon.c
+libhevcd_srcs_c += common/ihevc_iquant_itrans_recon.c
+libhevcd_srcs_c += common/ihevc_itrans_recon_32x32.c
+libhevcd_srcs_c += common/ihevc_itrans_recon_16x16.c
+libhevcd_srcs_c += common/ihevc_itrans_recon_8x8.c
+libhevcd_srcs_c += common/ihevc_chroma_itrans_recon.c
+libhevcd_srcs_c += common/ihevc_chroma_iquant_recon.c
+libhevcd_srcs_c += common/ihevc_chroma_iquant_itrans_recon.c
+libhevcd_srcs_c += common/ihevc_chroma_recon.c
+libhevcd_srcs_c += common/ihevc_chroma_itrans_recon_16x16.c
+libhevcd_srcs_c += common/ihevc_chroma_itrans_recon_8x8.c
+libhevcd_srcs_c += common/ihevc_buf_mgr.c
+libhevcd_srcs_c += common/ihevc_disp_mgr.c
+libhevcd_srcs_c += common/ihevc_dpb_mgr.c
+libhevcd_srcs_c += common/ithread.c
+
+
+
+libhevcd_srcs_c += decoder/ihevcd_version.c
+libhevcd_srcs_c += decoder/ihevcd_trace.c
+libhevcd_srcs_c += decoder/ihevcd_api.c
+libhevcd_srcs_c += decoder/ihevcd_decode.c
+libhevcd_srcs_c += decoder/ihevcd_nal.c
+libhevcd_srcs_c += decoder/ihevcd_bitstream.c
+libhevcd_srcs_c += decoder/ihevcd_parse_headers.c
+libhevcd_srcs_c += decoder/ihevcd_parse_slice_header.c
+libhevcd_srcs_c += decoder/ihevcd_parse_slice.c
+libhevcd_srcs_c += decoder/ihevcd_parse_residual.c
+libhevcd_srcs_c += decoder/ihevcd_cabac.c
+libhevcd_srcs_c += decoder/ihevcd_intra_pred_mode_prediction.c
+libhevcd_srcs_c += decoder/ihevcd_process_slice.c
+libhevcd_srcs_c += decoder/ihevcd_utils.c
+libhevcd_srcs_c += decoder/ihevcd_job_queue.c
+libhevcd_srcs_c += decoder/ihevcd_ref_list.c
+libhevcd_srcs_c += decoder/ihevcd_get_mv.c
+libhevcd_srcs_c += decoder/ihevcd_mv_pred.c
+libhevcd_srcs_c += decoder/ihevcd_mv_merge.c
+libhevcd_srcs_c += decoder/ihevcd_iquant_itrans_recon_ctb.c
+libhevcd_srcs_c += decoder/ihevcd_itrans_recon_dc.c
+libhevcd_srcs_c += decoder/ihevcd_common_tables.c
+libhevcd_srcs_c += decoder/ihevcd_boundary_strength.c
+libhevcd_srcs_c += decoder/ihevcd_deblk.c
+libhevcd_srcs_c += decoder/ihevcd_inter_pred.c
+libhevcd_srcs_c += decoder/ihevcd_sao.c
+libhevcd_srcs_c += decoder/ihevcd_ilf_padding.c
+libhevcd_srcs_c += decoder/ihevcd_debug.c
+libhevcd_srcs_c += decoder/ihevcd_ittiam_logo.c
+libhevcd_srcs_c += decoder/ihevcd_statistics.c
+libhevcd_srcs_c += decoder/ihevcd_fmt_conv.c
+
+LOCAL_SRC_FILES := $(libhevcd_srcs_c) $(libhevcd_srcs_asm)
+
+
+# Load the arch-specific settings
+include $(LOCAL_PATH)/decoder.arm.mk
+include $(LOCAL_PATH)/decoder.arm64.mk
+include $(LOCAL_PATH)/decoder.x86.mk
+include $(LOCAL_PATH)/decoder.x86_64.mk
+include $(LOCAL_PATH)/decoder.mips.mk
+include $(LOCAL_PATH)/decoder.mips64.mk
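+
+# Each arch-specific .mk above appends to LOCAL_SRC_FILES_<arch>,
+# LOCAL_C_INCLUDES_<arch> and LOCAL_CFLAGS_<arch>; the build system then
+# selects the set matching the target architecture.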
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/decoder.x86.mk b/decoder.x86.mk
new file mode 100644
index 0000000..287ef3a
--- /dev/null
+++ b/decoder.x86.mk
@@ -0,0 +1,44 @@
+libhevcd_cflags_x86 += -DX86 -DDISABLE_AVX2 -m32 -msse4.2 -mno-avx -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+
+libhevcd_inc_dir_x86 += $(LOCAL_PATH)/decoder/x86
+libhevcd_inc_dir_x86 += $(LOCAL_PATH)/common/x86
+
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_function_selector.c
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_function_selector_generic.c
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_function_selector_ssse3.c
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_function_selector_sse42.c
+
+
+libhevcd_srcs_c_x86 += common/x86/ihevc_inter_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_weighted_pred_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_itrans_recon_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_sao_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_deblk_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_padding_ssse3_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_mem_fns_ssse3_intr.c
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
+
+
+
+libhevcd_srcs_c_x86 += common/x86/ihevc_inter_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_weighted_pred_sse42_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86 += common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86 += decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
+
+libhevcd_srcs_c_x86 += common/x86/ihevc_tables_x86_intr.c
+
+LOCAL_SRC_FILES_x86 += $(libhevcd_srcs_c_x86) $(libhevcd_srcs_asm_x86)
+LOCAL_C_INCLUDES_x86 += $(libhevcd_inc_dir_x86)
+LOCAL_CFLAGS_x86 += $(libhevcd_cflags_x86)
+
+
+
diff --git a/decoder.x86_64.mk b/decoder.x86_64.mk
new file mode 100644
index 0000000..7c53b87
--- /dev/null
+++ b/decoder.x86_64.mk
@@ -0,0 +1,44 @@
+libhevcd_cflags_x86_64 += -DX86 -DDISABLE_AVX2 -m64 -msse4.2 -mno-avx -DDEFAULT_ARCH=D_ARCH_X86_SSE42
+
+libhevcd_inc_dir_x86_64 += $(LOCAL_PATH)/decoder/x86
+libhevcd_inc_dir_x86_64 += $(LOCAL_PATH)/common/x86
+
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_function_selector.c
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_function_selector_generic.c
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_function_selector_ssse3.c
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_function_selector_sse42.c
+
+
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_inter_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_weighted_pred_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_itrans_recon_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_sao_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_deblk_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_padding_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_mem_fns_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
+
+
+
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_inter_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_weighted_pred_sse42_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_16x16_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_32x32_itrans_recon_sse42_intr.c
+libhevcd_srcs_c_x86_64 += decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
+
+libhevcd_srcs_c_x86_64 += common/x86/ihevc_tables_x86_intr.c
+
+LOCAL_SRC_FILES_x86_64 += $(libhevcd_srcs_c_x86_64) $(libhevcd_srcs_asm_x86_64)
+LOCAL_C_INCLUDES_x86_64 += $(libhevcd_inc_dir_x86_64)
+LOCAL_CFLAGS_x86_64 += $(libhevcd_cflags_x86_64)
+
+
+
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
new file mode 100644
index 0000000..c1d09ed
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s
@@ -0,0 +1,203 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@* ihevcd_fmt_conv_420sp_to_420p.s
+@*
+@* @brief
+@* contains function definitions for format conversions
+@*
+@* @author
+@* ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
+
+
+
+
+
+
+
+.text
+
+
+
+
+
+@/*****************************************************************************
+@* *
+@* Function Name : ihevcd_fmt_conv_420sp_to_420p_a9q() *
+@* *
+@* Description : This function converts the image from YUV420SP color *
+@* space to YUV420P color space(separate U and V planes). *
+@* *
+@* Arguments : R0 pu1_src_y *
+@* R1 pu1_src_uv *
+@* R2 pu1_dest_y *
+@* R3 pu1_dest_u *
+@* [R13 #40] pu1_dest_v *
+@* [R13 #44] u2_width *
+@* [R13 #48] u2_height *
+@* [R13 #52] u2_stridey *
+@* [R13 #56] u2_strideuv *
+@* [R13 #60] u2_dest_stridey *
+@* [R13 #64] u2_dest_strideuv *
+@* [R13 #68] is_u_first *
+@* [R13 #72] disable_luma_copy *
+@* *
+@* Values Returned : None *
+@* *
+@* Register Usage : R0 - R14 *
+@* *
+@* Stack Usage : 40 Bytes *
+@* *
+@* Interruptibility : Interruptible *
+@* *
+@* Known Limitations *
+@* Assumptions: Image Width: Assumed to be multiple of 2 and *
+@* Image Height: Assumed to be even. *
+@* *
+@* Revision History : *
+@* DD MM YYYY Author(s) Changes (Describe the changes made) *
+@* 16 05 2012 Naveen SR draft *
+@* *
+@*****************************************************************************/
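+
+@/* For reference, a rough C equivalent of the conversion below (an
+@* illustration only, for the is_u_first case with luma copy enabled):
+@*
+@*     copy u2_height rows of u2_width luma bytes from pu1_src_y to pu1_dest_y;
+@*     for each of the u2_height/2 chroma rows:
+@*         for(j = 0; j < u2_width / 2; j++)
+@*         {
+@*             pu1_dest_u[j] = pu1_src_uv[2 * j];      de-interleave U
+@*             pu1_dest_v[j] = pu1_src_uv[2 * j + 1];  de-interleave V
+@*         }
+@*/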
+
+.globl ihevcd_fmt_conv_420sp_to_420p_a9q
+
+.type ihevcd_fmt_conv_420sp_to_420p_a9q, %function
+
+ihevcd_fmt_conv_420sp_to_420p_a9q:
+ STMFD sp!,{r4-r12, lr}
+
+ LDR r5,[sp,#60] @//Load u2_dest_stridey
+@ LDR r6,[sp,#56] @//Load u2_strideuv
+ LDR r7,[sp,#52] @//Load u2_stridey
+ LDR r8,[sp,#44] @//Load u2_width
+ LDR r9,[sp,#48] @//Load u2_height
+
+ SUB r10,r7,r8 @// Src Y increment
+ SUB r11,r5,r8 @// Dst Y increment
+
+ LDR r5,[sp,#72] @//Load disable_luma_copy flag
+ CMP r5,#0 @//skip luma if disable_luma_copy is non-zero
+ BNE uv_copy_start
+
+ @/* Copy Y */
+
+ MOV r4,r9 @// Copying height
+y_row_loop:
+ MOV r6,r8 @// Copying width
+
+y_col_loop:
+
+ SUB r6,r6,#16
+ vld1.8 {d0,d1},[r0]!
+ vst1.8 {d0,d1},[r2]!
+ CMP r6,#16
+ BGE y_col_loop
+ CMP r6,#0
+ BEQ y_col_loop_end
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ RSB r6,r6,#16
+ SUB r0,r0,r6
+ SUB r2,r2,r6
+ vld1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r2]!
+
+y_col_loop_end:
+ ADD r0, r0, r10
+ ADD r2, r2, r11
+ SUBS r4, r4, #1
+ BGT y_row_loop
+
+
+ @/* Copy UV */
+uv_copy_start:
+
+ LDR r5,[sp,#64] @//Load u2_dest_strideuv
+ LDR r7,[sp,#56] @//Load u2_strideuv
+
+ MOV r9,r9,LSR #1 @// height/2
+@ MOV r8,r8,LSR #1 @// Width/2
+
+ SUB r10,r7,r8 @// Src UV increment
+ MOV r11,r8,LSR #1
+ SUB r11,r5,r11 @// Dst U and V increment
+
+ LDR r5,[sp,#40] @//Load pu1_dest_v
+
+ LDR r4,[sp,#68] @//Load is_u_first_flag
+ CMP r4,#0 @//Swap U and V dest if is_u_first_flag is zero
+ MOVEQ r4,r5
+ MOVEQ r5,r3
+ MOVEQ r3,r4
+
+ MOV r4,r9 @// Copying height
+uv_row_loop:
+ MOV r6,r8 @// Copying width
+
+uv_col_loop:
+
+ SUB r6,r6,#16
+
+ PLD [r1,#128]
+ vld2.8 {d0,d1},[r1]!
+ VST1.8 D0,[r3]!
+ VST1.8 D1,[r5]!
+ CMP r6,#16
+ BGE uv_col_loop
+ CMP r6,#0
+ BEQ uv_col_loop_end
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ RSB r6,r6,#16
+ SUB r1,r1,r6
+ SUB r3,r3,r6,LSR #1
+ SUB r5,r5,r6,LSR #1
+ vld2.8 {d0,d1}, [r1]!
+ VST1.8 D0, [r3]!
+ VST1.8 D1, [r5]!
+uv_col_loop_end:
+ ADD r1, r1, r10
+ ADD r3, r3, r11
+ ADD r5, r5, r11
+ SUBS r4, r4, #1
+ BGT uv_row_loop
+
+exit:
+ LDMFD sp!,{r4-r12, pc}
+
+
+
+
+
+
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s
new file mode 100644
index 0000000..38886ba
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s
@@ -0,0 +1,198 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@* ihevcd_fmt_conv_420sp_to_420sp.s
+@*
+@* @brief
+@* contains function definitions for format conversions
+@*
+@* @author
+@* ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+ .equ DO1STROUNDING, 0
+
+ @ ARM
+ @
+ @ PRESERVE8
+
+.text
+.p2align 2
+
+
+
+
+
+@/*****************************************************************************
+@* *
+@* Function Name : ihevcd_fmt_conv_420sp_to_420sp() *
+@* *
+@* Description : This function copies the image from a YUV420SP buffer *
+@* to another YUV420SP buffer(UV interleaved). *
+@* *
+@* Arguments : R0 pu1_y *
+@* R1 pu1_uv *
+@* R2 pu1_dest_y *
+@* R3 pu1_dest_uv *
+@* [R13 #40] u2_width *
+@* [R13 #44] u2_height *
+@* [R13 #48] u2_stridey *
+@* [R13 #52] u2_stridechroma *
+@* [R13 #56] u2_dest_stridey *
+@* [R13 #60] u2_dest_stridechroma *
+@* *
+@* Values Returned : None *
+@* *
+@* Register Usage : R0 - R14 *
+@* *
+@* Stack Usage : 40 Bytes *
+@* *
+@* Interruptibility : Interruptible *
+@* *
+@* Known Limitations *
+@* Assumptions: Image Width: Assumed to be multiple of 2 and *
+@* Image Height: Assumed to be even. *
+@* *
+@* Revision History : *
+@* DD MM YYYY Author(s) Changes (Describe the changes made) *
+@* 16 05 2012 Naveen SR draft *
+@* *
+@*****************************************************************************/
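+
+@/* For reference, this routine is effectively a strided two-plane copy (an
+@* illustration only):
+@*
+@*     for(i = 0; i < u2_height; i++)        luma plane
+@*         memcpy(pu1_dest_y + i * u2_dest_stridey,
+@*                pu1_y + i * u2_stridey, u2_width);
+@*     for(i = 0; i < u2_height / 2; i++)    interleaved chroma plane
+@*         memcpy(pu1_dest_uv + i * u2_dest_stridechroma,
+@*                pu1_uv + i * u2_stridechroma, u2_width);
+@*/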
+
+ .global ihevcd_fmt_conv_420sp_to_420sp_a9q
+.type ihevcd_fmt_conv_420sp_to_420sp_a9q, %function
+ihevcd_fmt_conv_420sp_to_420sp_a9q:
+
+ STMFD sp!,{r4-r12, lr}
+
+
+ LDR r5,[sp,#56] @//Load u2_dest_stridey
+
+ LDR r7,[sp,#48] @//Load u2_stridey
+ LDR r8,[sp,#40] @//Load u2_width
+ LDR r9,[sp,#44] @//Load u2_height
+
+ SUB r10,r7,r8 @// Src Y increment
+ SUB r11,r5,r8 @// Dst Y increment
+
+ @/* Copy Y */
+
+ MOV r4,r9 @// Copying height
+y_row_loop:
+ MOV r6,r8 @// Copying width
+
+y_col_loop:
+ PLD [r0, #128]
+ SUB r6,r6,#32
+ VLD1.8 D0,[r0]!
+ VLD1.8 D1,[r0]!
+ VLD1.8 D2,[r0]!
+ VLD1.8 D3,[r0]!
+ VST1.8 D0,[R2]!
+ VST1.8 D1,[R2]!
+ VST1.8 D2,[R2]!
+ VST1.8 D3,[R2]!
+ CMP r6,#32
+ BGE y_col_loop
+ CMP r6,#0
+ BEQ y_col_loop_end
+ @//If width is a non-multiple of 32, then go back by a few bytes to ensure 32 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//both source and destination will point to the 130th pixel and then 32 bytes will be read
+ @// and written using VLD1 and VST1
+ RSB r6,r6,#32
+ SUB r0,r0,r6
+ SUB R2,R2,r6
+ VLD1.8 D0,[r0]!
+ VLD1.8 D1,[r0]!
+ VLD1.8 D2,[r0]!
+ VLD1.8 D3,[r0]!
+ VST1.8 D0,[R2]!
+ VST1.8 D1,[R2]!
+ VST1.8 D2,[R2]!
+ VST1.8 D3,[R2]!
+
+y_col_loop_end:
+ ADD r0, r0, r10
+ ADD R2, R2, r11
+ SUBS r4, r4, #1
+ BGT y_row_loop
+
+
+
+ @/* Copy UV */
+
+ LDR r5,[sp,#60] @//Load u2_dest_stridechroma
+ LDR r7,[sp,#52] @//Load u2_stridechroma
+
+ MOV r9,r9,LSR #1 @// height/2
+@ MOV r8,r8,LSR #1 @// Width/2
+
+ MOV R2,R3 @pu1_dest_uv
+
+ SUB r10,r7,r8 @// Src UV increment
+ SUB r11,r5,r8 @// Dst UV increment
+
+ MOV r4,r9 @// Copying height
+uv_row_loop:
+ MOV r6,r8 @// Copying width
+
+uv_col_loop:
+
+ PLD [r1, #128]
+ SUB r6,r6,#16
+ VLD1.8 D0,[r1]!
+ VLD1.8 D1,[r1]!
+ VST1.8 D0,[R2]!
+ VST1.8 D1,[R2]!
+ CMP r6,#16
+ BGE uv_col_loop
+ CMP r6,#0
+ BEQ uv_col_loop_end
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ RSB r6,r6,#16
+ SUB r1,r1,r6
+ SUB R2,R2,r6
+ VLD1.8 D0, [r1]!
+ VLD1.8 D1, [r1]!
+ VST1.8 D0, [R2]!
+ VST1.8 D1, [R2]!
+
+uv_col_loop_end:
+ ADD r1, r1, r10
+ ADD R2, R2, r11
+ SUBS r4, r4, #1
+ BGT uv_row_loop
+
+exit:
+ LDMFD sp!,{r4-r12, pc}
+
+
+ .section .note.GNU-stack,"",%progbits
+
diff --git a/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
new file mode 100644
index 0000000..a9a75cb
--- /dev/null
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -0,0 +1,454 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@* ihevcd_fmt_conv_420sp_to_rgba8888.s
+@*
+@* @brief
+@* contains function definitions for format conversions
+@*
+@* @author
+@* ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+ .equ DO1STROUNDING, 0
+
+ @ ARM
+ @
+ @ PRESERVE8
+
+.text
+.p2align 2
+
+
+
+
+@/*****************************************************************************
+@* *
+@* Function Name : ihevcd_fmt_conv_420sp_to_rgba8888() *
+@* *
+@* Description : This function converts the image from YUV420SP color *
+@* space to RGBA8888 color space. The function operates *
+@* on the full picture, two rows at a time. *
+@* *
+@* Arguments : R0 pubY *
+@* R1 pubUV *
+@* R2 pusRGB *
+@* R3 usWidth *
+@* [R13 #40] usHeight *
+@* [R13 #44] usStrideY *
+@* [R13 #48] usStrideUV *
+@* [R13 #52] usStrideRGB *
+@* *
+@* Values Returned : None *
+@* *
+@* Register Usage : R0 - R14 *
+@* *
+@* Stack Usage : 40 Bytes *
+@* *
+@* Interruptibility : Interruptible *
+@* *
+@* Known Limitations *
+@* Assumptions: Image Width: Assumed to be multiple of 16 and *
+@* greater than or equal to 16 *
+@* Image Height: Assumed to be even. *
+@* *
+@* Revision History : *
+@* DD MM YYYY Author(s) Changes (Describe the changes made) *
+@* 07 06 2010 Varshita Draft *
+@* 07 06 2010 Naveen Kr T Completed *
+@* 05 08 2013 Naveen K P Modified for HEVC *
+@*****************************************************************************/
+ .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
+.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
+ihevcd_fmt_conv_420sp_to_rgba8888_a9q:
+
+ @// push the registers on the stack
+ STMFD SP!,{R4-R12,LR}
+
+
+ @//R0 - Y PTR
+ @//R1 - UV PTR
+ @//R2 - RGB PTR
+ @//R3 - RGB PTR
+ @//R4 - PIC WIDTH
+ @//R5 - PIC HT
+ @//R6 - STRIDE Y
+ @//R7 - STRIDE U
+ @//R8 - STRIDE V
+ @//R9 - STRIDE RGB
+
+ @//ONE ROW PROCESSING AT A TIME
+
+ @//THE FOUR CONSTANTS ARE:
+ @//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
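+ @//in Q13 fixed point these are the usual BT.601 YCbCr-to-RGB weights
+ @//(a reading of the constants, not spelled out elsewhere in this file):
+ @//C1 = 0x3311 = 13073 ~ 1.596 * 8192 (R contribution of V)
+ @//C2 = 0xF379 = -3207 ~ -0.391 * 8192 (G contribution of U)
+ @//C3 = 0xE5F8 = -6664 ~ -0.813 * 8192 (G contribution of V)
+ @//C4 = 0x4092 = 16530 ~ 2.017 * 8192 (B contribution of U)
+ @//e.g. R = CLIP_U8(Y + ((C1 * (V - 128)) >> 13))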
+
+ @PLD [R0]
+ @PLD [R1]
+ @PLD [R2]
+
+
+ @/* can be loaded from a defined const type */
+ MOVW R10,#0x3311
+ VMOV.16 D0[0],R10 @//C1
+
+ MOVW R10,#0xF379
+ VMOV.16 D0[1],R10 @//C2
+
+ MOVW R10,#0xE5F8
+ VMOV.16 D0[2],R10 @//C3
+
+ MOVW R10,#0x4092
+ VMOV.16 D0[3],R10 @//C4
+
+ @//LOAD CONSTANT 128 INTO A CORTEX REGISTER
+ MOV R10,#128
+ VDUP.8 D1,R10
+
+ @//D0 HAS C1-C2-C3-C4
+ @// load other parameters from stack
+ LDR R5,[sp,#40]
+ @LDR R4,[sp,#44]
+ LDR R6,[sp,#44]
+ LDR R7,[sp,#48]
+ @LDR R8,[sp,#52]
+ LDR R9,[sp,#52]
+
+ @// calculate offsets, offset = stride - width
+ SUB R10,R6,R3 @// luma offset
+ SUB R11,R7,R3
+ @, LSR #1 @// u offset
+ @SUB R12,R8,R3, LSR #1 @// v offset
+ SUB R14,R9,R3 @// rgb offset in pixels
+
+ @// calculate height loop count
+ MOV R5,R5, LSR #1 @// height_cnt = height / 2
+
+ @// create next row pointers for rgb and luma data
+ ADD R7,R0,R6 @// luma_next_row = luma + luma_stride
+ ADD R8,R2,R9,LSL #2 @// rgb_next_row = rgb + rgb_stride
+
+LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
+
+ @//LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
+ VLD1.8 {D2,D3},[R1]! @//LOAD 8 INTERLEAVED UV PAIRS
+ @//VLD1.8 {D3},[R2]! @//LOAD 8 VALUES OF V
+
+ @// calculate width loop count
+ MOV R6,R3, LSR #4 @// width_cnt = width / 16
+
+ @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+ @//LOAD VALUES OF Y 8-BIT VALUES
+ VLD2.8 {D30,D31},[R0]! @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+ @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+ VLD2.8 {D28,D29},[R7]! @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
+ @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+ SUBS R6,R6,#1
+ BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
+ @VMOV.I8 Q1,#128
+ VUZP.8 D2,D3
+
+
+ @//NEED TO SUBTRACT (U-128) AND (V-128)
+ @//(D2-D1),(D3-D1)
+ VSUBL.U8 Q2,D2,D1 @//(U-128)
+ VSUBL.U8 Q3,D3,D1 @//(V-128)
+
+ @//LOAD U&V VALUES FOR THE NEXT 16-PIXEL BLOCK
+ VLD1.8 {D2,D3},[R1]! @//LOAD 8 INTERLEAVED UV PAIRS
+ @//VLD1.8 {D3},[R2]! @//LOAD 8 VALUES OF V
+
+ @PLD [R0]
+ PLD [R1]
+
+ @//NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+ VMULL.S16 Q4,D4,D0[3] @//(U-128)*C4 FOR B
+ VMULL.S16 Q5,D5,D0[3] @//(U-128)*C4 FOR B
+
+ VMULL.S16 Q10,D6,D0[0] @//(V-128)*C1 FOR R
+ VMULL.S16 Q11,D7,D0[0] @//(V-128)*C1 FOR R
+
+ VMULL.S16 Q6,D4,D0[1] @//(U-128)*C2 FOR G
+ VMLAL.S16 Q6,D6,D0[2] @//Q6 = (U-128)*C2 + (V-128)*C3
+ VMULL.S16 Q7,D5,D0[1] @//(U-128)*C2 FOR G
+ VMLAL.S16 Q7,D7,D0[2] @//Q7 = (U-128)*C2 + (V-128)*C3
+
+ @//NARROW RIGHT SHIFT BY 13 FOR R&B
+ VQSHRN.S32 D8,Q4,#13 @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ VQSHRN.S32 D9,Q5,#13 @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ @//Q4 - WEIGHT FOR B
+
+ @//NARROW RIGHT SHIFT BY 13 FOR R&B
+ VQSHRN.S32 D10,Q10,#13 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ VQSHRN.S32 D11,Q11,#13 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ @//Q5 - WEIGHT FOR R
+
+ @//NARROW RIGHT SHIFT BY 13 FOR G
+ VQSHRN.S32 D12,Q6,#13 @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ VQSHRN.S32 D13,Q7,#13 @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ @//Q6 - WEIGHT FOR G
+
+ VADDW.U8 Q7,Q4,D30 @//Q7 - HAS Y + B
+ VADDW.U8 Q8,Q5,D30 @//Q8 - HAS Y + R
+ VADDW.U8 Q9,Q6,D30 @//Q9 - HAS Y + G
+
+ VADDW.U8 Q10,Q4,D31 @//Q10 - HAS Y + B
+ VADDW.U8 Q11,Q5,D31 @//Q11 - HAS Y + R
+ VADDW.U8 Q12,Q6,D31 @//Q12 - HAS Y + G
+
+ VQMOVUN.S16 D14,Q7
+ VQMOVUN.S16 D15,Q9
+ VQMOVUN.S16 D16,Q8
+ VMOV.I8 D17,#0
+
+ VZIP.8 D14,D15
+ VZIP.8 D16,D17
+ VZIP.16 Q7,Q8
+
+
+ VQMOVUN.S16 D20,Q10
+ VQMOVUN.S16 D21,Q12
+ VQMOVUN.S16 D22,Q11
+ VMOV.I8 D23,#0
+
+ VZIP.8 D20,D21
+ VZIP.8 D22,D23
+ VZIP.16 Q10,Q11
+
+ VZIP.32 Q7,Q10
+ VZIP.32 Q8,Q11
+
+ VST1.32 D14,[R2]!
+ VST1.32 D15,[R2]!
+ VST1.32 D20,[R2]!
+ VST1.32 D21,[R2]!
+ VST1.32 D16,[R2]!
+ VST1.32 D17,[R2]!
+ VST1.32 D22,[R2]!
+ VST1.32 D23,[R2]!
+
+ @//D14-D23 ABOVE TOTALLY HAVE 16 RGBA PIXELS
+ @//ADD THE SAME R,G,B WEIGHTS TO THE SECOND ROW'S LUMA VALUES
+ VADDW.U8 Q7,Q4,D28 @//Q7 - HAS Y + B
+ VADDW.U8 Q8,Q5,D28 @//Q8 - HAS Y + R
+ VADDW.U8 Q9,Q6,D28 @//Q9 - HAS Y + G
+
+ VADDW.U8 Q10,Q4,D29 @//Q10 - HAS Y + B
+ VADDW.U8 Q11,Q5,D29 @//Q11 - HAS Y + R
+ VADDW.U8 Q12,Q6,D29 @//Q12 - HAS Y + G
+
+ @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+ @//LOAD VALUES OF Y 8-BIT VALUES
+ VLD2.8 {D30,D31},[R0]! @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+ @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+ VLD2.8 {D28,D29},[R7]! @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
+ @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+ PLD [R0]
+ PLD [R7]
+
+ VQMOVUN.S16 D14,Q7
+ VQMOVUN.S16 D15,Q9
+ VQMOVUN.S16 D16,Q8
+ VMOV.I8 D17,#0
+
+ VZIP.8 D14,D15
+ VZIP.8 D16,D17
+ VZIP.16 Q7,Q8
+
+
+ VQMOVUN.S16 D20,Q10
+ VQMOVUN.S16 D21,Q12
+ VQMOVUN.S16 D22,Q11
+ VMOV.I8 D23,#0
+
+ VZIP.8 D20,D21
+ VZIP.8 D22,D23
+ VZIP.16 Q10,Q11
+
+ VZIP.32 Q7,Q10
+ VZIP.32 Q8,Q11
+
+ VST1.32 D14,[R8]!
+ VST1.32 D15,[R8]!
+ VST1.32 D20,[R8]!
+ VST1.32 D21,[R8]!
+ VST1.32 D16,[R8]!
+ VST1.32 D17,[R8]!
+ VST1.32 D22,[R8]!
+ VST1.32 D23,[R8]!
+
+ SUBS R6,R6,#1 @// width_cnt -= 1
+ BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
+ @VMOV.I8 Q1,#128
+ VUZP.8 D2,D3
+
+
+ @//NEED TO SUBTRACT (U-128) AND (V-128)
+ @//(D2-D1),(D3-D1)
+ VSUBL.U8 Q2,D2,D1 @//(U-128)
+ VSUBL.U8 Q3,D3,D1 @//(V-128)
+
+
+ @//NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+ VMULL.S16 Q4,D4,D0[3] @//(U-128)*C4 FOR B
+ VMULL.S16 Q5,D5,D0[3] @//(U-128)*C4 FOR B
+
+ VMULL.S16 Q10,D6,D0[0] @//(V-128)*C1 FOR R
+ VMULL.S16 Q11,D7,D0[0] @//(V-128)*C1 FOR R
+
+ VMULL.S16 Q6,D4,D0[1] @//(U-128)*C2 FOR G
+ VMLAL.S16 Q6,D6,D0[2] @//Q6 = (U-128)*C2 + (V-128)*C3
+ VMULL.S16 Q7,D5,D0[1] @//(U-128)*C2 FOR G
+ VMLAL.S16 Q7,D7,D0[2] @//Q7 = (U-128)*C2 + (V-128)*C3
+
+ @//NARROW RIGHT SHIFT BY 13 FOR R&B
+ VQSHRN.S32 D8,Q4,#13 @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ VQSHRN.S32 D9,Q5,#13 @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ @//Q4 - WEIGHT FOR B
+
+ @//NARROW RIGHT SHIFT BY 13 FOR R&B
+ VQSHRN.S32 D10,Q10,#13 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ VQSHRN.S32 D11,Q11,#13 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ @//Q5 - WEIGHT FOR R
+
+ @//NARROW RIGHT SHIFT BY 13 FOR G
+ VQSHRN.S32 D12,Q6,#13 @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ VQSHRN.S32 D13,Q7,#13 @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ @//Q6 - WEIGHT FOR G
+
+ VADDW.U8 Q7,Q4,D30 @//Q7 - HAS Y + B
+ VADDW.U8 Q8,Q5,D30 @//Q8 - HAS Y + R
+ VADDW.U8 Q9,Q6,D30 @//Q9 - HAS Y + G
+
+ VADDW.U8 Q10,Q4,D31 @//Q10 - HAS Y + B
+ VADDW.U8 Q11,Q5,D31 @//Q11 - HAS Y + R
+ VADDW.U8 Q12,Q6,D31 @//Q12 - HAS Y + G
+
+ VQMOVUN.S16 D14,Q7
+ VQMOVUN.S16 D15,Q9
+ VQMOVUN.S16 D16,Q8
+ VMOV.I8 D17,#0
+
+ VZIP.8 D14,D15
+ VZIP.8 D16,D17
+ VZIP.16 Q7,Q8
+
+
+ VQMOVUN.S16 D20,Q10
+ VQMOVUN.S16 D21,Q12
+ VQMOVUN.S16 D22,Q11
+ VMOV.I8 D23,#0
+
+ VZIP.8 D20,D21
+ VZIP.8 D22,D23
+ VZIP.16 Q10,Q11
+
+ VZIP.32 Q7,Q10
+ VZIP.32 Q8,Q11
+
+ VST1.32 D14,[R2]!
+ VST1.32 D15,[R2]!
+ VST1.32 D20,[R2]!
+ VST1.32 D21,[R2]!
+ VST1.32 D16,[R2]!
+ VST1.32 D17,[R2]!
+ VST1.32 D22,[R2]!
+ VST1.32 D23,[R2]!
+
+ @//D14-D23 TOTALLY HAVE THE 16 PIXEL VALUES OF ROW 1 (NOW STORED)
+ @//REUSE THE R,G,B WEIGHTS FOR THE SECOND ROW'S Y VALUES
+ VADDW.U8 Q7,Q4,D28 @//Q7 - HAS Y + B
+ VADDW.U8 Q8,Q5,D28 @//Q8 - HAS Y + R
+ VADDW.U8 Q9,Q6,D28 @//Q9 - HAS Y + G
+
+ VADDW.U8 Q10,Q4,D29 @//Q10 - HAS Y + B
+ VADDW.U8 Q11,Q5,D29 @//Q11 - HAS Y + R
+ VADDW.U8 Q12,Q6,D29 @//Q12 - HAS Y + G
+
+
+ VQMOVUN.S16 D14,Q7
+ VQMOVUN.S16 D15,Q9
+ VQMOVUN.S16 D16,Q8
+ VMOV.I8 D17,#0
+
+ VZIP.8 D14,D15
+ VZIP.8 D16,D17
+ VZIP.16 Q7,Q8
+
+
+ VQMOVUN.S16 D20,Q10
+ VQMOVUN.S16 D21,Q12
+ VQMOVUN.S16 D22,Q11
+ VMOV.I8 D23,#0
+
+ VZIP.8 D20,D21
+ VZIP.8 D22,D23
+ VZIP.16 Q10,Q11
+
+ VZIP.32 Q7,Q10
+ VZIP.32 Q8,Q11
+
+ VST1.32 D14,[R8]!
+ VST1.32 D15,[R8]!
+ VST1.32 D20,[R8]!
+ VST1.32 D21,[R8]!
+ VST1.32 D16,[R8]!
+ VST1.32 D17,[R8]!
+ VST1.32 D22,[R8]!
+ VST1.32 D23,[R8]!
+
+ @// Adjust the address pointers
+ ADD R0,R7,R10 @// luma = luma_next + offset
+ ADD R2,R8,R14,LSL #2 @// rgb = rgb_next + offset
+
+ ADD R7,R0,R3 @// luma_next = luma + width
+ ADD R8,R2,R3,LSL #2 @// rgb_next_row = rgb + width
+
+ ADD R1,R1,R11 @// adjust u pointer
+ @ADD R2,R2,R12 @// adjust v pointer
+
+ ADD R7,R7,R10 @// luma_next = luma + width + offset (because of register crunch)
+ ADD R8,R8,R14,LSL #2 @// rgb_next_row = rgb + width + offset
+
+ SUBS R5,R5,#1 @// height_cnt -= 1
+
+ BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
+
+ @//POP THE REGISTERS
+ LDMFD SP!,{R4-R12,PC}
+
+
+
+
+ .section .note.GNU-stack,"",%progbits
+
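For reference, the arithmetic the width loop above vectorizes reduces to a few lines of scalar C. This is a minimal sketch using the Q13 fixed-point coefficients the AArch64 port of this routine (later in this change) loads explicitly: C1=0x3311, C2=0xF379, C3=0xE5F8, C4=0x4092, where C2 and C3 are negative as signed 16-bit. Function and variable names here are illustrative, not part of the library.

    /* Scalar model of the per-pixel math in the NEON loop above.
     * Each weight is (chroma - 128) * C >> 13 (VMULL.S16 + VQSHRN.S32),
     * then Y is added and the result saturated to 8 bits (VADDW.U8 +
     * VQMOVUN.S16). For 8-bit inputs the intermediate 16-bit saturation
     * in the assembly cannot trigger, so it is omitted; an arithmetic
     * right shift of negative values is assumed, as in the assembly. */
    #include <stdint.h>

    static uint8_t clip8(int32_t x)
    {
        return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    static void yuv420sp_pixel_to_rgb(uint8_t y, uint8_t u, uint8_t v,
                                      uint8_t *r, uint8_t *g, uint8_t *b)
    {
        const int32_t c1 = 0x3311;          /* R weight for (V - 128)           */
        const int32_t c2 = (int16_t)0xF379; /* G weight for (U - 128), negative */
        const int32_t c3 = (int16_t)0xE5F8; /* G weight for (V - 128), negative */
        const int32_t c4 = 0x4092;          /* B weight for (U - 128)           */
        int32_t cu = u - 128;
        int32_t cv = v - 128;

        *r = clip8(y + ((cv * c1) >> 13));
        *g = clip8(y + ((cu * c2 + cv * c3) >> 13));
        *b = clip8(y + ((cu * c4) >> 13));
    }

The assembly additionally processes two rows of 16 pixels per iteration, reusing each UV pair for four luma samples, and interleaves the channels with a zero fourth byte through the VZIP sequence before the stores; the sketch captures only the per-pixel arithmetic.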
diff --git a/decoder/arm/ihevcd_function_selector.c b/decoder/arm/ihevcd_function_selector.c
new file mode 100644
index 0000000..66c7d4d
--- /dev/null
+++ b/decoder/arm/ihevcd_function_selector.c
@@ -0,0 +1,135 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector.c
+*
+* @brief
+* Contains functions to initialize the function pointers used in the HEVC decoder
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_neonintr(codec_t *ps_codec);
+void ihevcd_init_function_ptr_noneon(codec_t *ps_codec);
+void ihevcd_init_function_ptr_a9q(codec_t *ps_codec);
+void ihevcd_init_function_ptr_av8(codec_t *ps_codec);
+void ihevcd_init_function_ptr(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+
+#ifndef ARMV8
+ switch(ps_codec->e_processor_arch)
+ {
+#ifndef DISABLE_NEONINTR
+ case ARCH_ARM_NEONINTR:
+ ihevcd_init_function_ptr_neonintr(ps_codec);
+ break;
+#endif
+ case ARCH_ARM_NONEON:
+ ihevcd_init_function_ptr_noneon(ps_codec);
+ break;
+ default:
+ case ARCH_ARM_A5:
+ case ARCH_ARM_A7:
+ case ARCH_ARM_A9:
+ case ARCH_ARM_A15:
+ case ARCH_ARM_A9Q:
+#ifndef DISABLE_NEON
+ ihevcd_init_function_ptr_a9q(ps_codec);
+#else
+ ihevcd_init_function_ptr_noneon(ps_codec);
+#endif
+ break;
+ }
+ switch(ps_codec->e_processor_soc)
+ {
+
+ case SOC_HISI_37X:
+#ifndef DISABLE_NEON
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp_a9q;
+#endif
+ break;
+ case SOC_GENERIC:
+ default:
+ break;
+ }
+#else
+ switch(ps_codec->e_processor_arch)
+ {
+ case ARCH_ARM_NONEON:
+ ihevcd_init_function_ptr_noneon(ps_codec);
+ break;
+ case ARCH_ARMV8_GENERIC:
+ default:
+ ihevcd_init_function_ptr_av8(ps_codec);
+ break;
+ }
+#endif
+}
+
+void ihevcd_init_arch(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+#ifdef DEFAULT_ARCH
+#if DEFAULT_ARCH == D_ARCH_ARM_NONEON
+ ps_codec->e_processor_arch = ARCH_ARM_NONEON;
+#elif DEFAULT_ARCH == D_ARCH_ARMV8_GENERIC
+ ps_codec->e_processor_arch = ARCH_ARMV8_GENERIC;
+#elif DEFAULT_ARCH == D_ARCH_ARM_NEONINTR
+ ps_codec->e_processor_arch = ARCH_ARM_NEONINTR;
+#else
+ ps_codec->e_processor_arch = ARCH_ARM_A9Q;
+#endif
+#else
+ ps_codec->e_processor_arch = ARCH_ARM_A9Q;
+#endif
+}
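A brief usage sketch of the dispatch pattern this file implements: pick an architecture once at create time, fill the per-arch table, and let hot paths call through it. Only ihevcd_init_arch() and ihevcd_init_function_ptr() are real entry points from the file above; the wrapper below is illustrative scaffolding.

    #include "ihevc_typedefs.h"
    #include "ihevcd_function_selector.h"
    #include "ihevcd_structs.h"

    /* Hypothetical init-time sequence. After this, hot paths never
     * branch on the architecture again; they dispatch through
     * ps_codec->s_func_selector.*_fptr with one indirect call. */
    static void setup_dispatch(codec_t *ps_codec)
    {
        ihevcd_init_arch(ps_codec);         /* sets e_processor_arch */
        ihevcd_init_function_ptr(ps_codec); /* fills s_func_selector */
    }

This lets a single binary serve NEON and non-NEON devices; note that the SOC_HISI_37X case above further overrides a single pointer after the arch-level table has been chosen.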
diff --git a/decoder/arm/ihevcd_function_selector_a9q.c b/decoder/arm/ihevcd_function_selector_a9q.c
new file mode 100644
index 0000000..ea5b8c0
--- /dev/null
+++ b/decoder/arm/ihevcd_function_selector_a9q.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_a9q.c
+*
+* @brief
+* Contains functions to initialize the a9q (Cortex-A9, NEON) function pointers used in the HEVC decoder
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_a9q(codec_t *ps_codec)
+{
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz_a9q;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert_a9q;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert_a9q;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering_neonintr;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar_a9q;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver_a9q;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1_a9q;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4_a9q;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8_a9q;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16_a9q;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32_a9q;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8_a9q;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy_a9q;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8_a9q;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset_a9q;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8_a9q;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit_a9q;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma_a9q;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma_a9q;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma_a9q;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma_a9q;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi_a9q;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default_a9q;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni_a9q;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi_neonintr;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default_neonintr;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni_neonintr;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma_a9q;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3_a9q;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma_a9q;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888_a9q;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p_a9q;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma_a9q;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma_a9q;
+}
diff --git a/decoder/arm/ihevcd_function_selector_noneon.c b/decoder/arm/ihevcd_function_selector_noneon.c
new file mode 100644
index 0000000..b5c9f6a
--- /dev/null
+++ b/decoder/arm/ihevcd_function_selector_noneon.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_noneon.c
+*
+* @brief
+* Contains functions to initialize the non-NEON function pointers used in the HEVC decoder
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_noneon(codec_t *ps_codec)
+{
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma;
+}
diff --git a/decoder/arm/ihevcd_itrans_recon_dc_chroma.s b/decoder/arm/ihevcd_itrans_recon_dc_chroma.s
new file mode 100644
index 0000000..6732ce0
--- /dev/null
+++ b/decoder/arm/ihevcd_itrans_recon_dc_chroma.s
@@ -0,0 +1,193 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@* ihevcd_itrans_recon_dc_chroma.s
+@*
+@* @brief
+@* contains function definitions itrans and recon for dc only case
+@*
+@* @author
+@* ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
+.text
+
+
+.globl ihevcd_itrans_recon_dc_chroma_a9q
+
+.type ihevcd_itrans_recon_dc_chroma_a9q, %function
+
+ihevcd_itrans_recon_dc_chroma_a9q:
+
+@void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 log2_trans_size,
+@ word16 i2_coeff_value)
+
+@r0:pu1_pred
+@r1:pu1_dest
+@r2:pred_strd
+@r3:dst_strd
+
+
+
+ push {r0-r11,lr}
+ ldr r4,[sp,#0x34] @loads log2_trans_size
+ ldr r5,[sp,#0x38] @ loads i2_coeff_value
+
+ mov r10,#1
+ lsl r4,r10,r4 @ trans_size = (1 << log2_trans_size)@
+ mov r6,#64 @ 1 << (shift1 - 1)@
+ mov r7,#2048 @ 1<<(shift2-1)
+
+ add r8,r6,r5,lsl #6
+ ssat r8,#16,r8,asr #7
+ add r5,r7,r8,lsl #6
+ ssat r6,#16,r5,asr #12
+ mov r9,r4
+ mov r8,r4
+
+ @ r6 has the dc_value
+ @ r4 has the trans_size value
+ @ r8 has the row value
+ @ r9 has the col value
+ vdup.s16 q0,r6
+ cmp r4,#4
+ beq row_loop_4chroma
+
+
+row_loop_chroma:
+ mov r9,r4
+
+
+col_loop_chroma:
+
+ mov r7,r0
+ vld2.8 {d2,d3},[r7],r2
+ vld2.8 {d4,d5},[r7],r2
+ vld2.8 {d6,d7},[r7],r2
+ vld2.8 {d8,d9},[r7],r2
+
+ vld2.8 {d10,d11},[r7],r2
+ vld2.8 {d12,d13},[r7],r2
+ vld2.8 {d14,d15},[r7],r2
+ vld2.8 {d16,d17},[r7]
+
+ add r0,r0,#16
+
+
+ vaddw.u8 q15,q0,d2
+ vaddw.u8 q14,q0,d4
+ vaddw.u8 q13,q0,d6
+ vaddw.u8 q12,q0,d8
+ vaddw.u8 q11,q0,d10
+ vaddw.u8 q10,q0,d12
+ vaddw.u8 q9,q0,d14
+
+
+ mov r11,r1
+ vqmovun.s16 d2,q15
+ vqmovun.s16 d4,q14
+ vqmovun.s16 d6,q13
+ vqmovun.s16 d8,q12
+
+ vaddw.u8 q15,q0,d16
+
+ vqmovun.s16 d10,q11
+ vqmovun.s16 d12,q10
+ vqmovun.s16 d14,q9
+ vqmovun.s16 d16,q15
+
+ vst2.8 {d2,d3},[r11],r3
+ vst2.8 {d4,d5},[r11],r3
+ vst2.8 {d6,d7},[r11],r3
+ vst2.8 {d8,d9},[r11],r3
+
+ vst2.8 {d10,d11},[r11],r3
+ vst2.8 {d12,d13},[r11],r3
+ vst2.8 {d14,d15},[r11],r3
+ vst2.8 {d16,d17},[r11]
+
+ add r1,r1,#16
+
+ subs r9,r9,#8
+ bgt col_loop_chroma
+
+ subs r8,r8,#8
+
+ add r0,r0,r2,lsl #3
+ add r1,r1,r3,lsl #3
+ sub r0,r0,r4,lsl #1
+ sub r1,r1,r4,lsl #1
+ bgt row_loop_chroma
+ b end_loops_chroma
+
+
+row_loop_4chroma:
+ mov r9,r10
+
+
+col_loop_4chroma:
+
+
+ vld2.8 {d2,d3},[r0],r2
+ vld2.8 {d4,d5},[r0],r2
+ vld2.8 {d6,d7},[r0],r2
+ vld2.8 {d8,d9},[r0]
+
+
+
+
+ vaddw.u8 q15,q0,d2
+ vaddw.u8 q14,q0,d4
+ vaddw.u8 q13,q0,d6
+ vaddw.u8 q12,q0,d8
+
+
+
+ vqmovun.s16 d2,q15
+ vqmovun.s16 d4,q14
+ vqmovun.s16 d6,q13
+ vqmovun.s16 d8,q12
+
+
+ vzip.8 d2,d3
+ vzip.8 d4,d5
+ vzip.8 d6,d7
+ vzip.8 d8,d9
+
+ vst1.u32 {d2},[r1],r3
+ vst1.u32 {d4},[r1],r3
+ vst1.u32 {d6},[r1],r3
+ vst1.u32 {d8},[r1]
+
+end_loops_chroma:
+ pop {r0-r11,pc}
+
+
diff --git a/decoder/arm/ihevcd_itrans_recon_dc_luma.s b/decoder/arm/ihevcd_itrans_recon_dc_luma.s
new file mode 100644
index 0000000..8aee84c
--- /dev/null
+++ b/decoder/arm/ihevcd_itrans_recon_dc_luma.s
@@ -0,0 +1,193 @@
+@/*****************************************************************************
+@*
+@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+@*
+@* Licensed under the Apache License, Version 2.0 (the "License");
+@* you may not use this file except in compliance with the License.
+@* You may obtain a copy of the License at:
+@*
+@* http://www.apache.org/licenses/LICENSE-2.0
+@*
+@* Unless required by applicable law or agreed to in writing, software
+@* distributed under the License is distributed on an "AS IS" BASIS,
+@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@* See the License for the specific language governing permissions and
+@* limitations under the License.
+@*
+@*****************************************************************************/
+@/**
+@/*******************************************************************************
+@* @file
+@* ihevcd_itrans_recon_dc_luma.s
+@*
+@* @brief
+@* contains function definitions itrans and recon for dc only case
+@*
+@* @author
+@* ittiam
+@*
+@* @par list of functions:
+@*
+@*
+@* @remarks
+@* none
+@*
+@*******************************************************************************/
+
+.text
+
+
+
+.globl ihevcd_itrans_recon_dc_luma_a9q
+
+.type ihevcd_itrans_recon_dc_luma_a9q, %function
+
+ihevcd_itrans_recon_dc_luma_a9q:
+
+@void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
+@ uword8 *pu1_dst,
+@ word32 pred_strd,
+@ word32 dst_strd,
+@ word32 log2_trans_size,
+@ word16 i2_coeff_value)
+
+@r0:pu1_pred
+@r1:pu1_dest
+@r2:pred_strd
+@r3:dst_strd
+
+
+
+ push {r0-r11,lr}
+ ldr r4,[sp,#0x34] @loads log2_trans_size
+ ldr r5,[sp,#0x38] @ loads i2_coeff_value
+
+ mov r10,#1
+ lsl r4,r10,r4 @ trans_size = (1 << log2_trans_size)@
+ mov r6,#64 @ 1 << (shift1 - 1)@
+ mov r7,#2048 @ 1<<(shift2-1)
+
+ add r8,r6,r5,lsl #6
+ ssat r8,#16,r8,asr #7
+ add r5,r7,r8,lsl #6
+ ssat r6,#16,r5,asr #12
+ mov r9,r4
+ mov r8,r4
+
+ @ r6 has the dc_value
+ @ r4 has the trans_size value
+ @ r8 has the row value
+ @ r9 has the col value
+ vdup.s16 q0,r6
+ cmp r4,#4
+ beq row_loop_4
+
+
+row_loop:
+ mov r9,r4
+
+
+col_loop:
+
+ mov r7,r0
+ vld1.8 d2,[r7],r2
+ vld1.8 d3,[r7],r2
+ vld1.8 d4,[r7],r2
+ vld1.8 d5,[r7],r2
+
+ vld1.8 d6,[r7],r2
+ vld1.8 d7,[r7],r2
+ vld1.8 d8,[r7],r2
+ vld1.8 d9,[r7]
+
+ add r0,r0,#8
+
+
+ vaddw.u8 q15,q0,d2
+ vaddw.u8 q14,q0,d3
+ vaddw.u8 q13,q0,d4
+ vaddw.u8 q12,q0,d5
+ vaddw.u8 q11,q0,d6
+ vaddw.u8 q10,q0,d7
+ vaddw.u8 q9,q0,d8
+ vaddw.u8 q8,q0,d9
+
+ mov r11,r1
+ vqmovun.s16 d2,q15
+ vqmovun.s16 d3,q14
+ vqmovun.s16 d4,q13
+ vqmovun.s16 d5,q12
+ vqmovun.s16 d6,q11
+ vqmovun.s16 d7,q10
+ vqmovun.s16 d8,q9
+ vqmovun.s16 d9,q8
+
+
+ vst1.u32 {d2},[r11],r3
+ vst1.u32 {d3},[r11],r3
+ vst1.u32 {d4},[r11],r3
+ vst1.u32 {d5},[r11],r3
+ vst1.u32 {d6},[r11],r3
+ vst1.u32 {d7},[r11],r3
+ vst1.u32 {d8},[r11],r3
+ vst1.u32 {d9},[r11]
+
+ add r1,r1,#8
+
+ subs r9,r9,#8
+ bgt col_loop
+
+ subs r8,r8,#8
+
+ add r0,r0,r2,lsl #3
+ add r1,r1,r3,lsl #3
+ sub r0,r0,r4
+ sub r1,r1,r4
+ bgt row_loop
+ b end_loops
+
+
+row_loop_4:
+ mov r9,r10
+
+
+col_loop_4:
+
+
+ vld1.8 d2,[r0],r2
+ vld1.8 d3,[r0],r2
+ vld1.8 d4,[r0],r2
+ vld1.8 d5,[r0]
+
+
+
+
+ vaddw.u8 q15,q0,d2
+ vaddw.u8 q14,q0,d3
+ vaddw.u8 q13,q0,d4
+ vaddw.u8 q12,q0,d5
+
+
+
+ vqmovun.s16 d2,q15
+ vqmovun.s16 d3,q14
+ vqmovun.s16 d4,q13
+ vqmovun.s16 d5,q12
+
+
+
+ vst1.u32 {d2[0]},[r1],r3
+ vst1.u32 {d3[0]},[r1],r3
+ vst1.u32 {d4[0]},[r1],r3
+ vst1.u32 {d5[0]},[r1]
+
+end_loops:
+ pop {r0-r11,pc}
+
+
+
+
+
+
+
+
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s
new file mode 100644
index 0000000..4cc6085
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s
@@ -0,0 +1,209 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevcd_fmt_conv_420sp_to_420p.s
+//*
+//* //brief
+//* contains function definitions for format conversions
+//*
+//* //author
+//* ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+
+.text
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+///*****************************************************************************
+//* *
+//* Function Name : ihevcd_fmt_conv_420sp_to_420p() *
+//* *
+//* Description : This function converts the image from the YUV420SP *
+//* (UV interleaved) color space to the YUV420P (planar) color space. *
+//* *
+//* Arguments : x0 pu1_src_y *
+//* x1 pu1_src_uv *
+//* x2 pu1_dest_y *
+//* x3 pu1_dest_u *
+//* [x13 #40] pu1_dest_v *
+//* [x13 #44] u2_width *
+//* [x13 #48] u2_height *
+//* [x13 #52] u2_stridey *
+//* [x13 #56] u2_strideuv *
+//* [x13 #60] u2_dest_stridey *
+//* [x13 #64] u2_dest_strideuv *
+//* [x13 #68] is_u_first *
+//* [x13 #72] disable_luma_copy *
+//* *
+//* Values Returned : None *
+//* *
+//* Register Usage : x0 - x14 *
+//* *
+//* Stack Usage : 40 Bytes *
+//* *
+//* Interruptibility : Interruptible *
+//* *
+//* Known Limitations *
+//* Assumptions: Image Width: Assumed to be multiple of 2 and *
+//* Image Height: Assumed to be even. *
+//* *
+//* Revision History : *
+//* DD MM YYYY Author(s) Changes (Describe the changes made) *
+//* 16 05 2012 Naveen SR draft *
+//* *
+//*****************************************************************************/
+
+.globl ihevcd_fmt_conv_420sp_to_420p_av8
+
+.type ihevcd_fmt_conv_420sp_to_420p_av8, %function
+
+ihevcd_fmt_conv_420sp_to_420p_av8:
+ // STMFD sp!,{x4-x12, x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ mov x15, x4
+ mov x8, x5 ////Load u2_width
+ mov x9, x6 ////Load u2_height
+
+ LDR w5, [sp,#88] ////Load u2_dest_stridey
+ sxtw x5,w5
+// LDR x6,[sp,#80] @//Load u2_strideuv
+
+ SUB x10,x7,x8 //// Src Y increment
+ SUB x11,x5,x8 //// Dst Y increment
+
+ LDR w5, [sp,#112] ////Load disable_luma_copy flag
+ sxtw x5,w5
+ CMP x5,#0 ////skip luma if disable_luma_copy is non-zero
+ BNE uv_copy_start
+
+ ///* Copy Y */
+
+ MOV x4,x9 //// Copying height
+y_row_loop:
+ MOV x6,x8 //// Copying width
+
+y_col_loop:
+
+ SUB x6,x6,#16
+ ld1 {v0.8b, v1.8b},[x0],#16
+ st1 {v0.8b, v1.8b},[x2],#16
+ CMP x6,#16
+ BGE y_col_loop
+ CMP x6,#0
+ BEQ y_col_loop_end
+ ////If width is not a multiple of 16, step back a few bytes so that 16 bytes can still be read.
+ ////E.g. if width is 162, the loop above processes 160 pixels; both source and
+ ////destination are then moved back to pixel 146 and the last 16 bytes are
+ ////read and written (with overlap) using LD1 and ST1.
+ sub x20,x6,#16
+ neg x6, x20
+ SUB x0,x0,x6
+ SUB x2,x2,x6
+ ld1 {v0.8b, v1.8b}, [x0],#16
+ st1 {v0.8b, v1.8b}, [x2],#16
+
+y_col_loop_end:
+ ADD x0, x0, x10
+ ADD x2, x2, x11
+ SUBS x4, x4, #1
+ BGT y_row_loop
+
+
+ ///* Copy UV */
+uv_copy_start:
+
+ LDR w5, [sp,#96] ////Load u2_dest_strideuv
+ sxtw x5,w5
+ LDR w7, [sp,#80] ////Load u2_strideuv
+ sxtw x7,w7
+
+ LSR x9, x9, #1 //// height/2
+// MOV x8,x8,LSR #1 @// Width/2
+
+ SUB x10,x7,x8 //// Src UV increment
+ LSR x11, x8, #1
+ SUB x11,x5,x11 //// Dst U and V increment
+
+ mov x5, x15 ////Load pu1_dest_v
+
+ LDR w4, [sp,#104] ////Load is_u_first_flag
+ sxtw x4,w4
+ CMP x4,#0 ////Swap U and V dest if is_u_first_flag is zero
+ csel x4, x5, x4,EQ
+ csel x5, x3, x5,EQ
+ csel x3, x4, x3,EQ
+
+ MOV x4,x9 //// Copying height
+uv_row_loop:
+ MOV x6,x8 //// Copying width
+
+uv_col_loop:
+
+ SUB x6,x6,#16
+
+ prfm PLDL1KEEP,[x1,#128]
+ ld2 {v0.8b, v1.8b},[x1],#16
+ ST1 {v0.8b},[x3],#8
+ ST1 {v1.8b},[x5],#8
+ CMP x6,#16
+ BGE uv_col_loop
+ CMP x6,#0
+ BEQ uv_col_loop_end
+ ////If width is not a multiple of 16, step back a few bytes so that 16 bytes can still be read.
+ ////E.g. if width is 162, the loop above processes 160 pixels; the source moves
+ ////back to pixel 146 (the U and V destinations by half that) and the last 16
+ ////bytes are read with LD2 and written with ST1 (with overlap).
+ sub x20,x6,#16
+ neg x6, x20
+ SUB x1,x1,x6
+ SUB x3,x3,x6,LSR #1
+ SUB x5,x5,x6,LSR #1
+ ld2 {v0.8b, v1.8b}, [x1],#16
+ ST1 {v0.8b},[x3],#8
+ ST1 {v1.8b},[x5],#8
+uv_col_loop_end:
+ ADD x1, x1, x10
+ ADD x3, x3, x11
+ ADD x5, x5, x11
+ SUBS x4, x4, #1
+ BGT uv_row_loop
+
+exit:
+ // LDMFD sp!,{x4-x12, pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
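The non-multiple-of-16 handling above is a standard overlapped-tail trick; in scalar C it can be modeled as below, with memcpy standing in for the vector load/store pair. Names are illustrative, and rows are assumed to be at least 16 bytes wide.

    #include <stdint.h>
    #include <string.h>

    /* Copy one row, 16 bytes at a time; if width is not a multiple of
     * 16, the final chunk is re-aligned to end exactly at the row end,
     * overlapping bytes that were already copied rather than reading
     * or writing past the row. */
    static void copy_row_overlap16(uint8_t *dst, const uint8_t *src, int32_t width)
    {
        int32_t i;
        for (i = 0; i + 16 <= width; i += 16)
            memcpy(dst + i, src + i, 16);
        if (i < width) /* e.g. width 162: redo bytes 146..161, overlapping 146..159 */
            memcpy(dst + width - 16, src + width - 16, 16);
    }

This is safe here because source and destination are distinct buffers, so rewriting already-copied bytes is idempotent.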
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s
new file mode 100644
index 0000000..ccf47a5
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s
@@ -0,0 +1,207 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevcd_fmt_conv_420sp_to_420sp.s
+//*
+//* //brief
+//* contains function definitions for format conversions
+//*
+//* //author
+//* ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+ .equ DO1STROUNDING, 0
+
+ // ARM
+ //
+ // PRESERVE8
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+
+
+
+///*****************************************************************************
+//* *
+//* Function Name : ihevcd_fmt_conv_420sp_to_420sp() *
+//* *
+//* Description : This function converts the image from YUV420SP color *
+//* space to 420SP color space (UV interleaved). *
+//* *
+//* Arguments : x0 pu1_y *
+//* x1 pu1_uv *
+//* x2 pu1_dest_y *
+//* x3 pu1_dest_uv *
+//* [x13 #40] u2_width *
+//* [x13 #44] u2_height *
+//* [x13 #48] u2_stridey *
+//* [x13 #52] u2_stridechroma *
+//* [x13 #56] u2_dest_stridey *
+//* [x13 #60] u2_dest_stridechroma *
+//* *
+//* Values Returned : None *
+//* *
+//* Register Usage : x0 - x14 *
+//* *
+//* Stack Usage : 40 Bytes *
+//* *
+//* Interruptibility : Interruptible *
+//* *
+//* Known Limitations *
+//* Assumptions: Image Width: Assumed to be multiple of 2 and *
+//* Image Height: Assumed to be even. *
+//* *
+//* Revision History : *
+//* DD MM YYYY Author(s) Changes (Describe the changes made) *
+//* 16 05 2012 Naveen SR draft *
+//* *
+//*****************************************************************************/
+
+ .global ihevcd_fmt_conv_420sp_to_420sp_av8
+.type ihevcd_fmt_conv_420sp_to_420sp_av8, %function
+ihevcd_fmt_conv_420sp_to_420sp_av8:
+
+ // STMFD sp!,{x4-x12, x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+ mov x8, x4 ////Load u2_width
+ mov x9, x5 ////Load u2_height
+
+ LDR w5, [sp,#80] ////Load u2_dest_stridey
+ sxtw x5,w5
+
+ mov x7, x6 ////Load u2_stridey
+
+ SUB x10,x7,x8 //// Src Y increment
+ SUB x11,x5,x8 //// Dst Y increment
+
+ ///* Copy Y */
+
+ MOV x4,x9 //// Copying height
+y_row_loop:
+ MOV x6,x8 //// Copying width
+
+y_col_loop:
+ prfm PLDL1KEEP,[x0, #128]
+ SUB x6,x6,#32
+ LD1 {v0.8b},[x0],#8
+ LD1 {v1.8b},[x0],#8
+ LD1 {v2.8b},[x0],#8
+ LD1 {v3.8b},[x0],#8
+ ST1 {v0.8b},[x2],#8
+ ST1 {v1.8b},[x2],#8
+ ST1 {v2.8b},[x2],#8
+ ST1 {v3.8b},[x2],#8
+ CMP x6,#32
+ BGE y_col_loop
+ CMP x6,#0
+ BEQ y_col_loop_end
+ ////If width is not a multiple of 32, step back a few bytes so that 32 bytes can still be read.
+ ////E.g. if width is 162, the loop above processes 160 pixels; both source and
+ ////destination are then moved back to pixel 130 and the last 32 bytes are
+ ////read and written (with overlap) using LD1 and ST1.
+ sub x20,x6,#32
+ neg x6, x20
+ SUB x0,x0,x6
+ SUB x2,x2,x6
+ LD1 {v0.8b},[x0],#8
+ LD1 {v1.8b},[x0],#8
+ LD1 {v2.8b},[x0],#8
+ LD1 {v3.8b},[x0],#8
+ ST1 {v0.8b},[x2],#8
+ ST1 {v1.8b},[x2],#8
+ ST1 {v2.8b},[x2],#8
+ ST1 {v3.8b},[x2],#8
+
+y_col_loop_end:
+ ADD x0, x0, x10
+ ADD x2, x2, x11
+ SUBS x4, x4, #1
+ BGT y_row_loop
+
+
+
+ ///* Copy UV */
+
+ LDR w5, [sp,#88] ////Load u2_dest_stridechroma
+ sxtw x5,w5
+
+ LSR x9, x9, #1 //// height/2
+// MOV x8,x8,LSR #1 @// Width/2
+
+ MOV x2,x3 //pu1_dest_uv
+
+ SUB x10,x7,x8 //// Src UV increment
+ SUB x11,x5,x8 //// Dst UV increment
+
+ MOV x4,x9 //// Copying height
+uv_row_loop:
+ MOV x6,x8 //// Copying width
+
+uv_col_loop:
+
+ prfm PLDL1KEEP,[x1, #128]
+ SUB x6,x6,#16
+ LD1 {v0.8b},[x1],#8
+ LD1 {v1.8b},[x1],#8
+ ST1 {v0.8b},[x2],#8
+ ST1 {v1.8b},[x2],#8
+ CMP x6,#16
+ BGE uv_col_loop
+ CMP x6,#0
+ BEQ u_col_loop_end
+ ////If width is not a multiple of 16, step back a few bytes so that 16 bytes can still be read.
+ ////E.g. if width is 162, the loop above processes 160 pixels; both source and
+ ////destination are then moved back to pixel 146 and the last 16 bytes are
+ ////read and written (with overlap) using LD1 and ST1.
+ sub x20,x6,#16
+ neg x6, x20
+ SUB x1,x1,x6
+ SUB x2,x2,x6
+ LD1 {v0.8b},[x1],#8
+ LD1 {v1.8b},[x1],#8
+ ST1 {v0.8b},[x2],#8
+ ST1 {v1.8b},[x2],#8
+
+u_col_loop_end:
+ ADD x1, x1, x10
+ ADD x2, x2, x11
+ SUBS x4, x4, #1
+ BGT uv_row_loop
+
+exit:
+ // LDMFD sp!,{x4-x12, pc}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+ .section .note.GNU-stack,"",%progbits
+
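As a reading aid, the routine above amounts to two strided plane copies; in 420SP the interleaved UV plane has height/2 rows of width bytes each, since U and V share every row. A C model under those assumptions, collapsing the separate luma/chroma stride arguments to one source and one destination stride for brevity (names illustrative):

    #include <stdint.h>
    #include <string.h>

    static void fmt_conv_420sp_to_420sp_model(const uint8_t *pu1_y, const uint8_t *pu1_uv,
                                              uint8_t *pu1_dest_y, uint8_t *pu1_dest_uv,
                                              int32_t width, int32_t height,
                                              int32_t src_strd, int32_t dst_strd)
    {
        /* Luma: height rows of width bytes. */
        for (int32_t row = 0; row < height; row++)
            memcpy(pu1_dest_y + row * dst_strd, pu1_y + row * src_strd, width);

        /* Interleaved chroma: height/2 rows, also width bytes per row. */
        for (int32_t row = 0; row < height / 2; row++)
            memcpy(pu1_dest_uv + row * dst_strd, pu1_uv + row * src_strd, width);
    }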
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
new file mode 100644
index 0000000..485ee66
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -0,0 +1,523 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevcd_fmt_conv_420sp_to_rgba8888.s
+//*
+//* //brief
+//* contains function definitions for format conversions
+//*
+//* //author
+//* ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+
+ .equ DO1STROUNDING, 0
+
+ // ARM
+ //
+ // PRESERVE8
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+
+
+///*****************************************************************************
+//* *
+//* Function Name : ihevcd_fmt_conv_420sp_to_rgba8888() *
+//* *
+//* Description : This function converts the image from YUV420SP color *
+//* space to RGBA8888 color space. The function can be *
+//* invoked at the MB level. *
+//* *
+//* Arguments : x0 pubY *
+//* x1 pubUV *
+//* x2 pusRGB *
+//* x3 pusRGB *
+//* [x13 #40] usHeight *
+//* [x13 #44] usWidth *
+//* [x13 #48] usStrideY *
+//* [x13 #52] usStrideU *
+//* [x13 #56] usStrideV *
+//* [x13 #60] usStrideRGB *
+//* *
+//* Values Returned : None *
+//* *
+//* Register Usage : x0 - x14 *
+//* *
+//* Stack Usage : 40 Bytes *
+//* *
+//* Interruptibility : Interruptible *
+//* *
+//* Known Limitations *
+//* Assumptions: Image Width: Assumed to be multiple of 16 and *
+//* greater than or equal to 16 *
+//* Image Height: Assumed to be even. *
+//* *
+//* Revision History : *
+//* DD MM YYYY Author(s) Changes (Describe the changes made) *
+//* 07 06 2010 Varshita Draft *
+//* 07 06 2010 Naveen Kr T Completed *
+//* 05 08 2013 Naveen K P Modified for HEVC *
+//*****************************************************************************/
+ .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
+.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, %function
+ihevcd_fmt_conv_420sp_to_rgba8888_av8:
+
+ //// push the registers on the stack
+ // STMFD sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+
+ ////x0 - Y PTR
+ ////x1 - UV PTR
+ ////x2 - RGB PTR
+ ////x3 - PIC WIDTH
+ ////x5 - PIC HT
+ ////x6 - STRIDE Y
+ ////x7 - STRIDE U
+ ////x8 - STRIDE V
+ ////x9 - STRIDE RGB
+
+ ////ONE ROW PROCESSING AT A TIME
+
+ ////THE FOUR CONSTANTS ARE:
+ ////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
+
+ //PLD [x0]
+ //PLD [x1]
+ //PLD [x2]
+
+
+ ///* can be loaded from a defined const type */
+ mov x10,#0x3311
+ mov v0.4h[0], w10 ////C1
+
+ mov x10,#0xF379
+ mov v0.4h[1], w10 ////C2
+
+ mov x10,#0xE5F8
+ mov v0.4h[2], w10 ////C3
+
+ mov x10,#0x4092
+ mov v0.4h[3], w10 ////C4
+
+ ////LOAD CONSTANT 128 INTO A CORTEX REGISTER
+ MOV x10,#128
+ dup v1.8b,w10
+
+ ////D0 HAS C1-C2-C3-C4
+ //// load other parameters from stack
+ mov x9, x7
+ mov x7, x6
+ mov x6, x5
+ mov x5, x4
+ //LDR x4,[sp,#44]
+ //LDR x8,[sp,#52]
+
+ //// calculate offsets, offset = stride - width
+ SUB x10,x6,x3 //// luma offset
+ SUB x11,x7,x3
+ //, LSR #1 @// u offset
+ //SUB x12,x8,x3, LSR #1 @// v offset
+ SUB x14,x9,x3 //// rgb offset in pixels
+
+ //// calculate height loop count
+ LSR x5, x5, #1 //// height_cnt = height / 2
+
+ //// create next row pointers for rgb and luma data
+ ADD x7,x0,x6 //// luma_next_row = luma + luma_stride
+ ADD x8,x2,x9,LSL #2 //// rgb_next_row = rgb + rgb_stride
+
+LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
+
+ ////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
+ LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD 8 VALUES OF UV
+ ////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
+
+ //// calculate width loop count
+ LSR x6, x3, #4 //// width_cnt = width / 16
+
+ ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+ ////LOAD VALUES OF Y 8-BIT VALUES
+ LD2 {v30.8b, v31.8b},[x0],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+ ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+ LD2 {v28.8b, v29.8b},[x7],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
+ ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+ SUBS x6,x6,#1
+ BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
+ //VMOV.I8 Q1,#128
+ UZP1 v27.8b, v2.8b, v3.8b
+ UZP2 v3.8b, v2.8b, v3.8b
+ mov v2.d[0], v27.d[0]
+
+ ////NEED TO SUBTRACT (U-128) AND (V-128)
+ ////(D2-D1),(D3-D1)
+ uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
+ uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
+
+ ////LOAD U&V VALUES FOR THE NEXT ITERATION
+ LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD 8 INTERLEAVED UV PAIRS
+ ////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
+
+ //PLD [x0]
+ prfm PLDL1KEEP,[x1]
+
+ ////NEED TO MULTIPLY Q2,Q3 WITH THE COEFFICIENTS
+ sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+
+ sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+
+ sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ ////Q4 - WEIGHT FOR B
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ ////Q5 - WEIGHT FOR R
+
+ ////NARROW RIGHT SHIFT BY 13 FOR G
+ sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ ////Q6 - WEIGHT FOR G
+
+ UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x2],#16
+ ST1 {v20.4s},[x2],#16
+ ST1 {v16.4s},[x2],#16
+ ST1 {v22.4s},[x2],#16
+
+ ////v14-v23 TOTALLY HAVE THE 16 PIXEL VALUES OF ROW 1 (NOW STORED)
+ ////REUSE THE R,G,B WEIGHTS FOR THE SECOND ROW'S Y VALUES
+ UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v28.8b ////Q8 - HAS Y + R
+ UADDW v18.8h, v12.8h , v28.8b ////Q9 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
+
+ ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
+ ////LOAD VALUES OF Y 8-BIT VALUES
+ LD2 {v30.8b, v31.8b},[x0],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+ ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+ LD2 {v28.8b, v29.8b},[x7],#16 ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
+ ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+ prfm PLDL1KEEP,[x0]
+ prfm PLDL1KEEP,[x7]
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x8],#16
+ ST1 {v20.4s},[x8],#16
+ ST1 {v16.4s},[x8],#16
+ ST1 {v22.4s},[x8],#16
+
+ SUBS x6,x6,#1 //// width_cnt -= 1
+ BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
+ //VMOV.I8 Q1,#128
+ UZP1 v27.8b, v2.8b, v3.8b
+ UZP2 v3.8b, v2.8b, v3.8b
+ mov v2.d[0], v27.d[0]
+
+
+    ////SUBTRACT 128 FROM U AND V TO GET (U-128) AND (V-128)
+ ////(D2-D1),(D3-D1)
+ uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
+ uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
+
+
+    ////NEED TO MULTIPLY Q2,Q3 WITH COEFFICIENTS
+ sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+
+ sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+
+ sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ ////Q4 - WEIGHT FOR B
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ ////Q5 - WEIGHT FOR R
+
+ ////NARROW RIGHT SHIFT BY 13 FOR G
+ sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ ////Q6 - WEIGHT FOR G
+
+ UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x2],#16
+ ST1 {v20.4s},[x2],#16
+ ST1 {v16.4s},[x2],#16
+ ST1 {v22.4s},[x2],#16
+
+    ////D14-D20 - TOTALLY HAVE 16 VALUES
+    ////REPEAT THE ADD/CLIP/INTERLEAVE SEQUENCE FOR THE SECOND ROW OF Y
+ UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x8],#16
+ ST1 {v20.4s},[x8],#16
+ ST1 {v16.4s},[x8],#16
+ ST1 {v22.4s},[x8],#16
+
+ //// Adjust the address pointers
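+    //// (a sketch, assuming x10/x14/x11 hold the stride-minus-width skip
+    ////  offsets computed in the prologue, which is not shown here: each
+    ////  pointer becomes ptr_next_row + skip, stepping over the two rows
+    ////  just written and landing on the next row pair)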
+ ADD x0,x7,x10 //// luma = luma_next + offset
+ ADD x2,x8,x14,LSL #2 //// rgb = rgb_next + offset
+
+ ADD x7,x0,x3 //// luma_next = luma + width
+ ADD x8,x2,x3,LSL #2 //// rgb_next_row = rgb + width
+
+ ADD x1,x1,x11 //// adjust u pointer
+ //ADD x2,x2,x12 @// adjust v pointer
+
+ ADD x7,x7,x10 //// luma_next = luma + width + offset (because of register crunch)
+ ADD x8,x8,x14,LSL #2 //// rgb_next_row = rgb + width + offset
+
+ SUBS x5,x5,#1 //// height_cnt -= 1
+
+ BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
+
+ ////POP THE REGISTERS
+ // LDMFD sp!,{x4-x12,PC}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+ .section .note.GNU-stack,"",%progbits
+
diff --git a/decoder/arm64/ihevcd_function_selector_av8.c b/decoder/arm64/ihevcd_function_selector_av8.c
new file mode 100644
index 0000000..210c730
--- /dev/null
+++ b/decoder/arm64/ihevcd_function_selector_av8.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_av8.c
+*
+* @brief
+* Contains functions to initialize av8 function pointers used in the hevc decoder
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
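+/* Populates the codec's function-pointer table with the ARMv8 ("av8")
+ * assembly implementations; entries without an av8 version fall back to
+ * NEON-intrinsics (_neonintr) or generic C implementations. */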
+void ihevcd_init_function_ptr_av8(codec_t *ps_codec)
+{
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz_av8;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert_av8;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert_av8;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out_av8;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering_neonintr;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar_av8;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver_av8;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1_av8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4_av8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8_av8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16_av8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32_av8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8_av8;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy_av8;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8_av8;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset_av8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8_av8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit_av8;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma_av8;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma_av8;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma_av8;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma_av8;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi_av8;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default_av8;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni_av8;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi_neonintr;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default_neonintr;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni_neonintr;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma_av8;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3_av8;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma_av8;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888_av8;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp_av8;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p_av8;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma_av8;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma_av8;
+}
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_chroma.s b/decoder/arm64/ihevcd_itrans_recon_dc_chroma.s
new file mode 100644
index 0000000..9d1e8a4
--- /dev/null
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_chroma.s
@@ -0,0 +1,220 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevcd_itrans_recon_dc_chroma.s
+//*
+//* //brief
+//* contains function definitions of itrans and recon for the dc-only case
+//*
+//* //author
+//* ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+
+
+.text
+.include "ihevc_neon_macros.s"
+
+
+.globl ihevcd_itrans_recon_dc_chroma_av8
+
+.type ihevcd_itrans_recon_dc_chroma_av8, %function
+
+ihevcd_itrans_recon_dc_chroma_av8:
+
+//void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 log2_trans_size,
+// word16 i2_coeff_value)
+
+//x0:pu1_pred
+//x1:pu1_dest
+//x2:pred_strd
+//x3:dst_strd
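+//w4:log2_trans_size
+//w5:i2_coeff_value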
+
+
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+    sxth x5, w5    // the argument is a WORD16; sign-extend it into the x register
+
+ mov x10,#1
+ lsl x4,x10,x4 // trans_size = (1 << log2_trans_size)//
+ mov x6,#64 // 1 << (shift1 - 1)//
+ mov x7,#2048 // 1<<(shift2-1)
+
+ add x8,x6,x5,lsl #6
+ asr x20, x8, #7
+ mov x19,#32767
+ cmp x20,x19
+ blt lbl36
+ mov x8,#32767
+ b lbl36_1
+lbl36:
+ mov x19,#-32768
+ cmp x20,x19
+ csel x8, x19, x20, lt
+lbl36_1:
+
+ add x5,x7,x8,lsl #6
+ asr x20, x5, #12
+ mov x19,#32767
+ cmp x20,x19
+ blt lbl38
+ mov x6,#32767
+ b lbl38_1
+lbl38:
+ mov x19,#-32768
+ cmp x20,x19
+ csel x6, x19, x20, lt
+lbl38_1:
+
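+    // DC-only shortcut (a sketch of the arithmetic above): with all AC
+    // coefficients zero, the inverse transform collapses to two scalar
+    // scalings of the DC value, each rounded and saturated to 16 bits
+    // (shift1 = 7 and shift2 = 12 for 8-bit decoding):
+    //     tmp = clip16((i2_coeff_value*64 + 64) >> 7)
+    //     dc  = clip16((tmp*64 + 2048) >> 12)
+    // x6 now holds dc, which is broadcast and added to every prediction
+    // sample below.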
+ mov x9,x4
+ mov x8,x4
+
+ // x6 has the dc_value
+ // x4 has the trans_size value
+ // x8 has the row value
+ // x9 has the col value
+ dup v0.8h,w6
+ cmp x4,#4
+ beq row_loop_4chroma
+
+
+row_loop_chroma:
+ mov x9,x4
+
+
+col_loop_chroma:
+
+ mov x7,x0
+ ld2 {v2.8b, v3.8b},[x7],x2
+ ld2 {v4.8b, v5.8b},[x7],x2
+ ld2 {v6.8b, v7.8b},[x7],x2
+ ld2 {v8.8b, v9.8b},[x7],x2
+
+ ld2 {v10.8b, v11.8b},[x7],x2
+ ld2 {v12.8b, v13.8b},[x7],x2
+ ld2 {v14.8b, v15.8b},[x7],x2
+ ld2 {v16.8b, v17.8b},[x7]
+
+ add x0,x0,#16
+
+
+ uaddw v30.8h, v0.8h , v2.8b
+ uaddw v28.8h, v0.8h , v4.8b
+ uaddw v26.8h, v0.8h , v6.8b
+ uaddw v24.8h, v0.8h , v8.8b
+ uaddw v22.8h, v0.8h , v10.8b
+ uaddw v20.8h, v0.8h , v12.8b
+ uaddw v18.8h, v0.8h , v14.8b
+
+
+ mov x11,x1
+ sqxtun v2.8b, v30.8h
+ sqxtun v4.8b, v28.8h
+ sqxtun v6.8b, v26.8h
+ sqxtun v8.8b, v24.8h
+
+ uaddw v30.8h, v0.8h , v16.8b
+
+ sqxtun v10.8b, v22.8h
+ sqxtun v12.8b, v20.8h
+ sqxtun v14.8b, v18.8h
+ sqxtun v16.8b, v30.8h
+
+ st2 {v2.8b, v3.8b},[x11],x3
+ st2 {v4.8b, v5.8b},[x11],x3
+ st2 {v6.8b, v7.8b},[x11],x3
+ st2 {v8.8b, v9.8b},[x11],x3
+
+ st2 {v10.8b, v11.8b},[x11],x3
+ st2 {v12.8b, v13.8b},[x11],x3
+ st2 {v14.8b, v15.8b},[x11],x3
+ st2 {v16.8b, v17.8b},[x11]
+
+ add x1,x1,#16
+
+ subs x9,x9,#8
+ bgt col_loop_chroma
+
+ subs x8,x8,#8
+
+ add x0,x0,x2,lsl #3
+ add x1,x1,x3,lsl #3
+ sub x0,x0,x4,lsl #1
+ sub x1,x1,x4,lsl #1
+ bgt row_loop_chroma
+ b end_loops_chroma
+
+
+row_loop_4chroma:
+ mov x9,x10
+
+
+col_loop_4chroma:
+
+
+ ld2 {v2.8b, v3.8b},[x0],x2
+ ld2 {v4.8b, v5.8b},[x0],x2
+ ld2 {v6.8b, v7.8b},[x0],x2
+ ld2 {v8.8b, v9.8b},[x0]
+
+
+
+
+ uaddw v30.8h, v0.8h , v2.8b
+ uaddw v28.8h, v0.8h , v4.8b
+ uaddw v26.8h, v0.8h , v6.8b
+ uaddw v24.8h, v0.8h , v8.8b
+
+
+
+ sqxtun v31.8b, v30.8h
+ sqxtun v29.8b, v28.8h
+ sqxtun v27.8b, v26.8h
+ sqxtun v25.8b, v24.8h
+
+
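+    // re-interleave the reconstructed samples of this chroma plane with the
+    // untouched co-located samples of the other plane before the stores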
+ zip1 v2.8b, v31.8b, v3.8b
+ zip1 v4.8b, v29.8b, v5.8b
+ zip1 v6.8b, v27.8b, v7.8b
+ zip1 v8.8b, v25.8b, v9.8b
+
+ st1 {v2.2s},[x1],x3
+ st1 {v4.2s},[x1],x3
+ st1 {v6.2s},[x1],x3
+ st1 {v8.2s},[x1]
+
+end_loops_chroma:
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
diff --git a/decoder/arm64/ihevcd_itrans_recon_dc_luma.s b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
new file mode 100644
index 0000000..279888b
--- /dev/null
+++ b/decoder/arm64/ihevcd_itrans_recon_dc_luma.s
@@ -0,0 +1,218 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevcd_itrans_recon_dc_luma.s
+//*
+//* //brief
+//* contains function definitions of itrans and recon for the dc-only case
+//*
+//* //author
+//* ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+
+.text
+.include "ihevc_neon_macros.s"
+
+
+
+.globl ihevcd_itrans_recon_dc_luma_av8
+
+.type ihevcd_itrans_recon_dc_luma_av8, %function
+
+ihevcd_itrans_recon_dc_luma_av8:
+
+//void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 log2_trans_size,
+// word16 i2_coeff_value)
+
+//x0:pu1_pred
+//x1:pu1_dest
+//x2:pred_strd
+//x3:dst_strd
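+//w4:log2_trans_size
+//w5:i2_coeff_value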
+
+
+
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+ sxth x5,w5
+
+ mov x10,#1
+ lsl x4,x10,x4 // trans_size = (1 << log2_trans_size)//
+ mov x6,#64 // 1 << (shift1 - 1)//
+ mov x7,#2048 // 1<<(shift2-1)
+
+ add x8,x6,x5,lsl #6
+ asr x20, x8, #7
+ mov x19, #32767
+ cmp x20,x19
+ blt lbl37
+ mov x8,#32767
+ b lbl37_1
+lbl37:
+ mov x19,#-32768
+ cmp x20,x19
+ csel x8, x19, x20, lt
+lbl37_1:
+
+ add x5,x7,x8,lsl #6
+ asr x20, x5, #12
+ mov x19,#32767
+ cmp x20,x19
+ blt lbl39
+ mov x6,#32767
+ b lbl39_1
+lbl39:
+ mov x19,#-32768
+ cmp x20,x19
+ csel x6, x19, x20, lt
+lbl39_1:
+
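+    // Same two-stage DC scaling as in ihevcd_itrans_recon_dc_chroma.s:
+    //     dc = clip16((clip16((i2_coeff_value*64 + 64) >> 7)*64 + 2048) >> 12)
+    // x6 holds dc; it is broadcast below and added to each prediction sample.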
+ mov x9,x4
+ mov x8,x4
+
+ // x6 has the dc_value
+ // x4 has the trans_size value
+ // x8 has the row value
+ // x9 has the col value
+ dup v0.8h,w6
+ cmp x4,#4
+ beq row_loop_4
+
+
+row_loop:
+ mov x9,x4
+
+
+col_loop:
+
+ mov x7,x0
+ ld1 {v2.8b},[x7],x2
+ ld1 {v3.8b},[x7],x2
+ ld1 {v4.8b},[x7],x2
+ ld1 {v5.8b},[x7],x2
+
+ ld1 {v6.8b},[x7],x2
+ ld1 {v7.8b},[x7],x2
+ ld1 {v8.8b},[x7],x2
+ ld1 {v9.8b},[x7]
+
+ add x0,x0,#8
+
+
+ uaddw v30.8h, v0.8h , v2.8b
+ uaddw v28.8h, v0.8h , v3.8b
+ uaddw v26.8h, v0.8h , v4.8b
+ uaddw v24.8h, v0.8h , v5.8b
+ uaddw v22.8h, v0.8h , v6.8b
+ uaddw v20.8h, v0.8h , v7.8b
+ uaddw v18.8h, v0.8h , v8.8b
+ uaddw v16.8h, v0.8h , v9.8b
+
+ mov x11,x1
+ sqxtun v2.8b, v30.8h
+ sqxtun v3.8b, v28.8h
+ sqxtun v4.8b, v26.8h
+ sqxtun v5.8b, v24.8h
+ sqxtun v6.8b, v22.8h
+ sqxtun v7.8b, v20.8h
+ sqxtun v8.8b, v18.8h
+ sqxtun v9.8b, v16.8h
+
+
+ st1 {v2.2s},[x11],x3
+ st1 {v3.2s},[x11],x3
+ st1 {v4.2s},[x11],x3
+ st1 {v5.2s},[x11],x3
+ st1 {v6.2s},[x11],x3
+ st1 {v7.2s},[x11],x3
+ st1 {v8.2s},[x11],x3
+ st1 {v9.2s},[x11]
+
+ add x1,x1,#8
+
+ subs x9,x9,#8
+ bgt col_loop
+
+ subs x8,x8,#8
+
+ add x0,x0,x2,lsl #3
+ add x1,x1,x3,lsl #3
+ sub x0,x0,x4
+ sub x1,x1,x4
+ bgt row_loop
+ b end_loops
+
+
+row_loop_4:
+ mov x9,x10
+
+
+col_loop_4:
+
+
+ ld1 {v2.8b},[x0],x2
+ ld1 {v3.8b},[x0],x2
+ ld1 {v4.8b},[x0],x2
+ ld1 {v5.8b},[x0]
+
+
+
+
+ uaddw v30.8h, v0.8h , v2.8b
+ uaddw v28.8h, v0.8h , v3.8b
+ uaddw v26.8h, v0.8h , v4.8b
+ uaddw v24.8h, v0.8h , v5.8b
+
+
+
+ sqxtun v2.8b, v30.8h
+ sqxtun v3.8b, v28.8h
+ sqxtun v4.8b, v26.8h
+ sqxtun v5.8b, v24.8h
+
+
+
+ st1 {v2.s}[0],[x1],x3
+ st1 {v3.s}[0],[x1],x3
+ st1 {v4.s}[0],[x1],x3
+ st1 {v5.s}[0],[x1]
+
+end_loops:
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+
diff --git a/decoder/ihevcd_api.c b/decoder/ihevcd_api.c
new file mode 100644
index 0000000..c55c558
--- /dev/null
+++ b/decoder/ihevcd_api.c
@@ -0,0 +1,4753 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_api.c
+*
+* @brief
+* Contains api functions definitions for HEVC decoder
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* - api_check_struct_sanity()
+* - ihevcd_get_version()
+* - ihevcd_set_default_params()
+* - ihevcd_init()
+* - ihevcd_get_num_rec()
+* - ihevcd_fill_num_mem_rec()
+* - ihevcd_init_mem_rec()
+* - ihevcd_retrieve_memrec()
+* - ihevcd_set_display_frame()
+* - ihevcd_set_flush_mode()
+* - ihevcd_get_status()
+* - ihevcd_get_buf_info()
+* - ihevcd_set_params()
+* - ihevcd_reset()
+* - ihevcd_rel_display_frame()
+* - ihevcd_disable_deblk()
+* - ihevcd_get_frame_dimensions()
+* - ihevcd_set_num_cores()
+* - ihevcd_ctl()
+* - ihevcd_cxa_api_function()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_trace.h"
+
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_decode.h"
+#include "ihevcd_job_queue.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#include "ihevcd_statistics.h"
+
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+IV_API_CALL_STATUS_T ihevcd_get_version(CHAR *pc_version_string,
+ UWORD32 u4_version_buffer_size);
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Used to test arguments for corresponding API call
+*
+* @par Description:
+* For each command the arguments are validated
+*
+* @param[in] ps_handle
+* Codec handle at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input structure
+*
+* @param[out] pv_api_op
+* Pointer to output structure
+*
+* @returns Status of error checking
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+static IV_API_CALL_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ IVD_API_COMMAND_TYPE_T e_cmd;
+ UWORD32 *pu4_api_ip;
+ UWORD32 *pu4_api_op;
+ WORD32 i, j;
+
+ if(NULL == pv_api_op)
+ return (IV_FAIL);
+
+ if(NULL == pv_api_ip)
+ return (IV_FAIL);
+
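+    /* Every IVD input structure begins with { u4_size, e_cmd } and every
+     * output structure with { u4_size, u4_error_code }, so the second word
+     * can be read or cleared before the exact structure type is known. */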
+ pu4_api_ip = (UWORD32 *)pv_api_ip;
+ pu4_api_op = (UWORD32 *)pv_api_op;
+ e_cmd = (IVD_API_COMMAND_TYPE_T)*(pu4_api_ip + 1);
+
+ *(pu4_api_op + 1) = 0;
+ /* error checks on handle */
+ switch((WORD32)e_cmd)
+ {
+ case IV_CMD_GET_NUM_MEM_REC:
+ case IV_CMD_FILL_NUM_MEM_REC:
+ break;
+ case IV_CMD_INIT:
+ if(ps_handle == NULL)
+ {
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_HANDLE_NULL;
+ return IV_FAIL;
+ }
+
+ if(ps_handle->u4_size != sizeof(iv_obj_t))
+ {
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_HANDLE_STRUCT_SIZE_INCORRECT;
+ DEBUG("Sizes do not match. Expected: %d, Got: %d",
+ sizeof(iv_obj_t), ps_handle->u4_size);
+ return IV_FAIL;
+ }
+ break;
+ case IVD_CMD_REL_DISPLAY_FRAME:
+ case IVD_CMD_SET_DISPLAY_FRAME:
+ case IVD_CMD_GET_DISPLAY_FRAME:
+ case IVD_CMD_VIDEO_DECODE:
+ case IV_CMD_RETRIEVE_MEMREC:
+ case IVD_CMD_VIDEO_CTL:
+ if(ps_handle == NULL)
+ {
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_HANDLE_NULL;
+ return IV_FAIL;
+ }
+
+ if(ps_handle->u4_size != sizeof(iv_obj_t))
+ {
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_HANDLE_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+#if 0
+ if(ps_handle->pv_fxns != ihevcd_cxa_api_function)
+ {
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_INVALID_HANDLE_NULL;
+ return IV_FAIL;
+ }
+#endif
+
+ if(ps_handle->pv_codec_handle == NULL)
+ {
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_INVALID_HANDLE_NULL;
+ return IV_FAIL;
+ }
+ break;
+ default:
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_INVALID_API_CMD;
+ return IV_FAIL;
+ }
+
+ switch((WORD32)e_cmd)
+ {
+ case IV_CMD_GET_NUM_MEM_REC:
+ {
+ ihevcd_cxa_num_mem_rec_ip_t *ps_ip =
+ (ihevcd_cxa_num_mem_rec_ip_t *)pv_api_ip;
+ ihevcd_cxa_num_mem_rec_op_t *ps_op =
+ (ihevcd_cxa_num_mem_rec_op_t *)pv_api_op;
+ ps_op->s_ivd_num_mem_rec_op_t.u4_error_code = 0;
+
+ if(ps_ip->s_ivd_num_mem_rec_ip_t.u4_size
+ != sizeof(ihevcd_cxa_num_mem_rec_ip_t))
+ {
+ ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if(ps_op->s_ivd_num_mem_rec_op_t.u4_size
+ != sizeof(ihevcd_cxa_num_mem_rec_op_t))
+ {
+ ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_num_mem_rec_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+ }
+ break;
+ case IV_CMD_FILL_NUM_MEM_REC:
+ {
+ ihevcd_cxa_fill_mem_rec_ip_t *ps_ip =
+ (ihevcd_cxa_fill_mem_rec_ip_t *)pv_api_ip;
+ ihevcd_cxa_fill_mem_rec_op_t *ps_op =
+ (ihevcd_cxa_fill_mem_rec_op_t *)pv_api_op;
+ iv_mem_rec_t *ps_mem_rec;
+ WORD32 max_wd = ps_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd;
+ WORD32 max_ht = ps_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht;
+
+ max_wd = ALIGN64(max_wd);
+ max_ht = ALIGN64(max_ht);
+
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code = 0;
+
+ if((ps_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > sizeof(ihevcd_cxa_fill_mem_rec_ip_t))
+ || (ps_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ < sizeof(iv_fill_mem_rec_ip_t)))
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if((ps_op->s_ivd_fill_mem_rec_op_t.u4_size
+ != sizeof(ihevcd_cxa_fill_mem_rec_op_t))
+ && (ps_op->s_ivd_fill_mem_rec_op_t.u4_size
+ != sizeof(iv_fill_mem_rec_op_t)))
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if(max_wd < MIN_WD)
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_REQUESTED_WIDTH_NOT_SUPPPORTED;
+ return (IV_FAIL);
+ }
+
+ if(max_wd > MAX_WD)
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_REQUESTED_WIDTH_NOT_SUPPPORTED;
+ return (IV_FAIL);
+ }
+
+ if(max_ht < MIN_HT)
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_REQUESTED_HEIGHT_NOT_SUPPPORTED;
+ return (IV_FAIL);
+ }
+
+            if((max_ht * max_wd) > (MAX_HT * MAX_WD))
+            {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_REQUESTED_HEIGHT_NOT_SUPPPORTED;
+ return (IV_FAIL);
+ }
+
+ if(NULL == ps_ip->s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location)
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_NUM_REC_NOT_SUFFICIENT;
+ return (IV_FAIL);
+ }
+
+ /* check memrecords sizes are correct */
+ ps_mem_rec = ps_ip->s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location;
+ for(i = 0; i < MEM_REC_CNT; i++)
+ {
+ if(ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t))
+ {
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IVD_MEM_REC_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ }
+ break;
+
+ case IV_CMD_INIT:
+ {
+ ihevcd_cxa_init_ip_t *ps_ip = (ihevcd_cxa_init_ip_t *)pv_api_ip;
+ ihevcd_cxa_init_op_t *ps_op = (ihevcd_cxa_init_op_t *)pv_api_op;
+ iv_mem_rec_t *ps_mem_rec;
+ WORD32 max_wd = ps_ip->s_ivd_init_ip_t.u4_frm_max_wd;
+ WORD32 max_ht = ps_ip->s_ivd_init_ip_t.u4_frm_max_ht;
+
+ max_wd = ALIGN64(max_wd);
+ max_ht = ALIGN64(max_ht);
+
+ ps_op->s_ivd_init_op_t.u4_error_code = 0;
+
+ if((ps_ip->s_ivd_init_ip_t.u4_size > sizeof(ihevcd_cxa_init_ip_t))
+ || (ps_ip->s_ivd_init_ip_t.u4_size
+ < sizeof(ivd_init_ip_t)))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if((ps_op->s_ivd_init_op_t.u4_size != sizeof(ihevcd_cxa_init_op_t))
+ && (ps_op->s_ivd_init_op_t.u4_size
+ != sizeof(ivd_init_op_t)))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if(ps_ip->s_ivd_init_ip_t.u4_num_mem_rec != MEM_REC_CNT)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_NOT_SUFFICIENT;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if(max_wd < MIN_WD)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_WIDTH_NOT_SUPPPORTED;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if(max_wd > MAX_WD)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_WIDTH_NOT_SUPPPORTED;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if(max_ht < MIN_HT)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_HEIGHT_NOT_SUPPPORTED;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+            if((max_ht * max_wd) > (MAX_HT * MAX_WD))
+            {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_HEIGHT_NOT_SUPPPORTED;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if(NULL == ps_ip->s_ivd_init_ip_t.pv_mem_rec_location)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_NUM_REC_NOT_SUFFICIENT;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ if((ps_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420P)
+ && (ps_ip->s_ivd_init_ip_t.e_output_format
+ != IV_YUV_422ILE)
+ && (ps_ip->s_ivd_init_ip_t.e_output_format
+ != IV_RGB_565)
+ && (ps_ip->s_ivd_init_ip_t.e_output_format
+ != IV_RGBA_8888)
+ && (ps_ip->s_ivd_init_ip_t.e_output_format
+ != IV_YUV_420SP_UV)
+ && (ps_ip->s_ivd_init_ip_t.e_output_format
+ != IV_YUV_420SP_VU))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_COL_FMT_NOT_SUPPORTED;
+ DEBUG("\n");
+ return (IV_FAIL);
+ }
+
+ /* verify number of mem records */
+ if(ps_ip->s_ivd_init_ip_t.u4_num_mem_rec < MEM_REC_CNT)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_NOT_SUFFICIENT;
+ DEBUG("\n");
+ return IV_FAIL;
+ }
+
+ ps_mem_rec = ps_ip->s_ivd_init_ip_t.pv_mem_rec_location;
+ /* check memrecords sizes are correct */
+ for(i = 0; i < (WORD32)ps_ip->s_ivd_init_ip_t.u4_num_mem_rec; i++)
+ {
+ if(ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_MEM_REC_STRUCT_SIZE_INCORRECT;
+ DEBUG("i: %d\n", i);
+ return IV_FAIL;
+ }
+ /* check memrecords pointers are not NULL */
+
+ if(ps_mem_rec[i].pv_base == NULL)
+ {
+
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_BASE_NULL;
+ DEBUG("i: %d\n", i);
+ return IV_FAIL;
+
+ }
+
+ }
+
+ /* verify memtabs for overlapping regions */
+ {
+ void *start[MEM_REC_CNT];
+ void *end[MEM_REC_CNT];
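+                /* Two records overlap iff the start or end of one falls
+                 * inside the other, or one strictly contains the other;
+                 * the three checks below cover those cases in turn. */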
+
+ start[0] = (ps_mem_rec[0].pv_base);
+ end[0] = (UWORD8 *)(ps_mem_rec[0].pv_base)
+ + ps_mem_rec[0].u4_mem_size - 1;
+ for(i = 1; i < MEM_REC_CNT; i++)
+ {
+                /* This array is populated to check memtab overlap */
+ start[i] = (ps_mem_rec[i].pv_base);
+ end[i] = (UWORD8 *)(ps_mem_rec[i].pv_base)
+ + ps_mem_rec[i].u4_mem_size - 1;
+
+ for(j = 0; j < i; j++)
+ {
+ if((start[i] >= start[j]) && (start[i] <= end[j]))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_OVERLAP_ERR;
+ DEBUG("i: %d, j: %d\n", i, j);
+ return IV_FAIL;
+ }
+
+ if((end[i] >= start[j]) && (end[i] <= end[j]))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_OVERLAP_ERR;
+ DEBUG("i: %d, j: %d\n", i, j);
+ return IV_FAIL;
+ }
+
+ if((start[i] < start[j]) && (end[i] > end[j]))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_OVERLAP_ERR;
+ DEBUG("i: %d, j: %d\n", i, j);
+ return IV_FAIL;
+ }
+ }
+
+ }
+ }
+
+ {
+ iv_mem_rec_t mem_rec_ittiam_api[MEM_REC_CNT];
+ ihevcd_cxa_fill_mem_rec_ip_t s_fill_mem_rec_ip;
+ ihevcd_cxa_fill_mem_rec_op_t s_fill_mem_rec_op;
+ IV_API_CALL_STATUS_T e_status;
+
+ WORD32 i;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.e_cmd =
+ IV_CMD_FILL_NUM_MEM_REC;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location =
+ mem_rec_ittiam_api;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd =
+ max_wd;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht =
+ max_ht;
+
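+            /* Extension fields were appended to ihevcd_cxa_init_ip_t over
+             * time; u4_size says how large the caller's structure really is,
+             * so each extension field is read only when actually present and
+             * given a default otherwise. */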
+ if(ps_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t, i4_level))
+ {
+ s_fill_mem_rec_ip.i4_level = ps_ip->i4_level;
+ }
+ else
+ {
+ s_fill_mem_rec_ip.i4_level = IHEVC_LEVEL_31;
+ }
+
+ if(ps_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t,
+ u4_num_ref_frames))
+ {
+ s_fill_mem_rec_ip.u4_num_ref_frames =
+ ps_ip->u4_num_ref_frames;
+ }
+ else
+ {
+ s_fill_mem_rec_ip.u4_num_ref_frames = (MAX_REF_CNT + 1);
+ }
+
+ if(ps_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t,
+ u4_num_reorder_frames))
+ {
+ s_fill_mem_rec_ip.u4_num_reorder_frames =
+ ps_ip->u4_num_reorder_frames;
+ }
+ else
+ {
+ s_fill_mem_rec_ip.u4_num_reorder_frames = (MAX_REF_CNT + 1);
+ }
+
+ if(ps_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t,
+ u4_num_extra_disp_buf))
+ {
+ s_fill_mem_rec_ip.u4_num_extra_disp_buf =
+ ps_ip->u4_num_extra_disp_buf;
+ }
+ else
+ {
+ s_fill_mem_rec_ip.u4_num_extra_disp_buf = 0;
+ }
+
+ if(ps_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t,
+ u4_share_disp_buf))
+ {
+#ifndef LOGO_EN
+ s_fill_mem_rec_ip.u4_share_disp_buf =
+ ps_ip->u4_share_disp_buf;
+#else
+ s_fill_mem_rec_ip.u4_share_disp_buf = 0;
+#endif
+ }
+ else
+ {
+ s_fill_mem_rec_ip.u4_share_disp_buf = 0;
+ }
+
+ s_fill_mem_rec_ip.e_output_format =
+ ps_ip->s_ivd_init_ip_t.e_output_format;
+
+ if((s_fill_mem_rec_ip.e_output_format != IV_YUV_420P)
+ && (s_fill_mem_rec_ip.e_output_format
+ != IV_YUV_420SP_UV)
+ && (s_fill_mem_rec_ip.e_output_format
+ != IV_YUV_420SP_VU))
+ {
+ s_fill_mem_rec_ip.u4_share_disp_buf = 0;
+ }
+
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_size =
+ sizeof(ihevcd_cxa_fill_mem_rec_ip_t);
+ s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_size =
+ sizeof(ihevcd_cxa_fill_mem_rec_op_t);
+
+ for(i = 0; i < MEM_REC_CNT; i++)
+ mem_rec_ittiam_api[i].u4_size = sizeof(iv_mem_rec_t);
+
+ e_status = ihevcd_cxa_api_function(NULL,
+ (void *)&s_fill_mem_rec_ip,
+ (void *)&s_fill_mem_rec_op);
+ if(IV_FAIL == e_status)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code =
+ s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_error_code;
+ DEBUG("Fail\n");
+ return (IV_FAIL);
+ }
+
+ for(i = 0; i < MEM_REC_CNT; i++)
+ {
+#ifdef ARMRVDS
+ if((UWORD32)(ps_mem_rec[i].pv_base) & (mem_rec_ittiam_api[i].u4_mem_alignment - 1))
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |= IVD_INIT_DEC_MEM_REC_ALIGNMENT_ERR;
+ DEBUG("Fail\n");
+ return IV_FAIL;
+ }
+#endif
+
+ if(ps_mem_rec[i].u4_mem_size
+ < mem_rec_ittiam_api[i].u4_mem_size)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_INSUFFICIENT_SIZE;
+ DEBUG("i: %d \n", i);
+ return IV_FAIL;
+ }
+ if(ps_mem_rec[i].u4_mem_alignment
+ != mem_rec_ittiam_api[i].u4_mem_alignment)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_ALIGNMENT_ERR;
+ DEBUG("i: %d \n", i);
+ return IV_FAIL;
+ }
+ if(ps_mem_rec[i].e_mem_type
+ != mem_rec_ittiam_api[i].e_mem_type)
+ {
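+                    /* The substitution checks below appear to rely on the
+                     * iv_mem_type enum being laid out in groups of four,
+                     * with the external persistent types at values == 3
+                     * (mod 4); 'diff' measures how much "stricter" the
+                     * supplied memory is than what was requested. */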
+ UWORD32 check = IV_SUCCESS;
+ UWORD32 diff = mem_rec_ittiam_api[i].e_mem_type
+ - ps_mem_rec[i].e_mem_type;
+
+ if((ps_mem_rec[i].e_mem_type
+ <= IV_EXTERNAL_CACHEABLE_SCRATCH_MEM)
+ && (mem_rec_ittiam_api[i].e_mem_type
+ >= IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM))
+ {
+ check = IV_FAIL;
+ }
+ if(3 != (mem_rec_ittiam_api[i].e_mem_type % 4))
+ {
+ /*
+ * It is not IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM or IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM
+ */
+ if((diff < 1) || (diff > 3))
+ {
+                            // A difference between 1 and 3 is acceptable for
+                            // all types other than the two filtered out by
+                            // the MOD condition above
+ check = IV_FAIL;
+ }
+ }
+ else
+ {
+ if(diff == 1)
+ {
+ /*
+ * This particular case is when codec asked for External Persistent, but got
+ * Internal Scratch.
+ */
+ check = IV_FAIL;
+ }
+ if((diff != 2) && (diff != 3))
+ {
+ check = IV_FAIL;
+ }
+ }
+ if(check == IV_FAIL)
+ {
+ ps_op->s_ivd_init_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_init_op_t.u4_error_code |=
+ IVD_INIT_DEC_MEM_REC_INCORRECT_TYPE;
+ DEBUG("i: %d \n", i);
+ return IV_FAIL;
+ }
+ }
+ }
+ }
+
+ }
+ break;
+
+ case IVD_CMD_GET_DISPLAY_FRAME:
+ {
+ ihevcd_cxa_get_display_frame_ip_t *ps_ip =
+ (ihevcd_cxa_get_display_frame_ip_t *)pv_api_ip;
+ ihevcd_cxa_get_display_frame_op_t *ps_op =
+ (ihevcd_cxa_get_display_frame_op_t *)pv_api_op;
+
+ ps_op->s_ivd_get_display_frame_op_t.u4_error_code = 0;
+
+ if((ps_ip->s_ivd_get_display_frame_ip_t.u4_size
+ != sizeof(ihevcd_cxa_get_display_frame_ip_t))
+ && (ps_ip->s_ivd_get_display_frame_ip_t.u4_size
+ != sizeof(ivd_get_display_frame_ip_t)))
+ {
+ ps_op->s_ivd_get_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_get_display_frame_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if((ps_op->s_ivd_get_display_frame_op_t.u4_size
+ != sizeof(ihevcd_cxa_get_display_frame_op_t))
+ && (ps_op->s_ivd_get_display_frame_op_t.u4_size
+ != sizeof(ivd_get_display_frame_op_t)))
+ {
+ ps_op->s_ivd_get_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_get_display_frame_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ }
+ break;
+
+ case IVD_CMD_REL_DISPLAY_FRAME:
+ {
+ ihevcd_cxa_rel_display_frame_ip_t *ps_ip =
+ (ihevcd_cxa_rel_display_frame_ip_t *)pv_api_ip;
+ ihevcd_cxa_rel_display_frame_op_t *ps_op =
+ (ihevcd_cxa_rel_display_frame_op_t *)pv_api_op;
+
+ ps_op->s_ivd_rel_display_frame_op_t.u4_error_code = 0;
+
+ if((ps_ip->s_ivd_rel_display_frame_ip_t.u4_size
+ != sizeof(ihevcd_cxa_rel_display_frame_ip_t))
+ && (ps_ip->s_ivd_rel_display_frame_ip_t.u4_size
+ != sizeof(ivd_rel_display_frame_ip_t)))
+ {
+ ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if((ps_op->s_ivd_rel_display_frame_op_t.u4_size
+ != sizeof(ihevcd_cxa_rel_display_frame_op_t))
+ && (ps_op->s_ivd_rel_display_frame_op_t.u4_size
+ != sizeof(ivd_rel_display_frame_op_t)))
+ {
+ ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_rel_display_frame_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ }
+ break;
+
+ case IVD_CMD_SET_DISPLAY_FRAME:
+ {
+ ihevcd_cxa_set_display_frame_ip_t *ps_ip =
+ (ihevcd_cxa_set_display_frame_ip_t *)pv_api_ip;
+ ihevcd_cxa_set_display_frame_op_t *ps_op =
+ (ihevcd_cxa_set_display_frame_op_t *)pv_api_op;
+ UWORD32 j;
+
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code = 0;
+
+ if((ps_ip->s_ivd_set_display_frame_ip_t.u4_size
+ != sizeof(ihevcd_cxa_set_display_frame_ip_t))
+ && (ps_ip->s_ivd_set_display_frame_ip_t.u4_size
+ != sizeof(ivd_set_display_frame_ip_t)))
+ {
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if((ps_op->s_ivd_set_display_frame_op_t.u4_size
+ != sizeof(ihevcd_cxa_set_display_frame_op_t))
+ && (ps_op->s_ivd_set_display_frame_op_t.u4_size
+ != sizeof(ivd_set_display_frame_op_t)))
+ {
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if(ps_ip->s_ivd_set_display_frame_ip_t.num_disp_bufs == 0)
+ {
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+ IVD_DISP_FRM_ZERO_OP_BUFS;
+ return IV_FAIL;
+ }
+
+ for(j = 0; j < ps_ip->s_ivd_set_display_frame_ip_t.num_disp_bufs;
+ j++)
+ {
+ if(ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].u4_num_bufs
+ == 0)
+ {
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+ IVD_DISP_FRM_ZERO_OP_BUFS;
+ return IV_FAIL;
+ }
+
+ for(i = 0;
+ i
+ < (WORD32)ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].u4_num_bufs;
+ i++)
+ {
+ if(ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].pu1_bufs[i]
+ == NULL)
+ {
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+ IVD_DISP_FRM_OP_BUF_NULL;
+ return IV_FAIL;
+ }
+
+ if(ps_ip->s_ivd_set_display_frame_ip_t.s_disp_buffer[j].u4_min_out_buf_size[i]
+ == 0)
+ {
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_set_display_frame_op_t.u4_error_code |=
+ IVD_DISP_FRM_ZERO_OP_BUF_SIZE;
+ return IV_FAIL;
+ }
+ }
+ }
+ }
+ break;
+
+ case IVD_CMD_VIDEO_DECODE:
+ {
+ ihevcd_cxa_video_decode_ip_t *ps_ip =
+ (ihevcd_cxa_video_decode_ip_t *)pv_api_ip;
+ ihevcd_cxa_video_decode_op_t *ps_op =
+ (ihevcd_cxa_video_decode_op_t *)pv_api_op;
+
+ DEBUG("The input bytes is: %d",
+ ps_ip->s_ivd_video_decode_ip_t.u4_num_Bytes);
+ ps_op->s_ivd_video_decode_op_t.u4_error_code = 0;
+
+ if(ps_ip->s_ivd_video_decode_ip_t.u4_size
+ != sizeof(ihevcd_cxa_video_decode_ip_t)
+ && ps_ip->s_ivd_video_decode_ip_t.u4_size
+ != offsetof(ivd_video_decode_ip_t,
+ s_out_buffer))
+ {
+ ps_op->s_ivd_video_decode_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_video_decode_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if(ps_op->s_ivd_video_decode_op_t.u4_size
+ != sizeof(ihevcd_cxa_video_decode_op_t)
+ && ps_op->s_ivd_video_decode_op_t.u4_size
+ != offsetof(ivd_video_decode_op_t,
+ u4_output_present))
+ {
+ ps_op->s_ivd_video_decode_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_video_decode_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ }
+ break;
+
+ case IV_CMD_RETRIEVE_MEMREC:
+ {
+ ihevcd_cxa_retrieve_mem_rec_ip_t *ps_ip =
+ (ihevcd_cxa_retrieve_mem_rec_ip_t *)pv_api_ip;
+ ihevcd_cxa_retrieve_mem_rec_op_t *ps_op =
+ (ihevcd_cxa_retrieve_mem_rec_op_t *)pv_api_op;
+ iv_mem_rec_t *ps_mem_rec;
+
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code = 0;
+
+ if(ps_ip->s_ivd_retrieve_mem_rec_ip_t.u4_size
+ != sizeof(ihevcd_cxa_retrieve_mem_rec_ip_t))
+ {
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ if(ps_op->s_ivd_retrieve_mem_rec_op_t.u4_size
+ != sizeof(ihevcd_cxa_retrieve_mem_rec_op_t))
+ {
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return (IV_FAIL);
+ }
+
+ ps_mem_rec = ps_ip->s_ivd_retrieve_mem_rec_ip_t.pv_mem_rec_location;
+ /* check memrecords sizes are correct */
+ for(i = 0; i < MEM_REC_CNT; i++)
+ {
+ if(ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t))
+ {
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_retrieve_mem_rec_op_t.u4_error_code |=
+ IVD_MEM_REC_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ }
+ break;
+
+ case IVD_CMD_VIDEO_CTL:
+ {
+ UWORD32 *pu4_ptr_cmd;
+ UWORD32 sub_command;
+
+ pu4_ptr_cmd = (UWORD32 *)pv_api_ip;
+ pu4_ptr_cmd += 2;
+ sub_command = *pu4_ptr_cmd;
+
+ switch(sub_command)
+ {
+ case IVD_CMD_CTL_SETPARAMS:
+ {
+ ihevcd_cxa_ctl_set_config_ip_t *ps_ip;
+ ihevcd_cxa_ctl_set_config_op_t *ps_op;
+ ps_ip = (ihevcd_cxa_ctl_set_config_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_set_config_op_t *)pv_api_op;
+
+ if(ps_ip->s_ivd_ctl_set_config_ip_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_set_config_ip_t))
+ {
+ ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+                // no break here: intentional fall-through so SETPARAMS also runs the SETDEFAULT output-structure size check
+ case IVD_CMD_CTL_SETDEFAULT:
+ {
+ ihevcd_cxa_ctl_set_config_op_t *ps_op;
+ ps_op = (ihevcd_cxa_ctl_set_config_op_t *)pv_api_op;
+ if(ps_op->s_ivd_ctl_set_config_op_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_set_config_op_t))
+ {
+ ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_set_config_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ break;
+
+ case IVD_CMD_CTL_GETPARAMS:
+ {
+ ihevcd_cxa_ctl_getstatus_ip_t *ps_ip;
+ ihevcd_cxa_ctl_getstatus_op_t *ps_op;
+
+ ps_ip = (ihevcd_cxa_ctl_getstatus_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_getstatus_op_t *)pv_api_op;
+ if(ps_ip->s_ivd_ctl_getstatus_ip_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_getstatus_ip_t))
+ {
+ ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ if(ps_op->s_ivd_ctl_getstatus_op_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_getstatus_op_t))
+ {
+ ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_getstatus_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ break;
+
+ case IVD_CMD_CTL_GETBUFINFO:
+ {
+ ihevcd_cxa_ctl_getbufinfo_ip_t *ps_ip;
+ ihevcd_cxa_ctl_getbufinfo_op_t *ps_op;
+ ps_ip = (ihevcd_cxa_ctl_getbufinfo_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_getbufinfo_op_t *)pv_api_op;
+
+ if(ps_ip->s_ivd_ctl_getbufinfo_ip_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_getbufinfo_ip_t))
+ {
+ ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ if(ps_op->s_ivd_ctl_getbufinfo_op_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_getbufinfo_op_t))
+ {
+ ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_getbufinfo_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ break;
+
+ case IVD_CMD_CTL_GETVERSION:
+ {
+ ihevcd_cxa_ctl_getversioninfo_ip_t *ps_ip;
+ ihevcd_cxa_ctl_getversioninfo_op_t *ps_op;
+ ps_ip = (ihevcd_cxa_ctl_getversioninfo_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_getversioninfo_op_t *)pv_api_op;
+ if(ps_ip->s_ivd_ctl_getversioninfo_ip_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_getversioninfo_ip_t))
+ {
+ ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ if(ps_op->s_ivd_ctl_getversioninfo_op_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_getversioninfo_op_t))
+ {
+ ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_getversioninfo_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ break;
+
+ case IVD_CMD_CTL_FLUSH:
+ {
+ ihevcd_cxa_ctl_flush_ip_t *ps_ip;
+ ihevcd_cxa_ctl_flush_op_t *ps_op;
+ ps_ip = (ihevcd_cxa_ctl_flush_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_flush_op_t *)pv_api_op;
+ if(ps_ip->s_ivd_ctl_flush_ip_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_flush_ip_t))
+ {
+ ps_op->s_ivd_ctl_flush_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_flush_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ if(ps_op->s_ivd_ctl_flush_op_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_flush_op_t))
+ {
+ ps_op->s_ivd_ctl_flush_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_flush_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ break;
+
+ case IVD_CMD_CTL_RESET:
+ {
+ ihevcd_cxa_ctl_reset_ip_t *ps_ip;
+ ihevcd_cxa_ctl_reset_op_t *ps_op;
+ ps_ip = (ihevcd_cxa_ctl_reset_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_reset_op_t *)pv_api_op;
+ if(ps_ip->s_ivd_ctl_reset_ip_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_reset_ip_t))
+ {
+ ps_op->s_ivd_ctl_reset_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_reset_op_t.u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ if(ps_op->s_ivd_ctl_reset_op_t.u4_size
+ != sizeof(ihevcd_cxa_ctl_reset_op_t))
+ {
+ ps_op->s_ivd_ctl_reset_op_t.u4_error_code |= 1
+ << IVD_UNSUPPORTEDPARAM;
+ ps_op->s_ivd_ctl_reset_op_t.u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ }
+ break;
+ case IHEVCD_CXA_CMD_CTL_DEGRADE:
+ {
+ ihevcd_cxa_ctl_degrade_ip_t *ps_ip;
+ ihevcd_cxa_ctl_degrade_op_t *ps_op;
+
+ ps_ip = (ihevcd_cxa_ctl_degrade_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_degrade_op_t *)pv_api_op;
+
+ if(ps_ip->u4_size
+ != sizeof(ihevcd_cxa_ctl_degrade_ip_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if(ps_op->u4_size
+ != sizeof(ihevcd_cxa_ctl_degrade_op_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if((ps_ip->i4_degrade_pics < 0) ||
+ (ps_ip->i4_degrade_pics > 4) ||
+ (ps_ip->i4_nondegrade_interval < 0) ||
+ (ps_ip->i4_degrade_type < 0) ||
+ (ps_ip->i4_degrade_type > 15))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ return IV_FAIL;
+ }
+
+ break;
+ }
+
+ case IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS:
+ {
+ ihevcd_cxa_ctl_get_frame_dimensions_ip_t *ps_ip;
+ ihevcd_cxa_ctl_get_frame_dimensions_op_t *ps_op;
+
+ ps_ip =
+ (ihevcd_cxa_ctl_get_frame_dimensions_ip_t *)pv_api_ip;
+ ps_op =
+ (ihevcd_cxa_ctl_get_frame_dimensions_op_t *)pv_api_op;
+
+ if(ps_ip->u4_size
+ != sizeof(ihevcd_cxa_ctl_get_frame_dimensions_ip_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if(ps_op->u4_size
+ != sizeof(ihevcd_cxa_ctl_get_frame_dimensions_op_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ break;
+ }
+
+ case IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS:
+ {
+ ihevcd_cxa_ctl_get_vui_params_ip_t *ps_ip;
+ ihevcd_cxa_ctl_get_vui_params_op_t *ps_op;
+
+ ps_ip =
+ (ihevcd_cxa_ctl_get_vui_params_ip_t *)pv_api_ip;
+ ps_op =
+ (ihevcd_cxa_ctl_get_vui_params_op_t *)pv_api_op;
+
+ if(ps_ip->u4_size
+ != sizeof(ihevcd_cxa_ctl_get_vui_params_ip_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if(ps_op->u4_size
+ != sizeof(ihevcd_cxa_ctl_get_vui_params_op_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ break;
+ }
+ case IHEVCD_CXA_CMD_CTL_SET_NUM_CORES:
+ {
+ ihevcd_cxa_ctl_set_num_cores_ip_t *ps_ip;
+ ihevcd_cxa_ctl_set_num_cores_op_t *ps_op;
+
+ ps_ip = (ihevcd_cxa_ctl_set_num_cores_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_set_num_cores_op_t *)pv_api_op;
+
+ if(ps_ip->u4_size
+ != sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if(ps_op->u4_size
+ != sizeof(ihevcd_cxa_ctl_set_num_cores_op_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+#ifdef MULTICORE
+ if((ps_ip->u4_num_cores < 1) || (ps_ip->u4_num_cores > MAX_NUM_CORES))
+#else
+ if(ps_ip->u4_num_cores != 1)
+#endif
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ return IV_FAIL;
+ }
+ break;
+ }
+ case IHEVCD_CXA_CMD_CTL_SET_PROCESSOR:
+ {
+ ihevcd_cxa_ctl_set_processor_ip_t *ps_ip;
+ ihevcd_cxa_ctl_set_processor_op_t *ps_op;
+
+ ps_ip = (ihevcd_cxa_ctl_set_processor_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_set_processor_op_t *)pv_api_op;
+
+ if(ps_ip->u4_size
+ != sizeof(ihevcd_cxa_ctl_set_processor_ip_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if(ps_op->u4_size
+ != sizeof(ihevcd_cxa_ctl_set_processor_op_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ break;
+ }
+#ifdef GPU_BUILD
+ case IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE:
+ {
+ ihevcd_cxa_ctl_gpu_enable_diable_ip_t *ps_ip;
+ ihevcd_cxa_ctl_gpu_enable_diable_op_t *ps_op;
+
+ ps_ip = (ihevcd_cxa_ctl_gpu_enable_diable_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_gpu_enable_diable_op_t *)pv_api_op;
+
+ if(ps_ip->u4_size
+ != sizeof(ihevcd_cxa_ctl_gpu_enable_diable_ip_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_IP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+
+ if(ps_op->u4_size
+ != sizeof(ihevcd_cxa_ctl_gpu_enable_diable_op_t))
+ {
+ ps_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_op->u4_error_code |=
+ IVD_OP_API_STRUCT_SIZE_INCORRECT;
+ return IV_FAIL;
+ }
+ break;
+ }
+#endif
+ default:
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_UNSUPPORTED_API_CMD;
+ return IV_FAIL;
+ }
+ }
+ break;
+ default:
+ *(pu4_api_op + 1) |= 1 << IVD_UNSUPPORTEDPARAM;
+ *(pu4_api_op + 1) |= IVD_UNSUPPORTED_API_CMD;
+ return IV_FAIL;
+ }
+
+ return IV_SUCCESS;
+}
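+
+/* Every failure path above composes u4_error_code from two parts: a category
+ * bit (1 << IVD_UNSUPPORTEDPARAM) and a specific cause such as
+ * IVD_IP_API_STRUCT_SIZE_INCORRECT in the low bits. A caller can therefore
+ * test the category cheaply and still recover the exact cause. A minimal
+ * caller-side sketch follows (excluded from the build; the low-bits mask is
+ * an assumption, not something this API defines):
+ */
+#if 0
+static void example_check_error(UWORD32 u4_error_code)
+{
+    if(u4_error_code & (1 << IVD_UNSUPPORTEDPARAM))
+    {
+        /* Hypothetical mask: assumes the specific cause fits in the low byte */
+        UWORD32 u4_cause = u4_error_code & 0xFF;
+        (void)u4_cause;
+    }
+}
+#endif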
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets default dynamic parameters
+*
+* @par Description:
+* Sets default dynamic parameters. Called from ihevcd_init() to ensure that
+* the codec continues to work even if set_params is never called
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_default_params(codec_t *ps_codec)
+{
+
+ WORD32 ret = IV_SUCCESS;
+
+ ps_codec->e_pic_skip_mode = IVD_SKIP_NONE;
+ ps_codec->i4_strd = 0;
+ ps_codec->i4_disp_strd = 0;
+ ps_codec->i4_header_mode = 0;
+ ps_codec->e_pic_out_order = IVD_DISPLAY_FRAME_OUT;
+ return ret;
+}
+
+void ihevcd_update_function_ptr(codec_t *ps_codec)
+{
+
+ /* Init inter pred function array */
+ ps_codec->apf_inter_pred[0] = NULL;
+ ps_codec->apf_inter_pred[1] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr;
+ ps_codec->apf_inter_pred[2] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr;
+ ps_codec->apf_inter_pred[3] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr;
+ ps_codec->apf_inter_pred[4] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr;
+ ps_codec->apf_inter_pred[5] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr;
+ ps_codec->apf_inter_pred[6] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr;
+ ps_codec->apf_inter_pred[7] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr;
+ ps_codec->apf_inter_pred[8] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr;
+ ps_codec->apf_inter_pred[9] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr;
+ ps_codec->apf_inter_pred[10] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr;
+ ps_codec->apf_inter_pred[11] = NULL;
+ ps_codec->apf_inter_pred[12] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr;
+ ps_codec->apf_inter_pred[13] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr;
+ ps_codec->apf_inter_pred[14] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr;
+ ps_codec->apf_inter_pred[15] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr;
+ ps_codec->apf_inter_pred[16] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr;
+ ps_codec->apf_inter_pred[17] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr;
+ ps_codec->apf_inter_pred[18] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr;
+ ps_codec->apf_inter_pred[19] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr;
+ ps_codec->apf_inter_pred[20] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr;
+ ps_codec->apf_inter_pred[21] = (pf_inter_pred)ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr;
+
+ /* Init intra pred function array */
+ ps_codec->apf_intra_pred_luma[0] = (pf_intra_pred)NULL;
+ ps_codec->apf_intra_pred_luma[1] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr;
+ ps_codec->apf_intra_pred_luma[2] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr;
+ ps_codec->apf_intra_pred_luma[3] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr;
+ ps_codec->apf_intra_pred_luma[4] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr;
+ ps_codec->apf_intra_pred_luma[5] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr;
+ ps_codec->apf_intra_pred_luma[6] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr;
+ ps_codec->apf_intra_pred_luma[7] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr;
+ ps_codec->apf_intra_pred_luma[8] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr;
+ ps_codec->apf_intra_pred_luma[9] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr;
+ ps_codec->apf_intra_pred_luma[10] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr;
+
+ ps_codec->apf_intra_pred_chroma[0] = (pf_intra_pred)NULL;
+ ps_codec->apf_intra_pred_chroma[1] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr;
+ ps_codec->apf_intra_pred_chroma[2] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr;
+ ps_codec->apf_intra_pred_chroma[3] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr;
+ ps_codec->apf_intra_pred_chroma[4] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr;
+ ps_codec->apf_intra_pred_chroma[5] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr;
+ ps_codec->apf_intra_pred_chroma[6] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr;
+ ps_codec->apf_intra_pred_chroma[7] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr;
+ ps_codec->apf_intra_pred_chroma[8] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr;
+ ps_codec->apf_intra_pred_chroma[9] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr;
+ ps_codec->apf_intra_pred_chroma[10] = (pf_intra_pred)ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr;
+
+ /* Init itrans_recon function array */
+ ps_codec->apf_itrans_recon[0] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr;
+ ps_codec->apf_itrans_recon[1] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr;
+ ps_codec->apf_itrans_recon[2] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr;
+ ps_codec->apf_itrans_recon[3] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr;
+ ps_codec->apf_itrans_recon[4] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr;
+ ps_codec->apf_itrans_recon[5] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr;
+ ps_codec->apf_itrans_recon[6] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr;
+ ps_codec->apf_itrans_recon[7] = (pf_itrans_recon)ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr;
+
+ /* Init recon function array */
+ ps_codec->apf_recon[0] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr;
+ ps_codec->apf_recon[1] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_4x4_fptr;
+ ps_codec->apf_recon[2] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_8x8_fptr;
+ ps_codec->apf_recon[3] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_16x16_fptr;
+ ps_codec->apf_recon[4] = (pf_recon)ps_codec->s_func_selector.ihevc_recon_32x32_fptr;
+ ps_codec->apf_recon[5] = (pf_recon)ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr;
+ ps_codec->apf_recon[6] = (pf_recon)ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr;
+ ps_codec->apf_recon[7] = (pf_recon)ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr;
+
+ /* Init itrans_recon_dc function array */
+ ps_codec->apf_itrans_recon_dc[0] = (pf_itrans_recon_dc)ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr;
+ ps_codec->apf_itrans_recon_dc[1] = (pf_itrans_recon_dc)ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr;
+
+ /* Init sao function array */
+ ps_codec->apf_sao_luma[0] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr;
+ ps_codec->apf_sao_luma[1] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr;
+ ps_codec->apf_sao_luma[2] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr;
+ ps_codec->apf_sao_luma[3] = (pf_sao_luma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr;
+
+ ps_codec->apf_sao_chroma[0] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr;
+ ps_codec->apf_sao_chroma[1] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr;
+ ps_codec->apf_sao_chroma[2] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr;
+ ps_codec->apf_sao_chroma[3] = (pf_sao_chroma)ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr;
+}
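+
+/* A minimal sketch (excluded from the build) of how a caller might dispatch
+ * through the itrans_recon table initialized above. As laid out in the table,
+ * index 0 is the 4x4 ttype1 transform and indices 1..4 are the 4x4 to 32x32
+ * luma transforms; the log2-based index mapping shown here is an assumption
+ * for illustration, not the decoder's actual derivation.
+ */
+#if 0
+static void example_itrans_recon_dispatch(codec_t *ps_codec,
+                                          WORD32 log2_trans_size,
+                                          WORD32 is_4x4_ttype1)
+{
+    /* 4x4 -> 1, 8x8 -> 2, 16x16 -> 3, 32x32 -> 4 (hypothetical mapping) */
+    WORD32 idx = is_4x4_ttype1 ? 0 : (log2_trans_size - 1);
+    if(NULL != ps_codec->apf_itrans_recon[idx])
+    {
+        /* ps_codec->apf_itrans_recon[idx](...); with module-specific arguments */
+    }
+}
+#endif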
+/**
+*******************************************************************************
+*
+* @brief
+* Initialize the context. This will be called by init_mem_rec and during
+* reset
+*
+* @par Description:
+* Initializes the context
+*
+* @param[in] ps_codec
+* Codec context pointer
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_init(codec_t *ps_codec)
+{
+ WORD32 status = IV_SUCCESS;
+ WORD32 i;
+
+
+ ps_codec->i4_num_disp_bufs = 1;
+ ps_codec->i4_flush_mode = 0;
+
+ ps_codec->i4_ht = ps_codec->i4_disp_ht = ps_codec->i4_max_ht;
+ ps_codec->i4_wd = ps_codec->i4_disp_wd = ps_codec->i4_max_wd;
+ ps_codec->i4_strd = 0;
+ ps_codec->i4_disp_strd = 0;
+ ps_codec->i4_num_cores = 1;
+
+ ps_codec->u4_pic_cnt = 0;
+ ps_codec->u4_disp_cnt = 0;
+
+ ps_codec->i4_header_mode = 0;
+ ps_codec->i4_header_in_slice_mode = 0;
+ ps_codec->i4_sps_done = 0;
+ ps_codec->i4_pps_done = 0;
+ ps_codec->i4_init_done = 1;
+ ps_codec->i4_first_pic_done = 0;
+ ps_codec->s_parse.i4_first_pic_init = 0;
+ ps_codec->i4_error_code = 0;
+ ps_codec->i4_reset_flag = 0;
+
+ ps_codec->i4_prev_poc_msb = 0;
+ ps_codec->i4_prev_poc_lsb = -1;
+ ps_codec->i4_max_prev_poc_lsb = -1;
+ ps_codec->s_parse.i4_abs_pic_order_cnt = -1;
+
+ /* Set ref chroma format by default to 420SP UV interleaved */
+ ps_codec->e_ref_chroma_fmt = IV_YUV_420SP_UV;
+
+#ifdef GPU_BUILD
+#ifndef FRAME_STAGGER_ONLY
+    /* Flag to switch motion compensation between GPU and CPU. The
+     * GPU-disabled path is not tested. TODO: move this flag to the
+     * dynamic parameters. GPU is disabled by default; the application
+     * has to enable it through the GPU enable/disable control call.
+     */
+ ps_codec->u4_gpu_enabled = 0;
+#else
+ ps_codec->u4_gpu_enabled = 0;
+#endif
+
+ ps_codec->u4_parsing_view = 0;
+
+#endif
+    /* If the codec is in shared mode and the required format is 420SP VU
+     * interleaved, then change the reference buffers' chroma format
+     */
+ if(IV_YUV_420SP_VU == ps_codec->e_chroma_fmt)
+ {
+ ps_codec->e_ref_chroma_fmt = IV_YUV_420SP_VU;
+ }
+
+
+
+ ps_codec->i4_disable_deblk_pic = 0;
+
+ ps_codec->i4_degrade_pic_cnt = 0;
+ ps_codec->i4_degrade_pics = 0;
+ ps_codec->i4_degrade_type = 0;
+ ps_codec->i4_disable_sao_pic = 0;
+ ps_codec->i4_fullpel_inter_pred = 0;
+ ps_codec->u4_enable_fmt_conv_ahead = 0;
+
+ {
+ sps_t *ps_sps = ps_codec->ps_sps_base;
+ pps_t *ps_pps = ps_codec->ps_pps_base;
+
+ for(i = 0; i < MAX_SPS_CNT; i++)
+ {
+ ps_sps->i1_sps_valid = 0;
+ ps_sps++;
+ }
+
+ for(i = 0; i < MAX_PPS_CNT; i++)
+ {
+ ps_pps->i1_pps_valid = 0;
+ ps_pps++;
+ }
+ }
+
+ ihevcd_set_default_params(ps_codec);
+ ps_codec->pv_proc_jobq = ihevcd_jobq_init(ps_codec->pv_proc_jobq_buf, ps_codec->i4_proc_jobq_buf_size);
+ RETURN_IF((ps_codec->pv_proc_jobq == NULL), IV_FAIL);
+
+ /* Update the jobq context to all the threads */
+ ps_codec->s_parse.pv_proc_jobq = ps_codec->pv_proc_jobq;
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].pv_proc_jobq = ps_codec->pv_proc_jobq;
+ ps_codec->as_process[i].i4_id = i;
+ ps_codec->as_process[i].ps_codec = ps_codec;
+
+ /* Set the following to zero assuming it is a single core solution
+ * When threads are launched these will be set appropriately
+ */
+ ps_codec->as_process[i].i4_check_parse_status = 0;
+ ps_codec->as_process[i].i4_check_proc_status = 0;
+ }
+ /* Initialize MV Bank buffer manager */
+ ihevc_buf_mgr_init((buf_mgr_t *)ps_codec->pv_mv_buf_mgr);
+
+ /* Initialize Picture buffer manager */
+ ihevc_buf_mgr_init((buf_mgr_t *)ps_codec->pv_pic_buf_mgr);
+
+ ps_codec->ps_pic_buf = (pic_buf_t *)ps_codec->pv_pic_buf_base;
+
+ memset(ps_codec->ps_pic_buf, 0, BUF_MGR_MAX_CNT * sizeof(pic_buf_t));
+
+
+
+ /* Initialize display buffer manager */
+ ihevc_disp_mgr_init((disp_mgr_t *)ps_codec->pv_disp_buf_mgr);
+
+ /* Initialize dpb manager */
+ ihevc_dpb_mgr_init((dpb_mgr_t *)ps_codec->pv_dpb_mgr);
+
+ ps_codec->e_processor_soc = SOC_GENERIC;
+    /* The following can be overridden using the soc parameter as a hack */
+ ps_codec->u4_nctb = 0x7FFFFFFF;
+ ihevcd_init_arch(ps_codec);
+
+ ihevcd_init_function_ptr(ps_codec);
+
+ ihevcd_update_function_ptr(ps_codec);
+
+ return status;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets number of memory records required by the codec
+*
+* @par Description:
+* Gets the codec's mem record requirements and adds the concealment
+* module's requirements
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_num_rec(void *pv_api_ip, void *pv_api_op)
+{
+
+ iv_num_mem_rec_op_t *ps_mem_q_op;
+
+ UNUSED(pv_api_ip);
+ ps_mem_q_op = (iv_num_mem_rec_op_t *)pv_api_op;
+ ps_mem_q_op->u4_num_mem_rec = MEM_REC_CNT;
+ DEBUG("Get num mem records without concealment %d\n",
+ ps_mem_q_op->u4_num_mem_rec);
+#ifdef APPLY_CONCEALMENT
+ {
+ IV_API_CALL_STATUS_T status;
+ icncl_num_mem_rec_ip_t cncl_mem_ip;
+ icncl_num_mem_rec_op_t cncl_mem_op;
+
+ cncl_mem_ip.s_ivd_num_rec_ip_t.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+ cncl_mem_ip.s_ivd_num_rec_ip_t.u4_size = sizeof(icncl_num_mem_rec_ip_t);
+
+ status = icncl_api_function(NULL, (void *)&cncl_mem_ip, (void *)&cncl_mem_op);
+
+ if(status == IV_SUCCESS)
+ {
+ /* Add the concealment library's memory requirements */
+ ps_mem_q_op->u4_num_mem_rec += cncl_mem_op.s_ivd_num_mem_rec_op_t.u4_num_mem_rec;
+ DEBUG("Get num mem records %d\n", ps_mem_q_op->u4_num_mem_rec);
+ return status; /* Nothing else to do, return */
+ }
+ else
+ {
+ /*
+ * Something went wrong with the concealment library call.
+ */
+ DEBUG("ERROR: Get num mem records %d\n", ps_mem_q_op->u4_num_mem_rec);
+ return status;
+ }
+
+ }
+#endif //APPLY_CONCEALMENT
+
+
+ return IV_SUCCESS;
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Fills memory requirements of the codec
+*
+* @par Description:
+* Gets the codec's mem record requirements and adds the concealment
+* module's requirements
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op)
+{
+
+ ihevcd_cxa_fill_mem_rec_ip_t *ps_mem_q_ip;
+ ihevcd_cxa_fill_mem_rec_op_t *ps_mem_q_op;
+ WORD32 level;
+ WORD32 num_reorder_frames;
+ WORD32 num_ref_frames;
+ WORD32 num_extra_disp_bufs;
+ WORD32 max_dpb_size;
+
+ iv_mem_rec_t *ps_mem_rec;
+ iv_mem_rec_t *ps_mem_rec_base;
+ WORD32 no_of_mem_rec_filled;
+ WORD32 chroma_format, share_disp_buf;
+ WORD32 max_ctb_cnt;
+ WORD32 max_wd_luma, max_wd_chroma;
+ WORD32 max_ht_luma, max_ht_chroma;
+ WORD32 max_tile_cols, max_tile_rows;
+ WORD32 max_ctb_rows, max_ctb_cols;
+ WORD32 max_num_cu_cols;
+ WORD32 i;
+ WORD32 max_num_4x4_cols;
+ IV_API_CALL_STATUS_T status = IV_SUCCESS;
+ no_of_mem_rec_filled = 0;
+
+ //TODO: Remove as and when the following are used
+ UNUSED(num_extra_disp_bufs);
+ UNUSED(no_of_mem_rec_filled);
+ UNUSED(max_wd_chroma);
+ UNUSED(max_ht_chroma);
+
+ ps_mem_q_ip = (ihevcd_cxa_fill_mem_rec_ip_t *)pv_api_ip;
+ ps_mem_q_op = (ihevcd_cxa_fill_mem_rec_op_t *)pv_api_op;
+
+ if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, i4_level))
+ {
+ level = ps_mem_q_ip->i4_level;
+        /* The spec expresses level as the level number multiplied by 30,
+         * while the API passes it multiplied by 10 (kept consistent with the
+         * H264 API). To bridge the two, level is multiplied by 3 here.
+         */
+ level *= 3;
+ }
+ else
+ {
+ level = MAX_LEVEL;
+ }
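+
+    /* The u4_size/offsetof checks in this function implement versioned input
+     * structs: an older caller passes a smaller struct, so each newer field
+     * is read only when u4_size proves it is present, and a default is used
+     * otherwise. For example (offsets hypothetical), if i4_level sat at
+     * offset 24, a caller passing u4_size = 24 would get MAX_LEVEL, while a
+     * caller passing u4_size >= 28 would have its own i4_level honored.
+     */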
+
+ if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > offsetof(ihevcd_cxa_fill_mem_rec_ip_t,
+ u4_num_reorder_frames))
+ {
+ num_reorder_frames = ps_mem_q_ip->u4_num_reorder_frames;
+ }
+ else
+ {
+ num_reorder_frames = MAX_REF_CNT;
+ }
+
+ if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, u4_num_ref_frames))
+ {
+ num_ref_frames = ps_mem_q_ip->u4_num_ref_frames;
+ }
+ else
+ {
+ num_ref_frames = MAX_REF_CNT;
+ }
+
+ if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > offsetof(ihevcd_cxa_fill_mem_rec_ip_t,
+ u4_num_extra_disp_buf))
+ {
+ num_extra_disp_bufs = ps_mem_q_ip->u4_num_extra_disp_buf;
+ }
+ else
+ {
+ num_extra_disp_bufs = 0;
+ }
+
+ if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, u4_share_disp_buf))
+ {
+#ifndef LOGO_EN
+ share_disp_buf = ps_mem_q_ip->u4_share_disp_buf;
+#else
+ share_disp_buf = 0;
+#endif
+ }
+ else
+ {
+ share_disp_buf = 0;
+ }
+
+ if(ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size
+ > offsetof(ihevcd_cxa_fill_mem_rec_ip_t, e_output_format))
+ {
+ chroma_format = ps_mem_q_ip->e_output_format;
+ }
+ else
+ {
+ chroma_format = -1;
+ }
+
+    /* Shared disp buffer mode is supported only for 420P and 420SP formats */
+ if((chroma_format != IV_YUV_420P) &&
+ (chroma_format != IV_YUV_420SP_UV) &&
+ (chroma_format != IV_YUV_420SP_VU))
+ {
+ share_disp_buf = 0;
+ }
+
+ {
+
+ max_ht_luma = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht;
+ max_wd_luma = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd;
+
+ max_ht_luma = ALIGN64(max_ht_luma);
+ max_wd_luma = ALIGN64(max_wd_luma);
+
+
+
+ max_tile_cols = (max_wd_luma + MIN_TILE_WD - 1) / MIN_TILE_WD;
+ max_tile_rows = (max_ht_luma + MIN_TILE_HT - 1) / MIN_TILE_HT;
+ max_ctb_rows = max_ht_luma / MIN_CTB_SIZE;
+ max_ctb_cols = max_wd_luma / MIN_CTB_SIZE;
+ max_ctb_cnt = max_ctb_rows * max_ctb_cols;
+ max_num_cu_cols = max_wd_luma / MIN_CU_SIZE;
+ max_num_4x4_cols = max_wd_luma / 4;
+ }
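+
+    /* Worked example (macro values assumed: MIN_CTB_SIZE = 16, MIN_CU_SIZE = 8,
+     * MIN_TILE_WD = 256, MIN_TILE_HT = 64): a 1920x1080 stream is aligned to
+     * 1920x1088, giving max_ctb_cols = 120, max_ctb_rows = 68,
+     * max_ctb_cnt = 8160, max_num_cu_cols = 240, max_num_4x4_cols = 480,
+     * max_tile_cols = 8 and max_tile_rows = 17.
+     */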
+    /*
+     * If level is less than 31 and the required resolution is higher,
+     * then raise the level to at least 31. (The check below is commented
+     * out; it is carried over from the H264 decoder and counts MBs.)
+     */
+ /* if (num_mbs > MAX_NUM_MBS_3_0 && level < MAX_LEVEL)
+ {
+ level = MAX_LEVEL;
+ }
+ */
+ if((level < MIN_LEVEL) || (level > MAX_LEVEL))
+ {
+ ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IHEVCD_LEVEL_UNSUPPORTED;
+ level = MAX_LEVEL;
+ }
+ if(num_ref_frames > MAX_REF_CNT)
+ {
+ ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IHEVCD_NUM_REF_UNSUPPORTED;
+ num_ref_frames = MAX_REF_CNT;
+ }
+
+ if(num_reorder_frames > MAX_REF_CNT)
+ {
+ ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_error_code |=
+ IHEVCD_NUM_REORDER_UNSUPPORTED;
+ num_reorder_frames = MAX_REF_CNT;
+ }
+
+ max_dpb_size = ihevcd_get_dpb_size(level, max_wd_luma * max_ht_luma);
+ ps_mem_rec_base = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location;
+
+    /* Set all memory records as persistent with 128-byte alignment
+     * by default
+     */
+ ps_mem_rec = ps_mem_rec_base;
+ for(i = 0; i < MEM_REC_CNT; i++)
+ {
+ ps_mem_rec->u4_mem_alignment = 128;
+ ps_mem_rec->e_mem_type = IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM;
+ ps_mem_rec++;
+ }
+
+ /* Request memory for HEVCD object */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_IV_OBJ];
+ ps_mem_rec->u4_mem_size = sizeof(iv_obj_t);
+
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_IV_OBJ,
+ ps_mem_rec->u4_mem_size);
+
+ /* Request memory for HEVC Codec context */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_CODEC];
+ ps_mem_rec->u4_mem_size = sizeof(codec_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CODEC,
+ ps_mem_rec->u4_mem_size);
+
+ /* Request memory for buffer which holds bitstream after emulation prevention */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_BITSBUF];
+ ps_mem_rec->u4_mem_size = MAX((max_wd_luma * max_ht_luma), MIN_BITSBUF_SIZE);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BITSBUF,
+ ps_mem_rec->u4_mem_size);
+
+ /* Request memory for buffer which holds TU structures and coeff data for
+ * a set of CTBs in the current picture */
+    /* TODO: Currently the buffer is allocated at frame level. Reduce this to
+     * allocate for a set of CTBs and add appropriate synchronization logic to
+     * ensure that this data is not overwritten before consumption
+     */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TU_DATA];
+ ps_mem_rec->u4_mem_size = ihevcd_get_tu_data_size(max_wd_luma * max_ht_luma);
+#ifdef GPU_BUILD
+ /* For ping-pong view */
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TU_DATA,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBANK];
+
+ ps_mem_rec->u4_mem_size = sizeof(buf_mgr_t);
+
+ /* Size for holding mv_buf_t for each MV Bank */
+    /* Note this allocation is done for BUF_MGR_MAX_CNT entries instead of
+     * max_dpb_size or MAX_DPB_SIZE for the following reasons:
+     * max_dpb_size is derived from max_wd and max_ht, so for higher max_wd
+     * and max_ht it will be smaller than MAX_DPB_SIZE, yet during actual
+     * initialization the number of buffers allocated can be larger.
+     *
+     * One extra MV bank is also needed to hold the current picture's MV bank.
+     * Since this is only a structure allocation and not an actual buffer
+     * allocation, it is done for BUF_MGR_MAX_CNT entries.
+     */
+ ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+#ifdef GPU_BUILD
+ /* Request one extra since release is delayed by one frame.*/
+ ps_mem_rec->u4_mem_size += sizeof(mv_buf_t);
+#endif
+
+ {
+ /* Allocate for pu_map, pu_t and pic_pu_idx for each MV bank */
+        /* Note: The number of luma samples used here is not max_wd * max_ht;
+         * instead it is the maximum number of luma samples allowed at the
+         * given level. This ensures that any stream with width and height
+         * smaller than max_wd and max_ht is supported: at a given level the
+         * number of buffers required can be greater for lower widths and
+         * heights, and this increased buffer count might require more memory
+         * than a max_wd x max_ht buffer would have.
+         * Also note one extra buffer is allocated to store the current
+         * picture's MV bank. In case of asynchronous parsing and processing,
+         * the number of buffers should increase here based on how the parsing
+         * and processing threads are synchronized.
+         */
+ WORD32 lvl_idx = ihevcd_get_lvl_idx(level);
+ WORD32 max_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
+#ifdef GPU_BUILD
+ ps_mem_rec->u4_mem_size += (max_dpb_size + 2) *
+ ihevcd_get_pic_mv_bank_size(max_luma_samples);
+#else
+ ps_mem_rec->u4_mem_size += (max_dpb_size + 1) *
+ ihevcd_get_pic_mv_bank_size(max_luma_samples);
+#endif
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MVBANK,
+ ps_mem_rec->u4_mem_size);
+ }
+    // TODO GPU : Have to create ping-pong views for VPS, SPS and PPS.
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_VPS];
+ ps_mem_rec->u4_mem_size = MAX_VPS_CNT * sizeof(vps_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_VPS,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SPS];
+ ps_mem_rec->u4_mem_size = MAX_SPS_CNT * sizeof(sps_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SPS,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PPS];
+ ps_mem_rec->u4_mem_size = MAX_PPS_CNT * sizeof(pps_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PPS,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_HDR];
+ ps_mem_rec->u4_mem_size = MAX_SLICE_HDR_CNT * sizeof(slice_header_t);
+#ifdef GPU_BUILD
+ /* OpenCL ping pong buffer */
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size *= 2;
+#endif
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SLICE_HDR,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE];
+ {
+ WORD32 tile_size;
+
+ tile_size = max_tile_cols * max_tile_rows;
+ tile_size *= sizeof(tile_t);
+
+
+ ps_mem_rec->u4_mem_size = MAX_PPS_CNT * tile_size;
+ }
+
+
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TILE,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTRY_OFST];
+ {
+ WORD32 num_entry_points;
+
+ /* One entry point per tile */
+ num_entry_points = max_tile_cols * max_tile_rows;
+
+ /* One entry point per row of CTBs */
+ /*********************************************************************/
+ /* Only tiles or entropy sync is enabled at a time in main */
+ /* profile, but since memory required does not increase too much, */
+ /* this allocation is done to handle both cases */
+ /*********************************************************************/
+ num_entry_points += max_ctb_rows;
+
+
+ ps_mem_rec->u4_mem_size = sizeof(WORD32) * num_entry_points;
+ }
+
+
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_ENTRY_OFST,
+ ps_mem_rec->u4_mem_size);
+
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SCALING_MAT];
+ {
+ WORD32 scaling_mat_size;
+
+ SCALING_MAT_SIZE(scaling_mat_size)
+ ps_mem_rec->u4_mem_size = (MAX_SPS_CNT + MAX_PPS_CNT) * scaling_mat_size * sizeof(WORD16);
+ }
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SCALING_MAT,
+ ps_mem_rec->u4_mem_size);
+
+ /* Holds one row skip_flag at 8x8 level used during parsing */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_SKIP_FLAG];
+
+ /* 1 bit per 8x8 */
+ ps_mem_rec->u4_mem_size = max_num_cu_cols / 8;
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_SKIP_FLAG,
+ ps_mem_rec->u4_mem_size);
+
+    /* Holds one row of ct_depth at 8x8 level used during parsing */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_CT_DEPTH];
+
+ /* 2 bits per 8x8 */
+ ps_mem_rec->u4_mem_size = max_num_cu_cols / 4;
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_CT_DEPTH,
+ ps_mem_rec->u4_mem_size);
+
+    /* Holds intra prediction modes of top and left neighbors used during parsing */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_INTRA_PRED_MODE];
+
+ /* 8 bits per 4x4 */
+ /* 16 bytes each for top and left 64 pixels and 16 bytes for default mode */
+ ps_mem_rec->u4_mem_size = 3 * 16 * sizeof(UWORD8);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_INTRA_PRED_MODE,
+ ps_mem_rec->u4_mem_size);
+
+ /* Holds one intra mode at 8x8 level for entire picture */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_INTRA_FLAG];
+
+ /* 1 bit per 8x8 */
+ ps_mem_rec->u4_mem_size = (max_wd_luma / MIN_CU_SIZE) * (max_ht_luma / MIN_CU_SIZE) / 8;
+#ifdef GPU_BUILD
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_INTRA_FLAG,
+ ps_mem_rec->u4_mem_size);
+
+ /* Holds one transquant bypass flag at 8x8 level for entire picture */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TRANSQUANT_BYPASS_FLAG];
+
+ /* 1 bit per 8x8 */
+ /* Extra row and column are allocated for easy processing of top and left blocks while loop filtering */
+ ps_mem_rec->u4_mem_size = ((max_wd_luma + 64) / MIN_CU_SIZE) * ((max_ht_luma + 64) / MIN_CU_SIZE) / 8;
+#ifdef GPU_BUILD
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TRANSQUANT_BYPASS_FLAG,
+ ps_mem_rec->u4_mem_size);
+
+ /* Request memory to hold thread handles for each processing thread */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_THREAD_HANDLE];
+ ps_mem_rec->u4_mem_size = MAX_PROCESS_THREADS * ithread_get_handle_size();
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_THREAD_HANDLE,
+ ps_mem_rec->u4_mem_size);
+
+
+ {
+ WORD32 job_queue_size;
+ WORD32 num_jobs;
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_JOBQ];
+
+
+ /* One job per row of CTBs */
+ num_jobs = max_ctb_rows;
+
+        /* Each tile column contributes one job per row of CTBs, so num_jobs has to be multiplied by the number of tile columns */
+ num_jobs *= max_tile_cols;
+
+ /* One format convert/frame copy job per row of CTBs for non-shared mode*/
+ num_jobs += max_ctb_rows;
+
+#ifdef GPU_BUILD
+ num_jobs *= 2;
+#endif
+
+ job_queue_size = ihevcd_jobq_ctxt_size();
+ job_queue_size += num_jobs * sizeof(proc_job_t);
+ ps_mem_rec->u4_mem_size = job_queue_size;
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_JOBQ,
+ ps_mem_rec->u4_mem_size);
+ }
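+
+    /* Continuing the 1920x1088 example (with the macro values assumed above):
+     * num_jobs = 68 CTB rows * 8 tile columns + 68 format-conversion jobs
+     * = 612 job entries, in addition to the job queue context itself.
+     */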
+
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_MAP];
+ ps_mem_rec->u4_mem_size = max_ctb_cnt;
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PARSE_MAP,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_MAP];
+ ps_mem_rec->u4_mem_size = max_ctb_cnt;
+#ifdef GPU_BUILD
+ /* OpenCL PING PONG buffer */
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size *= 2;
+#endif
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_MAP,
+ ps_mem_rec->u4_mem_size);
+
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_DISP_MGR];
+
+ /* size for holding display manager context */
+ ps_mem_rec->u4_mem_size = sizeof(buf_mgr_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_DISP_MGR,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_DPB_MGR];
+
+ /* size for holding dpb manager context */
+ ps_mem_rec->u4_mem_size = sizeof(dpb_mgr_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_DPB_MGR,
+ ps_mem_rec->u4_mem_size);
+
+    /** Holds top and left neighbors' pu idx into the picture-level pu array */
+ /* Only one top row is enough but left has to be replicated for each process context */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PIC_PU_IDX_NEIGHBOR];
+
+ ps_mem_rec->u4_mem_size = (max_num_4x4_cols /* left */ + MAX_PROCESS_THREADS * (MAX_CTB_SIZE / 4)/* top */ + 1/* top right */) * sizeof(WORD32);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PIC_PU_IDX_NEIGHBOR,
+ ps_mem_rec->u4_mem_size);
+
+
+
+ /* TO hold scratch buffers needed for each process context */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH];
+ {
+ WORD32 size = 0;
+ WORD32 inter_pred_tmp_buf_size;
+ WORD32 ntaps_luma;
+ WORD32 pu_map_size;
+ WORD32 sao_size = 0;
+ ntaps_luma = 8;
+
+ /* Max inter pred size (number of bytes) */
+ inter_pred_tmp_buf_size = sizeof(WORD16) * (MAX_CTB_SIZE + ntaps_luma) * MAX_CTB_SIZE;
+ inter_pred_tmp_buf_size = ALIGN64(inter_pred_tmp_buf_size);
+
+
+ /* To hold pu_index w.r.t. frame level pu_t array for a CTB at 4x4 level*/
+        /* A 64 x 64 CTB holds 16 x 16 blocks at 4x4 level; two extra
+         * rows/columns are needed for holding neighbors (hence 18 * 18)
+         */
+ pu_map_size = sizeof(WORD32) * (18 * 18);
+
+ pu_map_size = ALIGN64(pu_map_size);
+ size += pu_map_size;
+
+ /* To hold inter pred temporary buffers */
+ size += 2 * inter_pred_tmp_buf_size;
+
+
+ /* Allocate for each process context */
+ size *= MAX_PROCESS_THREADS;
+
+
+#ifdef GPU_SAO_PING_PONG
+ /* To hold SAO left buffer for luma */
+ sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma)) * 2;
+
+ /* To hold SAO left buffer for chroma */
+ sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma)) * 2;
+
+ /* To hold SAO top buffer for luma */
+ sao_size += sizeof(UWORD8) * max_wd_luma * 2;
+
+ /* To hold SAO top buffer for chroma */
+ sao_size += sizeof(UWORD8) * max_wd_luma * 2;
+
+        /* To hold SAO top left luma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top left chroma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2 * 2;
+
+        /* To hold SAO top left luma pixel for current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top left chroma pixel for current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2 * 2;
+
+        /* To hold SAO top right luma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols * 2;
+
+        /* To hold SAO top right chroma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols * 2 * 2;
+
+        /* To hold SAO bottom left pixels for luma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO bottom left pixels for chroma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2 * 2;
+#else
+ /* To hold SAO left buffer for luma */
+ sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma));
+
+ /* To hold SAO left buffer for chroma */
+ sao_size += sizeof(UWORD8) * (MAX(max_ht_luma, max_wd_luma));
+
+ /* To hold SAO top buffer for luma */
+ sao_size += sizeof(UWORD8) * max_wd_luma;
+
+ /* To hold SAO top buffer for chroma */
+ sao_size += sizeof(UWORD8) * max_wd_luma;
+
+        /* To hold SAO top left luma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows;
+
+        /* To hold SAO top left chroma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top left luma pixel for current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows;
+
+        /* To hold SAO top left chroma pixel for current ctb - column array */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+
+        /* To hold SAO top right luma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols;
+
+        /* To hold SAO top right chroma pixel value for last output ctb in a row */
+        sao_size += sizeof(UWORD8) * max_ctb_cols * 2;
+
+        /* To hold SAO bottom left pixels for luma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows;
+
+        /* To hold SAO bottom left pixels for chroma */
+        sao_size += sizeof(UWORD8) * max_ctb_rows * 2;
+#endif
+ sao_size = ALIGN64(sao_size);
+ size += sao_size;
+ ps_mem_rec->u4_mem_size = size;
+ }
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_SCRATCH,
+ ps_mem_rec->u4_mem_size);
+
+ /* TO hold scratch buffers needed for each SAO context */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO_SCRATCH];
+ {
+ WORD32 size = 0;
+
+ size = 4 * MAX_CTB_SIZE * MAX_CTB_SIZE;
+
+ /* 2 temporary buffers*/
+ size *= 2;
+
+ size *= MAX_PROCESS_THREADS;
+
+ ps_mem_rec->u4_mem_size = size;
+ }
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SAO_SCRATCH,
+ ps_mem_rec->u4_mem_size);
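+
+    /* For reference (assuming MAX_CTB_SIZE = 64): each temporary SAO buffer
+     * above is 4 * 64 * 64 = 16384 bytes, i.e. 32768 bytes per process thread.
+     */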
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_BS_QP];
+ {
+ WORD32 size = 0;
+ WORD32 vert_bs_size, horz_bs_size;
+ WORD32 qp_const_flag_size;
+ WORD32 qp_size, num_8x8;
+
+ /* Max Number of vertical edges */
+ vert_bs_size = max_wd_luma / 8 + MAX_CTB_SIZE / 8;
+
+        /* Number of rows at MIN_TU_SIZE granularity - extra MAX_CTB_SIZE to handle the last 4 rows separately (shifted CTB processing) */
+ vert_bs_size *= (max_ht_luma + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+ /* Number of bytes */
+ vert_bs_size /= 8;
+
+ /* Two bits per edge */
+ vert_bs_size *= 2;
+
+ /* Max Number of horizontal edges */
+ horz_bs_size = max_ht_luma / 8 + MAX_CTB_SIZE / 8;
+
+        /* Number of columns at MIN_TU_SIZE granularity - extra MAX_CTB_SIZE to handle the last 4 columns separately (shifted CTB processing) */
+ horz_bs_size *= (max_wd_luma + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+ /* Number of bytes */
+ horz_bs_size /= 8;
+
+ /* Two bits per edge */
+ horz_bs_size *= 2;
+
+ /* Max CTBs in a row */
+        qp_const_flag_size = max_wd_luma / MIN_CTB_SIZE + 1 /* deblocking of the last CTB row is done one CTB row later */;
+
+ /* Max CTBs in a column */
+ qp_const_flag_size *= max_ht_luma / MIN_CTB_SIZE;
+
+ /* Number of bytes */
+ qp_const_flag_size = (qp_const_flag_size + 7) >> 3;
+
+ /* QP changes at CU level - So store at 8x8 level */
+ num_8x8 = (max_ht_luma * max_wd_luma) / (MIN_CU_SIZE * MIN_CU_SIZE);
+ qp_size = num_8x8;
+
+ /* To hold vertical boundary strength */
+ size += vert_bs_size;
+
+ /* To hold horizontal boundary strength */
+ size += horz_bs_size;
+
+ /* To hold QP */
+ size += qp_size;
+
+ /* To hold QP const in CTB flags */
+ size += qp_const_flag_size;
+
+ ps_mem_rec->u4_mem_size = size;
+#ifdef GPU_BUILD
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+ }
+
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BS_QP,
+ ps_mem_rec->u4_mem_size);
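+
+    /* Worked example for 1920x1088 (assuming MAX_CTB_SIZE = 64 and
+     * MIN_TU_SIZE = 4): vert_bs_size = (240 + 8) * 288 = 71424 edge units,
+     * at two bits each = 17856 bytes; horz_bs_size = (136 + 8) * 496 = 71424
+     * edge units = 17856 bytes as well at this resolution.
+     */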
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE_IDX];
+ {
+ WORD32 size = 0;
+ /* Max CTBs in a row */
+        size = max_wd_luma / MIN_CTB_SIZE + 2 /* Top row and bottom row extra. This ensures that accessing left/top in the first
+                                            row and right in the last row will not result in an invalid access */;
+ /* Max CTBs in a column */
+ size *= max_ht_luma / MIN_CTB_SIZE;
+
+ size *= sizeof(UWORD16);
+ ps_mem_rec->u4_mem_size = size;
+#ifdef GPU_BUILD
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec->u4_mem_size = ps_mem_rec->u4_mem_size * 2;
+#endif
+ }
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TILE_IDX,
+ ps_mem_rec->u4_mem_size);
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO];
+ {
+ UWORD32 size;
+
+ /* 4 bytes per color component per CTB */
+ size = 3 * 4;
+
+ /* MAX number of CTBs in a row */
+ size *= max_wd_luma / MIN_CTB_SIZE;
+
+ /* MAX number of CTBs in a column */
+ size *= max_ht_luma / MIN_CTB_SIZE;
+#ifdef GPU_BUILD
+        /* OpenCL ping-pong buffer: align the computed size before doubling */
+        size = ALIGN128(size);
+        ps_mem_rec->u4_mem_size = size * 2;
+#else
+ ps_mem_rec->u4_mem_size = size;
+#endif
+ }
+
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SAO,
+ ps_mem_rec->u4_mem_size);
+
+#ifdef GPU_BUILD
+ /* Memory record for GPU context */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_GPU];
+ ps_mem_rec->u4_mem_size = ihevcd_gpu_get_ctxt_size();
+
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_GPU,
+ ps_mem_rec->u4_mem_size);
+#endif
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_REF_PIC];
+
+ /* size for holding buffer manager context */
+ ps_mem_rec->u4_mem_size = sizeof(buf_mgr_t);
+
+ /* Size for holding pic_buf_t for each reference picture */
+ /* Note this allocation is done for BUF_MGR_MAX_CNT instead of
+ * max_dpb_size or MAX_DPB_SIZE for following reasons
+ * max_dpb_size will be based on max_wd and max_ht
+ * For higher max_wd and max_ht this number will be smaller than MAX_DPB_SIZE
+ * But during actual initialization number of buffers allocated can be more
+ *
+ * Also to handle display depth application can allocate more than what
+ * codec asks for in case of non-shared mode
+ * Since this is only a structure allocation and not actual buffer allocation,
+ * it is allocated for BUF_MGR_MAX_CNT entries
+ */
+ ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
+
+ /* In case of non-shared mode allocate for reference picture buffers */
+ if(0 == share_disp_buf)
+ {
+ UWORD32 num_reorder_frames_local = num_reorder_frames;
+#ifdef GPU_BUILD
+ // TODO GPU : Increment only if multicore.
+ num_reorder_frames_local += 1;
+#endif
+ /* Note: Number of luma samples is not max_wd * max_ht here, instead it is
+ * set to maximum number of luma samples allowed at the given level.
+ * This is done to ensure that any stream with width and height lesser
+ * than max_wd and max_ht is supported. Number of buffers required can be greater
+ * for lower width and heights at a given level and this increased number of buffers
+ * might require more memory than what max_wd and max_ht buffer would have required
+ * Number of buffers is doubled in order to return one frame at a time instead of sending
+ * multiple outputs during dpb full case.
+ * Also note one extra buffer is allocted to store current picture
+ * In case of asynchronous parsing and processing, number of buffers should increase here
+ * based on when parsing and processing threads are synchronized
+ */
+ ps_mem_rec->u4_mem_size +=
+ ihevcd_get_total_pic_buf_size(max_wd_luma * max_ht_luma, level, PAD_WD, PAD_HT,
+ num_ref_frames, num_reorder_frames_local);
+ }
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_REF_PIC,
+ ps_mem_rec->u4_mem_size);
+
+ /* Request memory to hold mem records to be returned during retrieve call */
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_BACKUP];
+ ps_mem_rec->u4_mem_size = MEM_REC_CNT * sizeof(iv_mem_rec_t);
+ DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BACKUP,
+ ps_mem_rec->u4_mem_size);
+
+ /* Each memtab size is aligned to next multiple of 128 bytes */
+ /* This is to ensure all the memtabs start at different cache lines */
+ ps_mem_rec = ps_mem_rec_base;
+ for(i = 0; i < MEM_REC_CNT; i++)
+ {
+ ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size);
+ ps_mem_rec++;
+ }
+ ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled = MEM_REC_CNT;
+#ifdef APPLY_CONCEALMENT
+ {
+ IV_API_CALL_STATUS_T status;
+ icncl_fill_mem_rec_ip_t cncl_fill_ip;
+ icncl_fill_mem_rec_op_t cncl_fill_op;
+ UWORD8 mem_loc = MEM_REC_CNT;
+
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.e_cmd = IV_CMD_FILL_NUM_MEM_REC;
+        cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location = &(ps_mem_rec_base[mem_loc]);
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_size = ps_mem_q_ip->s_ivd_fill_mem_rec_ip_t.u4_size;
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd = max_wd_luma;
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht = max_ht_luma;
+
+ status = icncl_api_function(NULL, (void *)&cncl_fill_ip, (void *)&cncl_fill_op);
+
+ if(IV_SUCCESS == status)
+ {
+ icncl_num_mem_rec_ip_t cncl_mem_ip;
+ icncl_num_mem_rec_op_t cncl_mem_op;
+
+ cncl_mem_ip.s_ivd_num_rec_ip_t.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+ cncl_mem_ip.s_ivd_num_rec_ip_t.u4_size = sizeof(icncl_num_mem_rec_ip_t);
+
+ status = icncl_api_function(NULL, (void *)&cncl_mem_ip, (void *)&cncl_mem_op);
+ if(IV_SUCCESS == status)
+ {
+ ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled += cncl_mem_op.s_ivd_num_mem_rec_op_t.u4_num_mem_rec;
+ }
+ }
+
+ return status;
+
+ }
+#endif //APPLY_CONCEALMENT
+ DEBUG("Num mem recs in fill call : %d\n",
+ ps_mem_q_op->s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled);
+
+
+ return (status);
+}
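+
+/* A minimal sketch (excluded from the build) of the allocation handshake an
+ * application is expected to follow using the two queries above. The entry
+ * point name ihevcd_cxa_api_function is an assumption here; the command names
+ * and the num-rec structures are taken from this file.
+ */
+#if 0
+static IV_API_CALL_STATUS_T example_alloc_flow(void)
+{
+    iv_num_mem_rec_ip_t s_num_ip;
+    iv_num_mem_rec_op_t s_num_op;
+
+    s_num_ip.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+    s_num_ip.u4_size = sizeof(iv_num_mem_rec_ip_t);
+    s_num_op.u4_size = sizeof(iv_num_mem_rec_op_t);
+    if(IV_SUCCESS != ihevcd_cxa_api_function(NULL, &s_num_ip, &s_num_op))
+        return IV_FAIL;
+
+    /* Next: allocate s_num_op.u4_num_mem_rec iv_mem_rec_t entries, issue
+     * IV_CMD_FILL_NUM_MEM_REC to learn each record's size and alignment,
+     * allocate each record honoring u4_mem_alignment and e_mem_type, and
+     * finally issue IV_CMD_INIT with the filled records.
+     */
+    return IV_SUCCESS;
+}
+#endif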
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Initializes from mem records passed to the codec
+*
+* @par Description:
+* Initializes pointers based on mem records passed
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_init_mem_rec(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ ihevcd_cxa_init_ip_t *dec_init_ip;
+ ihevcd_cxa_init_op_t *dec_init_op;
+ WORD32 i;
+ iv_mem_rec_t *ps_mem_rec, *ps_mem_rec_base;
+ WORD32 status = IV_SUCCESS;
+ codec_t *ps_codec;
+ WORD32 max_tile_cols, max_tile_rows;
+
+ dec_init_ip = (ihevcd_cxa_init_ip_t *)pv_api_ip;
+ dec_init_op = (ihevcd_cxa_init_op_t *)pv_api_op;
+
+ ps_mem_rec_base = dec_init_ip->s_ivd_init_ip_t.pv_mem_rec_location;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_CODEC];
+ ps_codec_obj->pv_codec_handle = ps_mem_rec->pv_base;
+
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+    /* Note this memset cannot be done in the init() call, since init will be
+    called during reset as well, and calling it during reset would mean all
+    pointers need to be reinitialized */
+ memset(ps_codec, 0, sizeof(codec_t));
+
+ if(dec_init_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t, i4_level))
+ {
+ ps_codec->i4_init_level = dec_init_ip->i4_level;
+
+ ps_codec->i4_init_level *= 3;
+ }
+ else
+ {
+ ps_codec->i4_init_level = MAX_LEVEL;
+ }
+
+ if(dec_init_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t, u4_num_ref_frames))
+ {
+ ps_codec->i4_init_num_ref = dec_init_ip->u4_num_ref_frames;
+ }
+ else
+ {
+ ps_codec->i4_init_num_ref = MAX_REF_CNT;
+ }
+
+ if(dec_init_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t, u4_num_reorder_frames))
+ {
+ ps_codec->i4_init_num_reorder = dec_init_ip->u4_num_reorder_frames;
+ }
+ else
+ {
+ ps_codec->i4_init_num_reorder = MAX_REF_CNT;
+ }
+
+ if(dec_init_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t, u4_num_extra_disp_buf))
+ {
+ ps_codec->i4_init_num_extra_disp_buf =
+ dec_init_ip->u4_num_extra_disp_buf;
+ }
+ else
+ {
+ ps_codec->i4_init_num_extra_disp_buf = 0;
+ }
+
+ if(dec_init_ip->s_ivd_init_ip_t.u4_size
+ > offsetof(ihevcd_cxa_init_ip_t, u4_share_disp_buf))
+ {
+#ifndef LOGO_EN
+ ps_codec->i4_share_disp_buf = dec_init_ip->u4_share_disp_buf;
+#else
+ ps_codec->i4_share_disp_buf = 0;
+#endif
+ }
+ else
+ {
+ ps_codec->i4_share_disp_buf = 0;
+ }
+ /* Shared display mode is supported only for 420SP and 420P formats */
+ if((dec_init_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420P) &&
+ (dec_init_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420SP_UV) &&
+ (dec_init_ip->s_ivd_init_ip_t.e_output_format != IV_YUV_420SP_VU))
+ {
+ ps_codec->i4_share_disp_buf = 0;
+ }
+
+ if((ps_codec->i4_init_level < MIN_LEVEL)
+ || (ps_codec->i4_init_level > MAX_LEVEL))
+ {
+ dec_init_op->s_ivd_init_op_t.u4_error_code |= IHEVCD_LEVEL_UNSUPPORTED;
+ return (IV_FAIL);
+ }
+
+ if(ps_codec->i4_init_num_ref > MAX_REF_CNT)
+ {
+ dec_init_op->s_ivd_init_op_t.u4_error_code |=
+ IHEVCD_NUM_REF_UNSUPPORTED;
+ ps_codec->i4_init_num_ref = MAX_REF_CNT;
+ }
+
+ if(ps_codec->i4_init_num_reorder > MAX_REF_CNT)
+ {
+ dec_init_op->s_ivd_init_op_t.u4_error_code |=
+ IHEVCD_NUM_REORDER_UNSUPPORTED;
+ ps_codec->i4_init_num_reorder = MAX_REF_CNT;
+ }
+
+ if(ps_codec->i4_init_num_extra_disp_buf > MAX_REF_CNT)
+ {
+ dec_init_op->s_ivd_init_op_t.u4_error_code |=
+ IHEVCD_NUM_EXTRA_DISP_UNSUPPORTED;
+ ps_codec->i4_init_num_extra_disp_buf = 0;
+ }
+
+ ps_codec->e_chroma_fmt = dec_init_ip->s_ivd_init_ip_t.e_output_format;
+
+ ps_codec->i4_max_wd = dec_init_ip->s_ivd_init_ip_t.u4_frm_max_wd;
+ ps_codec->i4_max_ht = dec_init_ip->s_ivd_init_ip_t.u4_frm_max_ht;
+
+ ps_codec->i4_max_wd = ALIGN64(ps_codec->i4_max_wd);
+ ps_codec->i4_max_ht = ALIGN64(ps_codec->i4_max_ht);
+
+ max_tile_cols = (ps_codec->i4_max_wd + MIN_TILE_WD - 1) / MIN_TILE_WD;
+ max_tile_rows = (ps_codec->i4_max_ht + MIN_TILE_HT - 1) / MIN_TILE_HT;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_BACKUP];
+ ps_codec->ps_mem_rec_backup = (iv_mem_rec_t *)ps_mem_rec->pv_base;
+
+ memcpy(ps_codec->ps_mem_rec_backup, ps_mem_rec_base,
+ MEM_REC_CNT * sizeof(iv_mem_rec_t));
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_BITSBUF];
+ ps_codec->pu1_bitsbuf = (UWORD8 *)ps_mem_rec->pv_base;
+ ps_codec->u4_bitsbuf_size = ps_mem_rec->u4_mem_size;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TU_DATA];
+#ifdef GPU_BUILD
+ ps_codec->apv_tu_data[0] = ps_mem_rec->pv_base;
+ ps_codec->apv_tu_data[1] = (void *)((UWORD8 *)ps_codec->apv_tu_data[0] + (ps_mem_rec->u4_mem_size / 2));
+#else
+ ps_codec->pv_tu_data = ps_mem_rec->pv_base;
+#endif
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBANK];
+ ps_codec->pv_mv_buf_mgr = ps_mem_rec->pv_base;
+ ps_codec->pv_mv_bank_buf_base = (UWORD8 *)ps_codec->pv_mv_buf_mgr + sizeof(buf_mgr_t);
+
+ ps_codec->i4_total_mv_bank_size = ps_mem_rec->u4_mem_size - sizeof(buf_mgr_t);
+
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_VPS];
+ ps_codec->ps_vps_base = (vps_t *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.ps_vps_base = ps_codec->ps_vps_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SPS];
+ ps_codec->ps_sps_base = (sps_t *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.ps_sps_base = ps_codec->ps_sps_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PPS];
+ ps_codec->ps_pps_base = (pps_t *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.ps_pps_base = ps_codec->ps_pps_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_HDR];
+#ifdef GPU_BUILD
+ ps_codec->aps_slice_hdr_base[0] = (slice_header_t *)ps_mem_rec->pv_base;
+ ps_codec->aps_slice_hdr_base[1] = (slice_header_t *)ps_mem_rec->pv_base + MAX_SLICE_HDR_CNT;
+#else
+ ps_codec->ps_slice_hdr_base = (slice_header_t *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.ps_slice_hdr_base = ps_codec->ps_slice_hdr_base;
+#endif
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE];
+ ps_codec->ps_tile = (tile_t *)ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTRY_OFST];
+ ps_codec->pi4_entry_ofst = (WORD32 *)ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SCALING_MAT];
+ ps_codec->pi2_scaling_mat = (WORD16 *)ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_SKIP_FLAG];
+ ps_codec->s_parse.pu4_skip_cu_top = (UWORD32 *)ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_CT_DEPTH];
+ ps_codec->s_parse.pu4_ct_depth_top = (UWORD32 *)ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_INTRA_PRED_MODE];
+ ps_codec->s_parse.pu1_luma_intra_pred_mode_left =
+ (UWORD8 *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.pu1_luma_intra_pred_mode_top =
+ (UWORD8 *)ps_mem_rec->pv_base + 16;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_INTRA_FLAG];
+
+#ifdef GPU_BUILD
+ ps_codec->apu1_pic_intra_flag[0] = ps_mem_rec->pv_base;
+ ps_codec->apu1_pic_intra_flag[1] = ps_codec->apu1_pic_intra_flag[0] + (ps_mem_rec->u4_mem_size / 2);
+#else
+ memset(ps_mem_rec->pv_base, 0, (ps_codec->i4_max_wd / MIN_CU_SIZE) * (ps_codec->i4_max_ht / MIN_CU_SIZE) / 8);
+
+ ps_codec->pu1_pic_intra_flag = (UWORD8 *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.pu1_pic_intra_flag = ps_codec->pu1_pic_intra_flag;
+#endif
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TRANSQUANT_BYPASS_FLAG];
+
+#ifdef GPU_BUILD
+ {
+ WORD32 loop_filter_strd = (ps_codec->i4_max_wd + 63) >> 6;
+
+ /* The offset is added for easy processing of top and left blocks while loop filtering */
+ ps_codec->apu1_pic_no_loop_filter_flag[0] = (UWORD8 *)ps_mem_rec->pv_base + loop_filter_strd + 1;
+ ps_codec->apu1_pic_no_loop_filter_flag[1] = (UWORD8 *)ps_mem_rec->pv_base + (ps_mem_rec->u4_mem_size / 2) + loop_filter_strd + 1;
+ }
+#else
+ {
+ WORD32 loop_filter_size = ((ps_codec->i4_max_wd + 64) / MIN_CU_SIZE) * ((ps_codec->i4_max_ht + 64) / MIN_CU_SIZE) / 8;
+ WORD32 loop_filter_strd = (ps_codec->i4_max_wd + 63) >> 6;
+
+ memset(ps_mem_rec->pv_base, 0, loop_filter_size);
+
+ /* The offset is added for easy processing of top and left blocks while loop filtering */
+ ps_codec->pu1_pic_no_loop_filter_flag = (UWORD8 *)ps_mem_rec->pv_base + loop_filter_strd + 1;
+ ps_codec->s_parse.pu1_pic_no_loop_filter_flag = ps_codec->pu1_pic_no_loop_filter_flag;
+ ps_codec->s_parse.s_deblk_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->pu1_pic_no_loop_filter_flag;
+ ps_codec->s_parse.s_sao_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->pu1_pic_no_loop_filter_flag;
+ }
+#endif
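+
+    /* For reference: with max_wd = 1920, loop_filter_strd works out to
+     * (1920 + 63) >> 6 = 30 bytes per row (one bit per 8x8 block, so one
+     * byte covers 64 pixels of width); the + loop_filter_strd + 1 offset
+     * therefore leaves one guard row and one guard byte for the top and
+     * left neighbor accesses mentioned above.
+     */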
+
+ /* Initialize pointers in PPS structures */
+ {
+ sps_t *ps_sps = ps_codec->ps_sps_base;
+ pps_t *ps_pps = ps_codec->ps_pps_base;
+ tile_t *ps_tile = ps_codec->ps_tile;
+ WORD16 *pi2_scaling_mat = ps_codec->pi2_scaling_mat;
+ WORD32 scaling_mat_size;
+
+ SCALING_MAT_SIZE(scaling_mat_size);
+
+ for(i = 0; i < MAX_SPS_CNT; i++)
+ {
+ ps_sps->pi2_scaling_mat = pi2_scaling_mat;
+ pi2_scaling_mat += scaling_mat_size;
+ ps_sps++;
+ }
+
+ for(i = 0; i < MAX_PPS_CNT; i++)
+ {
+ ps_pps->ps_tile = ps_tile;
+ ps_tile += (max_tile_cols * max_tile_rows);
+
+ ps_pps->pi2_scaling_mat = pi2_scaling_mat;
+ pi2_scaling_mat += scaling_mat_size;
+ ps_pps++;
+ }
+
+ }
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_THREAD_HANDLE];
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ WORD32 handle_size = ithread_get_handle_size();
+ ps_codec->apv_process_thread_handle[i] =
+ (UWORD8 *)ps_mem_rec->pv_base + (i * handle_size);
+ }
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_JOBQ];
+ ps_codec->pv_proc_jobq_buf = ps_mem_rec->pv_base;
+ ps_codec->i4_proc_jobq_buf_size = ps_mem_rec->u4_mem_size;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PARSE_MAP];
+ ps_codec->pu1_parse_map = (UWORD8 *)ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_MAP];
+#ifdef GPU_BUILD
+ memset(ps_mem_rec->pv_base, 0, ps_mem_rec->u4_mem_size);
+ ps_codec->apu1_proc_map[0] = (UWORD8 *)ps_mem_rec->pv_base;
+ ps_codec->apu1_proc_map[1] = (UWORD8 *)ps_mem_rec->pv_base + (ps_mem_rec->u4_mem_size / 2);
+#else
+ ps_codec->pu1_proc_map = (UWORD8 *)ps_mem_rec->pv_base;
+#endif
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_DISP_MGR];
+ ps_codec->pv_disp_buf_mgr = ps_mem_rec->pv_base;
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_DPB_MGR];
+ ps_codec->pv_dpb_mgr = ps_mem_rec->pv_base;
+
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PIC_PU_IDX_NEIGHBOR];
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ UWORD32 *pu4_buf = (UWORD32 *)ps_mem_rec->pv_base;
+ ps_codec->as_process[i].pu4_pic_pu_idx_left = pu4_buf + i * (MAX_CTB_SIZE / 4);
+ memset(ps_codec->as_process[i].pu4_pic_pu_idx_left, 0, sizeof(UWORD32) * MAX_CTB_SIZE / 4);
+ ps_codec->as_process[i].pu4_pic_pu_idx_top = pu4_buf + MAX_PROCESS_THREADS * (MAX_CTB_SIZE / 4);
+ }
+ memset(ps_codec->as_process[0].pu4_pic_pu_idx_top, 0, sizeof(UWORD32) * (ps_codec->i4_max_wd / 4 + 1));
+
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH];
+ {
+ UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+ WORD32 pic_pu_idx_map_size;
+
+ WORD32 inter_pred_tmp_buf_size, ntaps_luma;
+
+ /* Max inter pred size */
+ ntaps_luma = 8;
+ inter_pred_tmp_buf_size = sizeof(WORD16) * (MAX_CTB_SIZE + ntaps_luma) * MAX_CTB_SIZE;
+
+ inter_pred_tmp_buf_size = ALIGN64(inter_pred_tmp_buf_size);
+
+ /* To hold pu_index w.r.t. frame level pu_t array for a CTB */
+ pic_pu_idx_map_size = sizeof(WORD32) * (18 * 18);
+ pic_pu_idx_map_size = ALIGN64(pic_pu_idx_map_size);
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].pi2_inter_pred_tmp_buf1 = (WORD16 *)pu1_buf;
+ pu1_buf += inter_pred_tmp_buf_size;
+
+ ps_codec->as_process[i].pi2_inter_pred_tmp_buf2 = (WORD16 *)pu1_buf;
+ pu1_buf += inter_pred_tmp_buf_size;
+
+ /* Inverse transform intermediate and inverse scan output buffers reuse inter pred scratch buffers */
+ ps_codec->as_process[i].pi2_itrans_intrmd_buf =
+ ps_codec->as_process[i].pi2_inter_pred_tmp_buf2;
+ ps_codec->as_process[i].pi2_invscan_out =
+ ps_codec->as_process[i].pi2_inter_pred_tmp_buf1;
+
+ ps_codec->as_process[i].pu4_pic_pu_idx_map = (UWORD32 *)pu1_buf;
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_pu_idx_map =
+ (UWORD32 *)pu1_buf;
+ pu1_buf += pic_pu_idx_map_size;
+
+ // ps_codec->as_process[i].pi2_inter_pred_tmp_buf3 = (WORD16 *)pu1_buf;
+ // pu1_buf += inter_pred_tmp_buf_size;
+
+ ps_codec->as_process[i].i4_inter_pred_tmp_buf_strd = MAX_CTB_SIZE;
+
+ }
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_left_luma = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_left_luma = (UWORD8 *)pu1_buf;
+ pu1_buf += MAX(ps_codec->i4_max_ht, ps_codec->i4_max_wd);
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_left_chroma = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_left_chroma = (UWORD8 *)pu1_buf;
+ pu1_buf += MAX(ps_codec->i4_max_ht, ps_codec->i4_max_wd);
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_luma = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_luma = (UWORD8 *)pu1_buf;
+ pu1_buf += ps_codec->i4_max_wd;
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_chroma = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_chroma = (UWORD8 *)pu1_buf;
+ pu1_buf += ps_codec->i4_max_wd;
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_luma_top_left_ctb = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_luma_top_left_ctb = (UWORD8 *)pu1_buf;
+ pu1_buf += ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_chroma_top_left_ctb = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_chroma_top_left_ctb = (UWORD8 *)pu1_buf;
+ pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE) * 2;
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_luma_curr_ctb = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_luma_curr_ctb = (UWORD8 *)pu1_buf;
+ pu1_buf += ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_chroma_curr_ctb = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_chroma_curr_ctb = (UWORD8 *)pu1_buf;
+
+ pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE) * 2;
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_luma_top_right = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_luma_top_right = (UWORD8 *)pu1_buf;
+
+ pu1_buf += ps_codec->i4_max_wd / MIN_CTB_SIZE;
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_chroma_top_right = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_chroma_top_right = (UWORD8 *)pu1_buf;
+
+ pu1_buf += (ps_codec->i4_max_wd / MIN_CTB_SIZE) * 2;
+
+ /* Per CTB, store 1 value for luma, 2 values for chroma */
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_luma_bot_left = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_luma_bot_left = (UWORD8 *)pu1_buf;
+
+ pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE);
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_sao_src_top_left_chroma_bot_left = (UWORD8 *)pu1_buf;
+ }
+ ps_codec->s_parse.s_sao_ctxt.pu1_sao_src_top_left_chroma_bot_left = (UWORD8 *)pu1_buf;
+
+ pu1_buf += (ps_codec->i4_max_ht / MIN_CTB_SIZE) * 2;
+ }
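+ /* The block above carves a single scratch allocation into per-thread and
+ * shared sub-buffers by walking a byte pointer. A minimal sketch of the
+ * pattern (hypothetical helper, not part of this decoder):
+ */
+#if 0 /* illustrative only */
+ static void carve_scratch(UWORD8 *pu1_base, WORD32 num_threads,
+                           WORD16 **ppi2_tmp, UWORD8 **ppu1_shared)
+ {
+     WORD32 i;
+     WORD32 tmp_size = ALIGN64(sizeof(WORD16) * (MAX_CTB_SIZE + 8) * MAX_CTB_SIZE);
+
+     /* Private region: each thread gets its own slice, pointer advances */
+     for(i = 0; i < num_threads; i++)
+     {
+         ppi2_tmp[i] = (WORD16 *)pu1_base;
+         pu1_base += tmp_size;
+     }
+     /* Shared region: every thread gets the same pointer, advanced once */
+     for(i = 0; i < num_threads; i++)
+         ppu1_shared[i] = pu1_base;
+ }
+#endif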
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO_SCRATCH];
+ {
+ UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.pu1_tmp_buf_luma = (UWORD8 *)pu1_buf;
+ pu1_buf += 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8);
+
+ ps_codec->as_process[i].s_sao_ctxt.pu1_tmp_buf_chroma = (UWORD8 *)pu1_buf;
+ pu1_buf += 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8);
+ }
+ }
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_BS_QP];
+ {
+ UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+ WORD32 vert_bs_size, horz_bs_size;
+ WORD32 qp_const_flag_size;
+ WORD32 qp_size;
+ WORD32 num_8x8;
+
+ /* Max Number of vertical edges */
+ vert_bs_size = ps_codec->i4_max_wd / 8 + MAX_CTB_SIZE / 8;
+
+ /* Multiply by the number of edge rows - the extra MAX_CTB_SIZE handles the last 4 rows separately (shifted CTB processing) */
+ vert_bs_size *= (ps_codec->i4_max_ht + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+ /* Number of bytes */
+ vert_bs_size /= 8;
+
+ /* Two bits per edge */
+ vert_bs_size *= 2;
+
+ /* Max Number of horizontal edges */
+ horz_bs_size = ps_codec->i4_max_ht / 8 + MAX_CTB_SIZE / 8;
+
+ /* Multiply by the number of edge columns - the extra MAX_CTB_SIZE handles the last 4 columns separately (shifted CTB processing) */
+ horz_bs_size *= (ps_codec->i4_max_wd + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+ /* Number of bytes */
+ horz_bs_size /= 8;
+
+ /* Two bits per edge */
+ horz_bs_size *= 2;
+
+ /* Max CTBs in a row */
+ qp_const_flag_size = ps_codec->i4_max_wd / MIN_CTB_SIZE + 1 /* The last ctb row deblk is done in last ctb + 1 row.*/;
+
+ /* Max CTBs in a column */
+ qp_const_flag_size *= ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+ /* Number of bytes */
+ qp_const_flag_size /= 8;
+
+ /* QP changes at CU level - So store at 8x8 level */
+ num_8x8 = (ps_codec->i4_max_ht * ps_codec->i4_max_wd) / (MIN_CU_SIZE * MIN_CU_SIZE);
+ qp_size = num_8x8;
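+ /* Worked example (illustrative numbers only; assumes MAX_CTB_SIZE = 64,
+ * MIN_TU_SIZE = 4, MIN_CU_SIZE = 8, MIN_CTB_SIZE = 16 and a 1920x1088
+ * max frame - these values are not taken from this file):
+ * vert_bs_size = (1920/8 + 64/8) = 248 edge columns
+ * *= (1088 + 64)/4 = 288 rows -> 71424 edges
+ * /= 8, *= 2 (two bits per edge) -> 17856 bytes
+ * horz_bs_size = (1088/8 + 64/8) = 144 edge rows
+ * *= (1920 + 64)/4 = 496 columns -> 71424 edges
+ * /= 8, *= 2 -> 17856 bytes
+ * qp_size = (1088*1920)/(8*8) = 32640 bytes
+ * qp_const_flag_size = (1920/16 + 1) * (1088/16) / 8 = 1028 bytes
+ */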
+#ifndef GPU_BUILD
+ memset(pu1_buf, 0, vert_bs_size + horz_bs_size + qp_size + qp_const_flag_size);
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+ pu1_buf += vert_bs_size;
+
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+ pu1_buf += horz_bs_size;
+
+ ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_size;
+
+ ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_const_flag_size;
+
+ pu1_buf -= (vert_bs_size + horz_bs_size + qp_size + qp_const_flag_size);
+ }
+#endif
+#ifdef GPU_BUILD
+ ps_codec->apu4_pic_vert_bs[0] = (UWORD32 *)pu1_buf;
+ pu1_buf += vert_bs_size;
+
+ ps_codec->apu4_pic_horz_bs[0] = (UWORD32 *)pu1_buf;
+ pu1_buf += horz_bs_size;
+
+ ps_codec->apu1_pic_qp[0] = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_size;
+
+ ps_codec->apu1_pic_qp_const_in_ctb[0] = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_const_flag_size;
+
+ ps_codec->apu4_pic_vert_bs[1] = (UWORD32 *)pu1_buf;
+ pu1_buf += vert_bs_size;
+
+ ps_codec->apu4_pic_horz_bs[1] = (UWORD32 *)pu1_buf;
+ pu1_buf += horz_bs_size;
+
+ ps_codec->apu1_pic_qp[1] = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_size;
+
+ ps_codec->apu1_pic_qp_const_in_ctb[1] = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_const_flag_size;
+#else
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)pu1_buf;
+ pu1_buf += vert_bs_size;
+
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)pu1_buf;
+ pu1_buf += horz_bs_size;
+
+ ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_size;
+
+ ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)pu1_buf;
+ pu1_buf += qp_const_flag_size;
+#endif
+
+ }
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_TILE_IDX];
+ {
+#ifdef GPU_BUILD
+ UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+ for(i = 0; i < 2; i++)
+ {
+ ps_codec->as_process[i].pu1_tile_idx = (UWORD16 *)pu1_buf;
+ }
+
+ pu1_buf += ps_mem_rec->u4_mem_size / 2;
+
+ for(i = 2; i < 4; i++)
+ {
+ ps_codec->as_process[i].pu1_tile_idx = (UWORD16 *)pu1_buf;
+ }
+#else
+ UWORD8 *pu1_buf = (UWORD8 *)ps_mem_rec->pv_base;
+
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].pu1_tile_idx = (UWORD16 *)pu1_buf + ps_codec->i4_max_wd / MIN_CTB_SIZE /* Offset 1 row */;
+ }
+#endif
+ }
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_SAO];
+#ifdef GPU_BUILD
+ memset(ps_mem_rec->pv_base, 0, ps_mem_rec->u4_mem_size);
+ ps_codec->aps_pic_sao[0] = (sao_t *)ps_mem_rec->pv_base;
+ ps_codec->aps_pic_sao[1] = (sao_t *)((UWORD8 *)ps_mem_rec->pv_base + ps_mem_rec->u4_mem_size / 2);
+#else
+ ps_codec->s_parse.ps_pic_sao = (sao_t *)ps_mem_rec->pv_base;
+ ps_codec->s_parse.s_sao_ctxt.ps_pic_sao = (sao_t *)ps_mem_rec->pv_base;
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].s_sao_ctxt.ps_pic_sao = ps_codec->s_parse.ps_pic_sao;
+ }
+#endif
+#ifdef GPU_BUILD
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_GPU];
+ ps_codec->s_gpu_ctxt.pv_gpu_priv = ps_mem_rec->pv_base;
+
+
+ /* Initialize OpenCL device */
+ /* Call GPU init before codec init so that reference frame buffers can be allocated */
+ status = ihevcd_gpu_mc_init(ps_codec);
+ RETURN_IF((status == IV_FAIL), IV_FAIL);
+#endif
+
+ ps_mem_rec = &ps_mem_rec_base[MEM_REC_REF_PIC];
+#if defined(GPU_BUILD) && !defined(FRAME_STAGGER_ONLY)
+ ps_codec->pv_pic_buf_mgr = ihevcd_gpu_alloc_ref_buf(ps_codec,
+ ps_mem_rec->u4_mem_alignment,
+ ps_mem_rec->u4_mem_size);
+ RETURN_IF((ps_codec->pv_pic_buf_mgr == NULL), IV_FAIL);
+#else
+ ps_codec->pv_pic_buf_mgr = ps_mem_rec->pv_base;
+#endif
+ ps_codec->pv_pic_buf_base = (UWORD8 *)ps_codec->pv_pic_buf_mgr + sizeof(buf_mgr_t);
+ ps_codec->i4_total_pic_buf_size = ps_mem_rec->u4_mem_size - sizeof(buf_mgr_t);
+
+
+
+
+
+#ifdef APPLY_CONCEALMENT
+ {
+
+ UWORD32 mem_loc;
+
+ icncl_init_ip_t cncl_init_ip;
+ icncl_init_op_t cncl_init_op;
+ iv_mem_rec_t *ps_mem_rec;
+
+ ps_mem_rec = dec_init_ip->s_ivd_init_ip_t.pv_mem_rec_location;
+ mem_loc = MEM_REC_CNT;
+
+ ps_codec->ps_conceal = (iv_obj_t *)ps_mem_rec[mem_loc].pv_base;
+ ps_codec->i4_first_frame_done = 0;
+
+ cncl_init_ip.u4_size = sizeof(icncl_init_ip_t);
+ cncl_init_ip.pv_mem_rec_location = &(ps_mem_rec[mem_loc]);
+ cncl_init_ip.e_cmd = IV_CMD_INIT;
+
+ status = icncl_api_function(ps_codec->ps_conceal, (void *)&cncl_init_ip, (void *)&cncl_init_op);
+
+ }
+#endif //APPLY_CONCEALMENT
+
+ status = ihevcd_init(ps_codec);
+
+ TRACE_INIT(NULL);
+ STATS_INIT();
+ return status;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Retrieves mem records passed to the codec
+*
+* @par Description:
+* Retrieves memrecs passed earlier
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_retrieve_memrec(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ iv_retrieve_mem_rec_ip_t *dec_clr_ip;
+ iv_retrieve_mem_rec_op_t *dec_clr_op;
+ codec_t *ps_codec;
+ dec_clr_ip = (iv_retrieve_mem_rec_ip_t *)pv_api_ip;
+ dec_clr_op = (iv_retrieve_mem_rec_op_t *)pv_api_op;
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+#ifdef GPU_BUILD
+ ihevcd_gpu_mc_deinit(&ps_codec->s_gpu_ctxt);
+
+#endif
+
+ if(ps_codec->i4_init_done != 1)
+ {
+ dec_clr_op->u4_error_code |= 1 << IVD_FATALERROR;
+ dec_clr_op->u4_error_code |= IHEVCD_INIT_NOT_DONE;
+ return IV_FAIL;
+ }
+
+ memcpy(dec_clr_ip->pv_mem_rec_location, ps_codec->ps_mem_rec_backup,
+ MEM_REC_CNT * (sizeof(iv_mem_rec_t)));
+ dec_clr_op->u4_num_mem_rec_filled = MEM_REC_CNT;
+
+#ifdef APPLY_CONCEALMENT
+ {
+ IV_API_CALL_STATUS_T status;
+ icncl_fill_mem_rec_ip_t cncl_fill_ip;
+ icncl_fill_mem_rec_op_t cncl_fill_op;
+
+ iv_mem_rec_t *ps_mem_rec;
+
+ UWORD8 mem_loc = MEM_REC_CNT;
+ UWORD8 num_cncl_mem = 0;
+
+ ps_mem_rec = dec_clr_ip->pv_mem_rec_location;
+
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.e_cmd = IV_CMD_FILL_NUM_MEM_REC;
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location = &(ps_mem_rec[mem_loc]);
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.u4_size = sizeof(icncl_fill_mem_rec_ip_t);
+
+ status = icncl_api_function(NULL, (void *)&cncl_fill_ip, (void *)&cncl_fill_op);
+
+ cncl_fill_ip.s_ivd_fill_mem_rec_ip_t.e_cmd = IV_CMD_RETRIEVE_MEMREC;
+ cncl_fill_op.s_ivd_fill_mem_rec_op_t.u4_size = sizeof(icncl_fill_mem_rec_op_t);
+
+ status = icncl_api_function(ps_codec->ps_conceal, (void *)&cncl_fill_ip, (void *)&cncl_fill_op);
+
+ if(status == IV_SUCCESS)
+ {
+ /* Add the concealment library's memory requirements */
+ dec_clr_op->u4_num_mem_rec_filled += cncl_fill_op.s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled;
+ }
+ }
+#endif //APPLY_CONCEALMENT
+ DEBUG("Retrieve num mem recs: %d\n",
+ dec_clr_op->u4_num_mem_rec_filled);
+ STATS_PRINT();
+ ihevcd_jobq_free((jobq_t *)ps_codec->pv_proc_jobq);
+
+
+
+ return IV_SUCCESS;
+
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Passes display buffer from application to codec
+*
+* @par Description:
+* Adds display buffer to the codec
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_display_frame(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ WORD32 ret = IV_SUCCESS;
+
+ ivd_set_display_frame_ip_t *ps_dec_disp_ip;
+ ivd_set_display_frame_op_t *ps_dec_disp_op;
+
+ WORD32 i;
+
+ codec_t *ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+ ps_dec_disp_ip = (ivd_set_display_frame_ip_t *)pv_api_ip;
+ ps_dec_disp_op = (ivd_set_display_frame_op_t *)pv_api_op;
+
+ ps_codec->i4_num_disp_bufs = 0;
+ if(ps_codec->i4_share_disp_buf)
+ {
+ UWORD32 num_bufs = ps_dec_disp_ip->num_disp_bufs;
+ pic_buf_t *ps_pic_buf;
+ UWORD8 *pu1_buf;
+ WORD32 buf_ret;
+ WORD32 strd;
+ strd = ps_codec->i4_strd;
+ if(0 == strd)
+ strd = ps_codec->i4_max_wd + PAD_WD;
+ num_bufs = MIN(num_bufs, BUF_MGR_MAX_CNT);
+ ps_codec->i4_num_disp_bufs = num_bufs;
+
+ ps_pic_buf = (pic_buf_t *)ps_codec->ps_pic_buf;
+ for(i = 0; i < (WORD32)num_bufs; i++)
+ {
+ pu1_buf = ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[0];
+ ps_pic_buf->pu1_luma = pu1_buf + strd * PAD_TOP + PAD_LEFT;
+
+ pu1_buf = ps_dec_disp_ip->s_disp_buffer[i].pu1_bufs[1];
+ ps_pic_buf->pu1_chroma = pu1_buf + strd * (PAD_TOP / 2) + PAD_LEFT;
+
+ buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf, i);
+
+ if(0 != buf_ret)
+ {
+ ps_codec->i4_error_code = IHEVCD_BUF_MGR_ERROR;
+ return IHEVCD_BUF_MGR_ERROR;
+ }
+
+ /* Mark pic buf as needed for display */
+ /* This ensures that until the buffer is explicitly passed back to the
+ * codec, the application owns it. The decoder may use a buffer only after
+ * the application sends it through the OMX fill-this-buffer call
+ */
+ ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, i, BUF_MGR_DISP);
+
+ ps_pic_buf++;
+
+ }
+ }
+
+ ps_dec_disp_op->u4_error_code = 0;
+ return ret;
+
+}
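+/* Usage sketch (hypothetical application code, not part of this file): in
+ * shared display buffer mode the application registers its buffers once via
+ * IVD_CMD_SET_DISPLAY_FRAME. The structure names are from ivd.h; the buffer
+ * wiring below is an assumption for illustration.
+ */
+#if 0 /* illustrative only */
+static IV_API_CALL_STATUS_T app_set_disp_bufs(iv_obj_t *ps_codec_obj,
+                                              UWORD8 **ppu1_luma,
+                                              UWORD8 **ppu1_chroma,
+                                              UWORD32 num_bufs)
+{
+    ivd_set_display_frame_ip_t s_ip = {0};
+    ivd_set_display_frame_op_t s_op = {0};
+    UWORD32 i;
+
+    s_ip.u4_size = sizeof(ivd_set_display_frame_ip_t);
+    s_ip.e_cmd = IVD_CMD_SET_DISPLAY_FRAME;
+    s_ip.num_disp_bufs = num_bufs;
+    s_op.u4_size = sizeof(ivd_set_display_frame_op_t);
+
+    for(i = 0; i < num_bufs; i++)
+    {
+        s_ip.s_disp_buffer[i].pu1_bufs[0] = ppu1_luma[i];   /* Y plane  */
+        s_ip.s_disp_buffer[i].pu1_bufs[1] = ppu1_chroma[i]; /* UV plane */
+    }
+    return ihevcd_cxa_api_function(ps_codec_obj, &s_ip, &s_op);
+}
+#endif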
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets the decoder in flush mode. The decoder comes out of flush only
+* after returning all buffers, or at reset
+*
+* @par Description:
+* Sets the decoder in flush mode
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_flush_mode(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ codec_t *ps_codec;
+ ivd_ctl_flush_op_t *ps_ctl_op = (ivd_ctl_flush_op_t *)pv_api_op;
+ UNUSED(pv_api_ip);
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+ /* Signal flush frame control call */
+ ps_codec->i4_flush_mode = 1;
+
+ ps_ctl_op->u4_error_code = 0;
+
+ /* Set pic count to zero, so that decoder starts buffering again */
+ /* once it comes out of flush mode */
+ ps_codec->u4_pic_cnt = 0;
+ ps_codec->u4_disp_cnt = 0;
+ return IV_SUCCESS;
+
+
+}
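+/* Usage sketch (hypothetical application code): flush is requested through
+ * the generic control path, so an application fills the ivd control
+ * structures and routes the call through ihevcd_cxa_api_function() rather
+ * than calling ihevcd_set_flush_mode() directly.
+ */
+#if 0 /* illustrative only */
+static IV_API_CALL_STATUS_T app_request_flush(iv_obj_t *ps_codec_obj)
+{
+    ivd_ctl_flush_ip_t s_ip = {0};
+    ivd_ctl_flush_op_t s_op = {0};
+
+    s_ip.u4_size = sizeof(ivd_ctl_flush_ip_t);
+    s_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ip.e_sub_cmd = IVD_CMD_CTL_FLUSH;
+    s_op.u4_size = sizeof(ivd_ctl_flush_op_t);
+
+    /* After this, decode calls return buffered pictures until flush ends */
+    return ihevcd_cxa_api_function(ps_codec_obj, &s_ip, &s_op);
+}
+#endif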
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets decoder status and buffer requirements
+*
+* @par Description:
+* Gets the decoder status
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_get_status(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ WORD32 i;
+ codec_t *ps_codec;
+ WORD32 wd, ht;
+ ivd_ctl_getstatus_op_t *ps_ctl_op = (ivd_ctl_getstatus_op_t *)pv_api_op;
+
+ UNUSED(pv_api_ip);
+
+ ps_ctl_op->u4_error_code = 0;
+
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+ ps_ctl_op->u4_min_num_in_bufs = MIN_IN_BUFS;
+ if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420;
+ else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_422ILE;
+ else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGB565;
+ else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGBA8888;
+ else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+ || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420SP;
+
+ ps_ctl_op->u4_num_disp_bufs = 1;
+
+ for(i = 0; i < (WORD32)ps_ctl_op->u4_min_num_in_bufs; i++)
+ {
+ ps_ctl_op->u4_min_in_buf_size[i] = MAX((ps_codec->i4_wd * ps_codec->i4_ht), MIN_BITSBUF_SIZE);
+ }
+
+ wd = ps_codec->i4_wd;
+ ht = ps_codec->i4_ht;
+
+ if(ps_codec->i4_sps_done)
+ {
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ wd = ps_codec->i4_disp_wd;
+ ht = ps_codec->i4_disp_ht;
+
+ }
+ else
+ {
+ wd = ps_codec->i4_disp_strd;
+ ht = ps_codec->i4_ht + PAD_HT;
+ }
+ }
+ else
+ {
+ if(1 == ps_codec->i4_share_disp_buf)
+ {
+ wd = ALIGN32(wd + PAD_WD);
+ ht += PAD_HT;
+ }
+ }
+
+ if(ps_codec->i4_disp_strd > wd)
+ wd = ps_codec->i4_disp_strd;
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ ps_ctl_op->u4_num_disp_bufs = 1;
+ else
+ {
+ WORD32 pic_size;
+ WORD32 max_dpb_size;
+
+ if(ps_codec->i4_sps_done)
+ {
+ sps_t *ps_sps = (ps_codec->s_parse.ps_sps_base + ps_codec->i4_sps_id);
+ WORD32 reorder_pic_cnt;
+ WORD32 ref_pic_cnt;
+ WORD32 level;
+
+ reorder_pic_cnt = MIN(ps_sps->ai1_sps_max_num_reorder_pics[0], ps_codec->i4_init_num_reorder);
+ pic_size = ps_sps->i2_pic_width_in_luma_samples * ps_sps->i2_pic_height_in_luma_samples;
+
+ level = ps_codec->i4_init_level;
+ max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+ ref_pic_cnt = max_dpb_size;
+ ps_ctl_op->u4_num_disp_bufs = reorder_pic_cnt;
+
+ ps_ctl_op->u4_num_disp_bufs += ref_pic_cnt + 1;
+
+ }
+ else
+ {
+ pic_size = ps_codec->i4_max_wd * ps_codec->i4_max_ht;
+ max_dpb_size = ihevcd_get_dpb_size(ps_codec->i4_init_level, pic_size);
+ ps_ctl_op->u4_num_disp_bufs = 2 * max_dpb_size;
+
+ ps_ctl_op->u4_num_disp_bufs = MIN(ps_ctl_op->u4_num_disp_bufs,
+ (ps_codec->i4_init_num_ref + ps_codec->i4_init_num_reorder + 1));
+
+ }
+
+ ps_ctl_op->u4_num_disp_bufs = MIN(
+ ps_ctl_op->u4_num_disp_bufs, 32);
+ }
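+ /* Illustration with hypothetical numbers: if the SPS reports
+ * sps_max_num_reorder_pics = 2, the application requested
+ * i4_init_num_reorder = 4, and ihevcd_get_dpb_size() returns 6 for this
+ * level and picture size, then u4_num_disp_bufs = MIN(2, 4) + 6 + 1 = 9.
+ */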
+
+ if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+ ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 2;
+ ps_ctl_op->u4_min_out_buf_size[2] = (wd * ht) >> 2;
+ }
+ else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+ ps_ctl_op->u4_min_out_buf_size[1] =
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+ ps_ctl_op->u4_min_out_buf_size[1] =
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 4;
+ ps_ctl_op->u4_min_out_buf_size[1] =
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+ || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+ ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 1;
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
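+ /* e.g. (illustrative) wd = 1920, ht = 1088 in IV_YUV_420SP_UV gives
+ * buf[0] (Y) = 1920*1088 = 2088960 bytes and buf[1] (interleaved UV) =
+ * 1044480 bytes. */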
+ ps_ctl_op->u4_pic_ht = ht;
+ ps_ctl_op->u4_pic_wd = wd;
+ ps_ctl_op->u4_frame_rate = 30000;
+ ps_ctl_op->u4_bit_rate = 1000000;
+ ps_ctl_op->e_content_type = IV_PROGRESSIVE;
+ ps_ctl_op->e_output_chroma_format = ps_codec->e_chroma_fmt;
+ ps_codec->i4_num_disp_bufs = ps_ctl_op->u4_num_disp_bufs;
+ return IV_SUCCESS;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Gets decoder buffer requirements
+*
+* @par Description:
+* Gets the decoder buffer requirements. If called before header decode,
+* buffer requirements are based on max_wd and max_ht; otherwise the actual
+* width and height are used
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_buf_info(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ codec_t *ps_codec;
+ UWORD32 i = 0;
+ WORD32 wd, ht;
+ ivd_ctl_getbufinfo_op_t *ps_ctl_op =
+ (ivd_ctl_getbufinfo_op_t *)pv_api_op;
+
+ UNUSED(pv_api_ip);
+ ps_ctl_op->u4_error_code = 0;
+
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+ ps_ctl_op->u4_min_num_in_bufs = MIN_IN_BUFS;
+ if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420;
+ else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_422ILE;
+ else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGB565;
+ else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_RGBA8888;
+ else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+ || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+ ps_ctl_op->u4_min_num_out_bufs = MIN_OUT_BUFS_420SP;
+
+ ps_ctl_op->u4_num_disp_bufs = 1;
+
+ for(i = 0; i < ps_ctl_op->u4_min_num_in_bufs; i++)
+ {
+ ps_ctl_op->u4_min_in_buf_size[i] = MAX((ps_codec->i4_wd * ps_codec->i4_ht), MIN_BITSBUF_SIZE);
+ }
+
+ wd = ps_codec->i4_max_wd;
+ ht = ps_codec->i4_max_ht;
+
+ if(ps_codec->i4_sps_done)
+ {
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ wd = ps_codec->i4_disp_wd;
+ ht = ps_codec->i4_disp_ht;
+
+ }
+ else
+ {
+ wd = ps_codec->i4_disp_strd;
+ ht = ps_codec->i4_ht + PAD_HT;
+ }
+ }
+ else
+ {
+ if(1 == ps_codec->i4_share_disp_buf)
+ {
+ wd = ALIGN32(wd + PAD_WD);
+ ht += PAD_HT;
+ }
+ }
+
+ if(ps_codec->i4_disp_strd > wd)
+ wd = ps_codec->i4_disp_strd;
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ ps_ctl_op->u4_num_disp_bufs = 1;
+ else
+ {
+ WORD32 pic_size;
+ WORD32 max_dpb_size;
+
+ if(ps_codec->i4_sps_done)
+ {
+ sps_t *ps_sps = (ps_codec->s_parse.ps_sps_base + ps_codec->i4_sps_id);
+ WORD32 reorder_pic_cnt;
+ WORD32 ref_pic_cnt;
+ WORD32 level;
+
+ reorder_pic_cnt = MIN(ps_sps->ai1_sps_max_num_reorder_pics[0], ps_codec->i4_init_num_reorder);
+ pic_size = ps_sps->i2_pic_width_in_luma_samples * ps_sps->i2_pic_height_in_luma_samples;
+
+ level = ps_codec->i4_init_level;
+ max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+ ref_pic_cnt = max_dpb_size;
+ ps_ctl_op->u4_num_disp_bufs = reorder_pic_cnt;
+
+ ps_ctl_op->u4_num_disp_bufs += ref_pic_cnt + 1;
+
+ }
+ else
+ {
+ pic_size = ps_codec->i4_max_wd * ps_codec->i4_max_ht;
+ max_dpb_size = ihevcd_get_dpb_size(ps_codec->i4_init_level, pic_size);
+ ps_ctl_op->u4_num_disp_bufs = 2 * max_dpb_size;
+
+ ps_ctl_op->u4_num_disp_bufs = MIN(ps_ctl_op->u4_num_disp_bufs,
+ (ps_codec->i4_init_num_ref + ps_codec->i4_init_num_reorder + 1));
+
+ }
+
+ ps_ctl_op->u4_num_disp_bufs = MIN(
+ ps_ctl_op->u4_num_disp_bufs, 32);
+
+ }
+
+ if(ps_codec->e_chroma_fmt == IV_YUV_420P)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+ ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 2;
+ ps_ctl_op->u4_min_out_buf_size[2] = (wd * ht) >> 2;
+ }
+ else if(ps_codec->e_chroma_fmt == IV_YUV_422ILE)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+ ps_ctl_op->u4_min_out_buf_size[1] =
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ else if(ps_codec->e_chroma_fmt == IV_RGB_565)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 2;
+ ps_ctl_op->u4_min_out_buf_size[1] =
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ else if(ps_codec->e_chroma_fmt == IV_RGBA_8888)
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht) * 4;
+ ps_ctl_op->u4_min_out_buf_size[1] =
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ else if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+ || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+ {
+ ps_ctl_op->u4_min_out_buf_size[0] = (wd * ht);
+ ps_ctl_op->u4_min_out_buf_size[1] = (wd * ht) >> 1;
+ ps_ctl_op->u4_min_out_buf_size[2] = 0;
+ }
+ ps_codec->i4_num_disp_bufs = ps_ctl_op->u4_num_disp_bufs;
+
+ return IV_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets dynamic parameters
+*
+* @par Description:
+* Sets dynamic parameters. Note that frame skip and decode header mode are
+* dynamic; a dynamic change in stride is not supported
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_set_params(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ codec_t *ps_codec;
+ WORD32 ret = IV_SUCCESS;
+ WORD32 strd;
+ ivd_ctl_set_config_ip_t *s_ctl_dynparams_ip =
+ (ivd_ctl_set_config_ip_t *)pv_api_ip;
+ ivd_ctl_set_config_op_t *s_ctl_dynparams_op =
+ (ivd_ctl_set_config_op_t *)pv_api_op;
+
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+ s_ctl_dynparams_op->u4_error_code = 0;
+
+ ps_codec->e_pic_skip_mode = s_ctl_dynparams_ip->e_frm_skip_mode;
+
+ if(s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_NONE)
+ {
+
+ if((s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_P) &&
+ (s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_B) &&
+ (s_ctl_dynparams_ip->e_frm_skip_mode != IVD_SKIP_PB))
+ {
+ s_ctl_dynparams_op->u4_error_code = (1 << IVD_UNSUPPORTEDPARAM);
+ ret = IV_FAIL;
+ }
+ }
+
+ strd = ps_codec->i4_disp_strd;
+ if(1 == ps_codec->i4_share_disp_buf)
+ {
+ strd = ps_codec->i4_strd;
+ }
+
+
+ if((-1 != (WORD32)s_ctl_dynparams_ip->u4_disp_wd) &&
+ (0 != s_ctl_dynparams_ip->u4_disp_wd) &&
+ (0 != strd) &&
+ ((WORD32)s_ctl_dynparams_ip->u4_disp_wd < strd))
+ {
+ s_ctl_dynparams_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
+ s_ctl_dynparams_op->u4_error_code |= IHEVCD_INVALID_DISP_STRD;
+ ret = IV_FAIL;
+ }
+ else
+ {
+ if((WORD32)s_ctl_dynparams_ip->u4_disp_wd >= ps_codec->i4_wd)
+ {
+ strd = s_ctl_dynparams_ip->u4_disp_wd;
+ }
+ else if(0 == ps_codec->i4_sps_done ||
+ 0 == ps_codec->i4_pps_done)
+ {
+ strd = s_ctl_dynparams_ip->u4_disp_wd;
+ }
+ else if(s_ctl_dynparams_ip->u4_disp_wd == 0)
+ {
+ strd = ps_codec->i4_disp_strd;
+ }
+ else
+ {
+ strd = 0;
+ s_ctl_dynparams_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
+ s_ctl_dynparams_op->u4_error_code |= IHEVCD_INVALID_DISP_STRD;
+ ret = IV_FAIL;
+ }
+ }
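+ /* Example (illustrative): with i4_wd = 1280 a request of u4_disp_wd = 1920
+ * is accepted (strd becomes 1920); once headers are decoded with
+ * i4_wd = 1920, a smaller non-zero request such as 1280 is rejected with
+ * IHEVCD_INVALID_DISP_STRD. */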
+
+ ps_codec->i4_disp_strd = strd;
+ if(1 == ps_codec->i4_share_disp_buf)
+ {
+ ps_codec->i4_strd = strd;
+ }
+
+ if(s_ctl_dynparams_ip->e_vid_dec_mode == IVD_DECODE_FRAME)
+ ps_codec->i4_header_mode = 0;
+ else if(s_ctl_dynparams_ip->e_vid_dec_mode == IVD_DECODE_HEADER)
+ ps_codec->i4_header_mode = 1;
+ else
+ {
+
+ s_ctl_dynparams_op->u4_error_code = (1 << IVD_UNSUPPORTEDPARAM);
+ ps_codec->i4_header_mode = 1;
+ ret = IV_FAIL;
+ }
+
+
+ return ret;
+
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the decoder state
+*
+* @par Description:
+* Resets the decoder state by calling ihevcd_init()
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_reset(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
+{
+ codec_t *ps_codec;
+ ivd_ctl_reset_op_t *s_ctl_reset_op = (ivd_ctl_reset_op_t *)pv_api_op;
+ UNUSED(pv_api_ip);
+ ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+
+ if(ps_codec != NULL)
+ {
+ DEBUG("\nReset called \n");
+ ihevcd_init(ps_codec);
+ }
+ else
+ {
+ DEBUG("\nReset called without Initializing the decoder\n");
+ s_ctl_reset_op->u4_error_code = IHEVCD_INIT_NOT_DONE;
+ }
+
+ return IV_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Releases a display buffer from the application to the codec, signalling
+* that the codec may write to it if required. Until release is called,
+* the codec cannot write to this buffer
+*
+* @par Description:
+* Marks the buffer as display done
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_rel_display_frame(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+
+ ivd_rel_display_frame_ip_t *ps_dec_rel_disp_ip;
+ ivd_rel_display_frame_op_t *ps_dec_rel_disp_op;
+
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+ ps_dec_rel_disp_ip = (ivd_rel_display_frame_ip_t *)pv_api_ip;
+ ps_dec_rel_disp_op = (ivd_rel_display_frame_op_t *)pv_api_op;
+
+ UNUSED(ps_dec_rel_disp_op);
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ return IV_SUCCESS;
+ }
+
+ ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_dec_rel_disp_ip->u4_disp_buf_id, BUF_MGR_DISP);
+
+ return IV_SUCCESS;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Sets degrade params
+*
+* @par Description:
+* Sets degrade params.
+* Refer to ihevcd_cxa_ctl_degrade_ip_t definition for details
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_set_degrade(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ ihevcd_cxa_ctl_degrade_ip_t *ps_ip;
+ ihevcd_cxa_ctl_degrade_op_t *ps_op;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+ ps_ip = (ihevcd_cxa_ctl_degrade_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_degrade_op_t *)pv_api_op;
+
+ ps_codec->i4_degrade_type = ps_ip->i4_degrade_type;
+ ps_codec->i4_nondegrade_interval = ps_ip->i4_nondegrade_interval;
+ ps_codec->i4_degrade_pics = ps_ip->i4_degrade_pics;
+
+ ps_op->u4_error_code = 0;
+ ps_codec->i4_degrade_pic_cnt = 0;
+
+ return IV_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets frame dimensions/offsets
+*
+* @par Description:
+* Gets frame buffer characteristics such as x & y offsets, display
+* dimensions and buffer dimensions
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_get_frame_dimensions(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ ihevcd_cxa_ctl_get_frame_dimensions_ip_t *ps_ip;
+ ihevcd_cxa_ctl_get_frame_dimensions_op_t *ps_op;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+ WORD32 disp_wd, disp_ht, buffer_wd, buffer_ht, x_offset, y_offset;
+ ps_ip = (ihevcd_cxa_ctl_get_frame_dimensions_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_get_frame_dimensions_op_t *)pv_api_op;
+ UNUSED(ps_ip);
+ if(ps_codec->i4_sps_done)
+ {
+ disp_wd = ps_codec->i4_disp_wd;
+ disp_ht = ps_codec->i4_disp_ht;
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ buffer_wd = disp_wd;
+ buffer_ht = disp_ht;
+ }
+ else
+ {
+ buffer_wd = ps_codec->i4_strd;
+ buffer_ht = ps_codec->i4_ht + PAD_HT;
+ }
+ }
+ else
+ {
+
+ disp_wd = ps_codec->i4_max_wd;
+ disp_ht = ps_codec->i4_max_ht;
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ buffer_wd = disp_wd;
+ buffer_ht = disp_ht;
+ }
+ else
+ {
+ buffer_wd = ALIGN16(disp_wd) + PAD_WD;
+ buffer_ht = ALIGN16(disp_ht) + PAD_HT;
+
+ }
+ }
+ if(ps_codec->i4_strd > buffer_wd)
+ buffer_wd = ps_codec->i4_strd;
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ x_offset = 0;
+ y_offset = 0;
+ }
+ else
+ {
+ y_offset = PAD_TOP;
+ x_offset = PAD_LEFT;
+#if 0
+ if((NULL != ps_codec->ps_seqParams) && (1 == (ps_codec->ps_seqParams->u1_is_valid)) && (0 != ps_codec->u2_crop_offset_y))
+ {
+ y_offset += ps_codec->u2_crop_offset_y / ps_codec->i4_strd;
+ x_offset += ps_codec->u2_crop_offset_y % ps_codec->i4_strd;
+ }
+#endif
+ }
+
+ ps_op->u4_disp_wd[0] = disp_wd;
+ ps_op->u4_disp_ht[0] = disp_ht;
+ ps_op->u4_buffer_wd[0] = buffer_wd;
+ ps_op->u4_buffer_ht[0] = buffer_ht;
+ ps_op->u4_x_offset[0] = x_offset;
+ ps_op->u4_y_offset[0] = y_offset;
+
+ ps_op->u4_disp_wd[1] = ps_op->u4_disp_wd[2] = ((ps_op->u4_disp_wd[0] + 1) >> 1);
+ ps_op->u4_disp_ht[1] = ps_op->u4_disp_ht[2] = ((ps_op->u4_disp_ht[0] + 1) >> 1);
+ ps_op->u4_buffer_wd[1] = ps_op->u4_buffer_wd[2] = (ps_op->u4_buffer_wd[0] >> 1);
+ ps_op->u4_buffer_ht[1] = ps_op->u4_buffer_ht[2] = (ps_op->u4_buffer_ht[0] >> 1);
+ ps_op->u4_x_offset[1] = ps_op->u4_x_offset[2] = (ps_op->u4_x_offset[0] >> 1);
+ ps_op->u4_y_offset[1] = ps_op->u4_y_offset[2] = (ps_op->u4_y_offset[0] >> 1);
+
+ if((ps_codec->e_chroma_fmt == IV_YUV_420SP_UV)
+ || (ps_codec->e_chroma_fmt == IV_YUV_420SP_VU))
+ {
+ ps_op->u4_disp_wd[2] = 0;
+ ps_op->u4_disp_ht[2] = 0;
+ ps_op->u4_buffer_wd[2] = 0;
+ ps_op->u4_buffer_ht[2] = 0;
+ ps_op->u4_x_offset[2] = 0;
+ ps_op->u4_y_offset[2] = 0;
+
+ ps_op->u4_disp_wd[1] <<= 1;
+ ps_op->u4_buffer_wd[1] <<= 1;
+ ps_op->u4_x_offset[1] <<= 1;
+ }
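+ /* e.g. (illustrative) a 1920x1080 luma plane yields 960x540 chroma
+ * dimensions; for the semi-planar formats handled above, plane 1 carries
+ * interleaved CbCr, so its width and x offset are doubled back to luma
+ * width and plane 2 is unused. */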
+
+ return IV_SUCCESS;
+
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets vui parameters
+*
+* @par Description:
+* Gets VUI parameters
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_vui_params(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ ihevcd_cxa_ctl_get_vui_params_ip_t *ps_ip;
+ ihevcd_cxa_ctl_get_vui_params_op_t *ps_op;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+ sps_t *ps_sps;
+ vui_t *ps_vui;
+ WORD32 i;
+
+ ps_ip = (ihevcd_cxa_ctl_get_vui_params_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_get_vui_params_op_t *)pv_api_op;
+
+ if(0 == ps_codec->i4_sps_done)
+ {
+ ps_op->u4_error_code = IHEVCD_VUI_PARAMS_NOT_FOUND;
+ return IV_FAIL;
+ }
+
+ ps_sps = ps_codec->s_parse.ps_sps;
+ if(0 == ps_sps->i1_sps_valid || 0 == ps_sps->i1_vui_parameters_present_flag)
+ {
+ WORD32 sps_idx = 0;
+ ps_sps = ps_codec->ps_sps_base;
+
+ while((0 == ps_sps->i1_sps_valid) || (0 == ps_sps->i1_vui_parameters_present_flag))
+ {
+ sps_idx++;
+ ps_sps++;
+
+ if(sps_idx == MAX_SPS_CNT - 1)
+ {
+ ps_op->u4_error_code = IHEVCD_VUI_PARAMS_NOT_FOUND;
+ return IV_FAIL;
+ }
+ }
+ }
+
+ ps_vui = &ps_sps->s_vui_parameters;
+ UNUSED(ps_ip);
+
+ ps_op->u1_aspect_ratio_info_present_flag = ps_vui->u1_aspect_ratio_info_present_flag;
+ ps_op->u1_aspect_ratio_idc = ps_vui->u1_aspect_ratio_idc;
+ ps_op->u2_sar_width = ps_vui->u2_sar_width;
+ ps_op->u2_sar_height = ps_vui->u2_sar_height;
+ ps_op->u1_overscan_info_present_flag = ps_vui->u1_overscan_info_present_flag;
+ ps_op->u1_overscan_appropriate_flag = ps_vui->u1_overscan_appropriate_flag;
+ ps_op->u1_video_signal_type_present_flag = ps_vui->u1_video_signal_type_present_flag;
+ ps_op->u1_video_format = ps_vui->u1_video_format;
+ ps_op->u1_video_full_range_flag = ps_vui->u1_video_full_range_flag;
+ ps_op->u1_colour_description_present_flag = ps_vui->u1_colour_description_present_flag;
+ ps_op->u1_colour_primaries = ps_vui->u1_colour_primaries;
+ ps_op->u1_transfer_characteristics = ps_vui->u1_transfer_characteristics;
+ ps_op->u1_matrix_coefficients = ps_vui->u1_matrix_coefficients;
+ ps_op->u1_chroma_loc_info_present_flag = ps_vui->u1_chroma_loc_info_present_flag;
+ ps_op->u1_chroma_sample_loc_type_top_field = ps_vui->u1_chroma_sample_loc_type_top_field;
+ ps_op->u1_chroma_sample_loc_type_bottom_field = ps_vui->u1_chroma_sample_loc_type_bottom_field;
+ ps_op->u1_neutral_chroma_indication_flag = ps_vui->u1_neutral_chroma_indication_flag;
+ ps_op->u1_field_seq_flag = ps_vui->u1_field_seq_flag;
+ ps_op->u1_frame_field_info_present_flag = ps_vui->u1_frame_field_info_present_flag;
+ ps_op->u1_default_display_window_flag = ps_vui->u1_default_display_window_flag;
+ ps_op->u4_def_disp_win_left_offset = ps_vui->u4_def_disp_win_left_offset;
+ ps_op->u4_def_disp_win_right_offset = ps_vui->u4_def_disp_win_right_offset;
+ ps_op->u4_def_disp_win_top_offset = ps_vui->u4_def_disp_win_top_offset;
+ ps_op->u4_def_disp_win_bottom_offset = ps_vui->u4_def_disp_win_bottom_offset;
+ ps_op->u1_vui_hrd_parameters_present_flag = ps_vui->u1_vui_hrd_parameters_present_flag;
+ ps_op->u1_vui_timing_info_present_flag = ps_vui->u1_vui_timing_info_present_flag;
+ ps_op->u4_vui_num_units_in_tick = ps_vui->u4_vui_num_units_in_tick;
+ ps_op->u4_vui_time_scale = ps_vui->u4_vui_time_scale;
+ ps_op->u1_poc_proportional_to_timing_flag = ps_vui->u1_poc_proportional_to_timing_flag;
+ ps_op->u1_num_ticks_poc_diff_one_minus1 = ps_vui->u1_num_ticks_poc_diff_one_minus1;
+ ps_op->u1_bitstream_restriction_flag = ps_vui->u1_bitstream_restriction_flag;
+ ps_op->u1_tiles_fixed_structure_flag = ps_vui->u1_tiles_fixed_structure_flag;
+ ps_op->u1_motion_vectors_over_pic_boundaries_flag = ps_vui->u1_motion_vectors_over_pic_boundaries_flag;
+ ps_op->u1_restricted_ref_pic_lists_flag = ps_vui->u1_restricted_ref_pic_lists_flag;
+ ps_op->u4_min_spatial_segmentation_idc = ps_vui->u4_min_spatial_segmentation_idc;
+ ps_op->u1_max_bytes_per_pic_denom = ps_vui->u1_max_bytes_per_pic_denom;
+ ps_op->u1_max_bits_per_mincu_denom = ps_vui->u1_max_bits_per_mincu_denom;
+ ps_op->u1_log2_max_mv_length_horizontal = ps_vui->u1_log2_max_mv_length_horizontal;
+ ps_op->u1_log2_max_mv_length_vertical = ps_vui->u1_log2_max_mv_length_vertical;
+
+
+ /* HRD parameters */
+ ps_op->u1_timing_info_present_flag = ps_vui->s_vui_hrd_parameters.u1_timing_info_present_flag;
+ ps_op->u4_num_units_in_tick = ps_vui->s_vui_hrd_parameters.u4_num_units_in_tick;
+ ps_op->u4_time_scale = ps_vui->s_vui_hrd_parameters.u4_time_scale;
+ ps_op->u1_nal_hrd_parameters_present_flag = ps_vui->s_vui_hrd_parameters.u1_nal_hrd_parameters_present_flag;
+ ps_op->u1_vcl_hrd_parameters_present_flag = ps_vui->s_vui_hrd_parameters.u1_vcl_hrd_parameters_present_flag;
+ ps_op->u1_cpbdpb_delays_present_flag = ps_vui->s_vui_hrd_parameters.u1_cpbdpb_delays_present_flag;
+ ps_op->u1_sub_pic_cpb_params_present_flag = ps_vui->s_vui_hrd_parameters.u1_sub_pic_cpb_params_present_flag;
+ ps_op->u1_tick_divisor_minus2 = ps_vui->s_vui_hrd_parameters.u1_tick_divisor_minus2;
+ ps_op->u1_du_cpb_removal_delay_increment_length_minus1 = ps_vui->s_vui_hrd_parameters.u1_du_cpb_removal_delay_increment_length_minus1;
+ ps_op->u1_sub_pic_cpb_params_in_pic_timing_sei_flag = ps_vui->s_vui_hrd_parameters.u1_sub_pic_cpb_params_in_pic_timing_sei_flag;
+ ps_op->u1_dpb_output_delay_du_length_minus1 = ps_vui->s_vui_hrd_parameters.u1_dpb_output_delay_du_length_minus1;
+ ps_op->u4_bit_rate_scale = ps_vui->s_vui_hrd_parameters.u4_bit_rate_scale;
+ ps_op->u4_cpb_size_scale = ps_vui->s_vui_hrd_parameters.u4_cpb_size_scale;
+ ps_op->u4_cpb_size_du_scale = ps_vui->s_vui_hrd_parameters.u4_cpb_size_du_scale;
+ ps_op->u1_initial_cpb_removal_delay_length_minus1 = ps_vui->s_vui_hrd_parameters.u1_initial_cpb_removal_delay_length_minus1;
+ ps_op->u1_au_cpb_removal_delay_length_minus1 = ps_vui->s_vui_hrd_parameters.u1_au_cpb_removal_delay_length_minus1;
+ ps_op->u1_dpb_output_delay_length_minus1 = ps_vui->s_vui_hrd_parameters.u1_dpb_output_delay_length_minus1;
+
+ for(i = 0; i < 6; i++)
+ {
+ ps_op->au1_fixed_pic_rate_general_flag[i] = ps_vui->s_vui_hrd_parameters.au1_fixed_pic_rate_general_flag[i];
+ ps_op->au1_fixed_pic_rate_within_cvs_flag[i] = ps_vui->s_vui_hrd_parameters.au1_fixed_pic_rate_within_cvs_flag[i];
+ ps_op->au1_elemental_duration_in_tc_minus1[i] = ps_vui->s_vui_hrd_parameters.au1_elemental_duration_in_tc_minus1[i];
+ ps_op->au1_low_delay_hrd_flag[i] = ps_vui->s_vui_hrd_parameters.au1_low_delay_hrd_flag[i];
+ ps_op->au1_cpb_cnt_minus1[i] = ps_vui->s_vui_hrd_parameters.au1_cpb_cnt_minus1[i];
+ }
+
+
+ return IV_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets Processor type
+*
+* @par Description:
+* Sets Processor type
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_set_processor(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ ihevcd_cxa_ctl_set_processor_ip_t *ps_ip;
+ ihevcd_cxa_ctl_set_processor_op_t *ps_op;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+ ps_ip = (ihevcd_cxa_ctl_set_processor_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_set_processor_op_t *)pv_api_op;
+
+ ps_codec->e_processor_arch = (IVD_ARCH_T)ps_ip->u4_arch;
+ ps_codec->e_processor_soc = (IVD_SOC_T)ps_ip->u4_soc;
+
+ ihevcd_init_function_ptr(ps_codec);
+
+ ihevcd_update_function_ptr(ps_codec);
+
+ if(ps_codec->e_processor_soc && (ps_codec->e_processor_soc <= SOC_HISI_37X))
+ {
+ /* 8th bit indicates if format conversion is to be done ahead */
+ if(ps_codec->e_processor_soc & 0x80)
+ ps_codec->u4_enable_fmt_conv_ahead = 1;
+
+ /* Lower 7 bits indicate NCTB - if non-zero */
+ ps_codec->e_processor_soc &= 0x7F;
+
+ if(ps_codec->e_processor_soc)
+ ps_codec->u4_nctb = ps_codec->e_processor_soc;
+
+
+ }
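+ /* Example (illustrative): u4_soc = 0x84 would set
+ * u4_enable_fmt_conv_ahead = 1 (bit 7) and u4_nctb = 4 (low 7 bits),
+ * provided 0x84 passes the SOC_HISI_37X range check above. */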
+
+ if((ps_codec->e_processor_soc == SOC_HISI_37X) && (ps_codec->i4_num_cores == 2))
+ {
+ ps_codec->u4_nctb = 2;
+ }
+
+
+ ps_op->u4_error_code = 0;
+ return IV_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets the number of cores that the codec can use. The codec uses this
+* many threads for decoding
+*
+* @par Description:
+* Sets number of cores
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_set_num_cores(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ ihevcd_cxa_ctl_set_num_cores_ip_t *ps_ip;
+ ihevcd_cxa_ctl_set_num_cores_op_t *ps_op;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+ ps_ip = (ihevcd_cxa_ctl_set_num_cores_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_set_num_cores_op_t *)pv_api_op;
+
+#ifdef MULTICORE
+ ps_codec->i4_num_cores = ps_ip->u4_num_cores;
+#else
+ ps_codec->i4_num_cores = 1;
+#endif
+ ps_op->u4_error_code = 0;
+ return IV_SUCCESS;
+}
+#ifdef GPU_BUILD
+/**
+*******************************************************************************
+*
+* @brief
+* Enables or disables the GPU at run time
+*
+* @par Description:
+*
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_gpu_enable_disable(iv_obj_t *ps_codec_obj,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ ihevcd_cxa_ctl_gpu_enable_diable_ip_t *ps_ip;
+ ihevcd_cxa_ctl_gpu_enable_diable_op_t *ps_op;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+ ps_ip = (ihevcd_cxa_ctl_gpu_enable_diable_ip_t *)pv_api_ip;
+ ps_op = (ihevcd_cxa_ctl_gpu_enable_diable_op_t *)pv_api_op;
+
+#ifndef FRAME_STAGGER_ONLY
+ ps_codec->u4_gpu_enabled = ps_ip->u4_gpu_enable_diable;
+#endif
+ ps_op->u4_error_code = 0;
+ return IV_SUCCESS;
+}
+#endif
+/**
+*******************************************************************************
+*
+* @brief
+* Codec control call
+*
+* @par Description:
+* Codec control call which in turn calls appropriate calls based on
+* subcommand
+*
+* @param[in] ps_codec_obj
+* Pointer to codec object at API level
+*
+* @param[in] pv_api_ip
+* Pointer to input argument structure
+*
+* @param[out] pv_api_op
+* Pointer to output argument structure
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_ctl(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
+{
+ ivd_ctl_set_config_ip_t *ps_ctl_ip;
+ ivd_ctl_set_config_op_t *ps_ctl_op;
+ WORD32 ret = 0;
+ WORD32 subcommand;
+ codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle;
+
+ ps_ctl_ip = (ivd_ctl_set_config_ip_t *)pv_api_ip;
+ ps_ctl_op = (ivd_ctl_set_config_op_t *)pv_api_op;
+
+ if(ps_codec->i4_init_done != 1)
+ {
+ ps_ctl_op->u4_error_code |= 1 << IVD_FATALERROR;
+ ps_ctl_op->u4_error_code |= IHEVCD_INIT_NOT_DONE;
+ return IV_FAIL;
+ }
+ subcommand = ps_ctl_ip->e_sub_cmd;
+
+ switch(subcommand)
+ {
+ case IVD_CMD_CTL_GETPARAMS:
+ ret = ihevcd_get_status(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IVD_CMD_CTL_SETPARAMS:
+ ret = ihevcd_set_params(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IVD_CMD_CTL_RESET:
+ ret = ihevcd_reset(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IVD_CMD_CTL_SETDEFAULT:
+ {
+ ivd_ctl_set_config_op_t *s_ctl_dynparams_op =
+ (ivd_ctl_set_config_op_t *)pv_api_op;
+
+ ret = ihevcd_set_default_params(ps_codec);
+ if(IV_SUCCESS == ret)
+ s_ctl_dynparams_op->u4_error_code = 0;
+ break;
+ }
+ case IVD_CMD_CTL_FLUSH:
+ ret = ihevcd_set_flush_mode(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IVD_CMD_CTL_GETBUFINFO:
+ ret = ihevcd_get_buf_info(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IVD_CMD_CTL_GETVERSION:
+ {
+ ivd_ctl_getversioninfo_ip_t *ps_ip;
+ ivd_ctl_getversioninfo_op_t *ps_op;
+ ps_ip = (ivd_ctl_getversioninfo_ip_t *)pv_api_ip;
+ ps_op = (ivd_ctl_getversioninfo_op_t *)pv_api_op;
+
+ ps_op->u4_error_code = IV_SUCCESS;
+
+ if((WORD32)ps_ip->u4_version_buffer_size <= 0)
+ {
+ ps_op->u4_error_code = IHEVCD_CXA_VERS_BUF_INSUFFICIENT;
+ ret = IV_FAIL;
+ }
+ else
+ {
+ ret = ihevcd_get_version((CHAR *)ps_ip->pv_version_buffer,
+ ps_ip->u4_version_buffer_size);
+ if(ret != IV_SUCCESS)
+ {
+ ps_op->u4_error_code = IHEVCD_CXA_VERS_BUF_INSUFFICIENT;
+ ret = IV_FAIL;
+ }
+ }
+ }
+ break;
+ case IHEVCD_CXA_CMD_CTL_DEGRADE:
+ ret = ihevcd_set_degrade(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IHEVCD_CXA_CMD_CTL_SET_NUM_CORES:
+ ret = ihevcd_set_num_cores(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS:
+ ret = ihevcd_get_frame_dimensions(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS:
+ ret = ihevcd_get_vui_params(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+ case IHEVCD_CXA_CMD_CTL_SET_PROCESSOR:
+ ret = ihevcd_set_processor(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+#ifdef GPU_BUILD
+ case IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE:
+ ret = ihevcd_gpu_enable_disable(ps_codec_obj, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+#endif
+ default:
+ DEBUG("\nDo nothing\n");
+ break;
+ }
+
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Codecs entry point function. All the function calls to the codec are
+* done using this function with different values specified in command
+*
+* @par Description:
+* Arguments are tested for validity, and based on the command the
+* appropriate function is called
+*
+* @param[in] ps_handle
+* API level handle for codec
+*
+* @param[in] pv_api_ip
+* Input argument structure
+*
+* @param[out] pv_api_op
+* Output argument structure
+*
+* @returns Status of the function corresponding to command
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T ihevcd_cxa_api_function(iv_obj_t *ps_handle,
+ void *pv_api_ip,
+ void *pv_api_op)
+{
+ WORD32 command;
+ UWORD32 *pu4_ptr_cmd;
+ WORD32 ret = 0;
+ IV_API_CALL_STATUS_T e_status;
+ e_status = api_check_struct_sanity(ps_handle, pv_api_ip, pv_api_op);
+
+ if(e_status != IV_SUCCESS)
+ {
+ DEBUG("error code = %d\n", *((UWORD32 *)pv_api_op + 1));
+ return IV_FAIL;
+ }
+
+ pu4_ptr_cmd = (UWORD32 *)pv_api_ip;
+ pu4_ptr_cmd++;
+
+ command = *pu4_ptr_cmd;
+
+ switch(command)
+ {
+ case IV_CMD_GET_NUM_MEM_REC:
+ ret = ihevcd_get_num_rec((void *)pv_api_ip, (void *)pv_api_op);
+
+ break;
+ case IV_CMD_FILL_NUM_MEM_REC:
+
+ ret = ihevcd_fill_num_mem_rec((void *)pv_api_ip, (void *)pv_api_op);
+ break;
+ case IV_CMD_INIT:
+ ret = ihevcd_init_mem_rec(ps_handle, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+
+ case IVD_CMD_VIDEO_DECODE:
+ ret = ihevcd_decode(ps_handle, (void *)pv_api_ip, (void *)pv_api_op);
+ break;
+
+ case IVD_CMD_GET_DISPLAY_FRAME:
+ //ret = ihevcd_get_display_frame(ps_handle,(void *)pv_api_ip,(void *)pv_api_op);
+ break;
+
+ case IVD_CMD_SET_DISPLAY_FRAME:
+ ret = ihevcd_set_display_frame(ps_handle, (void *)pv_api_ip,
+ (void *)pv_api_op);
+
+ break;
+
+ case IVD_CMD_REL_DISPLAY_FRAME:
+ ret = ihevcd_rel_display_frame(ps_handle, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+
+ case IV_CMD_RETRIEVE_MEMREC:
+ ret = ihevcd_retrieve_memrec(ps_handle, (void *)pv_api_ip,
+ (void *)pv_api_op);
+ break;
+
+ case IVD_CMD_VIDEO_CTL:
+ ret = ihevcd_ctl(ps_handle, (void *)pv_api_ip, (void *)pv_api_op);
+ break;
+ default:
+ ret = IV_FAIL;
+ break;
+ }
+
+ return (IV_API_CALL_STATUS_T)ret;
+}
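+
+/* Note on the command fetch above: every ivd input structure begins with
+ * u4_size followed by e_cmd, so skipping one UWORD32 from the start of
+ * pv_api_ip lands on the command word regardless of the concrete type.
+ * A sketch of the shared prefix (illustrative; see ivd.h for real types):
+ */
+#if 0 /* illustrative only */
+typedef struct
+{
+    UWORD32 u4_size;              /* size of the concrete input structure */
+    IVD_API_COMMAND_TYPE_T e_cmd; /* command word read by the dispatcher  */
+    /* command-specific payload follows */
+} ivd_generic_ip_t;
+#endif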
+
diff --git a/decoder/ihevcd_bitstream.c b/decoder/ihevcd_bitstream.c
new file mode 100644
index 0000000..be9addb
--- /dev/null
+++ b/decoder/ihevcd_bitstream.c
@@ -0,0 +1,580 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_bitstream.c
+*
+* @brief
+* Contains functions for bitstream access
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* - ihevcd_bits_init()
+* - ihevcd_bits_flush()
+* - ihevcd_bits_flush_to_byte_boundary()
+* - ihevcd_bits_nxt()
+* - ihevcd_bits_nxt32()
+* - ihevcd_bits_get()
+* - ihevcd_bits_num_bits_remaining()
+* - ihevcd_bits_num_bits_consumed()
+* - ihevcd_sev()
+* - ihevcd_uev()
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Function used for bitstream structure initialization
+*
+* @par Description:
+* Initialize bitstream structure elements
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] pu1_buf
+* Pointer to bitstream data
+*
+* @param[in] u4_numbytes
+* Number of bytes in bitstream
+*
+* @returns none
+*
+* @remarks
+*  Assumes pu1_buf is aligned to 4 bytes. If it is not, all bitstream
+*  accesses will be unaligned and hence costlier. Since this is codec-owned
+*  memory that holds emulation-prevention-removed data, the 4 byte alignment
+*  assumption is valid
+*
+*******************************************************************************
+*/
+void ihevcd_bits_init(bitstrm_t *ps_bitstrm,
+ UWORD8 *pu1_buf,
+ UWORD32 u4_numbytes)
+{
+ UWORD32 u4_cur_word;
+ UWORD32 u4_nxt_word;
+ UWORD32 u4_temp;
+ UWORD32 *pu4_buf;
+
+ pu4_buf = (UWORD32 *)pu1_buf;
+ u4_temp = *pu4_buf++;
+ u4_cur_word = ITT_BIG_ENDIAN(u4_temp);
+ u4_temp = *pu4_buf++;
+ u4_nxt_word = ITT_BIG_ENDIAN(u4_temp);
+
+ ps_bitstrm->u4_bit_ofst = 0;
+ ps_bitstrm->pu1_buf_base = pu1_buf;
+ ps_bitstrm->pu4_buf = pu4_buf;
+ ps_bitstrm->u4_cur_word = u4_cur_word;
+ ps_bitstrm->u4_nxt_word = u4_nxt_word;
+
+ ps_bitstrm->pu1_buf_max = pu1_buf + u4_numbytes + 8;
+
+ return;
+}
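+
+/* Usage sketch (illustrative; pu1_rbsp_buf and u4_rbsp_size are placeholder
+ * names): after emulation prevention bytes have been removed,
+ *
+ *   bitstrm_t s_bitstrm;
+ *   ihevcd_bits_init(&s_bitstrm, pu1_rbsp_buf, u4_rbsp_size);
+ *
+ * leaves the first 64 bits preloaded in cur_word/nxt_word with
+ * u4_bit_ofst = 0, so the first read needs no refill.
+ */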
+
+/**
+*******************************************************************************
+*
+* @brief
+* Flushes given number of bits. Bits consumed increases by this number
+*
+* @par Description:
+* Increment bit offset by numbits. If bit offset increases beyond 32, then
+* move nxt_word to cur_word, read next word32 to nxt_word after endian
+* conversion
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+* Number of bits to be flushed
+*
+* @returns None
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+void ihevcd_bits_flush(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+
+ BITS_FLUSH(ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ u4_numbits);
+
+ return;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Flushes to the next byte boundary. Bits consumed increases by this number
+*
+* @par Description:
+*  Compute the number of bits remaining in the current byte and then call
+*  ihevcd_bits_flush() with this number
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @returns None
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+void ihevcd_bits_flush_to_byte_boundary(bitstrm_t *ps_bitstrm)
+{
+ UWORD32 u4_numbits;
+ u4_numbits = (ps_bitstrm->u4_bit_ofst) & 7;
+
+ u4_numbits = 8 - u4_numbits;
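+
+    /* e.g. u4_bit_ofst = 19: 19 & 7 = 3 bits into the byte, so 5 bits are
+     * flushed. Note that if the offset is already byte aligned this flushes
+     * a full byte, so callers are expected to invoke it only mid-byte */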
+
+ BITS_FLUSH(ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ u4_numbits);
+
+ return;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Seeks by given number of bits in the bitstream from current position
+*
+* @par Description:
+* Add given number of bits to bitstream offset and update pu4_buf, cur_word and
+* nxt_word accordingly
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] numbits
+* Number of bits to seek
+*
+* @returns None
+*
+* @remarks
+* Assumes emulation prevention has been done before and the buffer does not
+* contain any emulation prevention bytes
+*
+*******************************************************************************
+*/
+void ihevcd_bits_seek(bitstrm_t *ps_bitstrm, WORD32 numbits)
+{
+    UWORD32 val; /* unsigned so that the endian swap shifts are well defined */
+ ASSERT(numbits >= -32);
+ ASSERT(numbits <= 32);
+ /* Check if Seeking backwards*/
+ if(numbits < 0)
+ {
+ UWORD32 abs_numbits = -numbits;
+ if(ps_bitstrm->u4_bit_ofst >= abs_numbits)
+ {
+            /* If the current offset is greater than or equal to the number
+             * of bits to seek back, then subtract abs_numbits from the offset
+             * and return.
+             */
+ ps_bitstrm->u4_bit_ofst -= abs_numbits;
+ return;
+ }
+ else
+ {
+            /* If the current offset is less than the number of bits to seek
+             * back, then subtract abs_numbits from the offset, add 32, move
+             * cur_word to nxt_word, reload cur_word and decrement pu4_buf
+             */
+ ps_bitstrm->u4_bit_ofst -= abs_numbits;
+ ps_bitstrm->u4_bit_ofst += 32;
+ ps_bitstrm->pu4_buf--;
+
+ val = *(ps_bitstrm->pu4_buf - 2);
+ ps_bitstrm->u4_nxt_word = ps_bitstrm->u4_cur_word;
+ ps_bitstrm->u4_cur_word = ITT_BIG_ENDIAN(val);
+ return;
+ }
+ }
+ else
+ {
+        /* Forward seek is currently not tested; trap it in debug builds */
+        ASSERT(0);
+ BITS_FLUSH(ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ numbits);
+
+
+ }
+ return;
+}
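+
+/* Worked example (illustrative): with u4_bit_ofst = 3, a seek of -8 bits
+ * underflows the current word: the offset becomes 3 - 8 + 32 = 27, pu4_buf
+ * steps back one word, nxt_word takes the old cur_word, and cur_word is
+ * reloaded (endian swapped) from two words behind the updated pu4_buf.
+ */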
+/**
+*******************************************************************************
+*
+* @brief
+*  Snoops the next numbits bits from the bitstream. This neither updates the
+*  bitstream offset nor consumes the bits
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+*  these bits
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+* Number of bits
+*
+* @returns Next u4_numbits number of bits
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_nxt(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+ UWORD32 u4_bits_read;
+
+ BITS_NXT(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ u4_numbits);
+ return u4_bits_read;
+}
+/**
+*******************************************************************************
+*
+* @brief
+*  Snoops the next 32 bits from the bitstream. This neither updates the
+*  bitstream offset nor consumes the bits
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+*  these bits
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+* Number of bits
+*
+* @returns Next 32 bits
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_nxt32(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+ UWORD32 u4_bits_read;
+ UNUSED(u4_numbits);
+ BITS_NXT32(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word);
+ return u4_bits_read;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reads the next numbits bits from the bitstream. This updates the
+*  bitstream offset and consumes the bits
+*
+* @par Description:
+*  Extract the required number of bits from cur_word & nxt_word and return
+*  these bits
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] u4_numbits
+* Number of bits
+*
+* @returns Bits read
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_get(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits)
+{
+ UWORD32 u4_bits_read;
+
+ BITS_GET(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ u4_numbits);
+ return u4_bits_read;
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Returns the number of bits remaining in the bitstream
+*
+* @par Description:
+*  Compute the number of bits remaining from the current pointer, the buffer
+*  base and the current offset. Since 8 bytes are read into cur_word and
+*  nxt_word at the start without being consumed, 8 bytes are subtracted
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @returns Total number of bits remaining
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_num_bits_remaining(bitstrm_t *ps_bitstrm)
+{
+ UWORD32 u4_bits_consumed;
+ UWORD32 u4_size_in_bits;
+
+ /* 8 bytes are read in cur_word and nxt_word at the start. Hence */
+ /* subtract 8 bytes */
+ u4_bits_consumed = (UWORD32)(((UWORD8 *)ps_bitstrm->pu4_buf -
+ (UWORD8 *)ps_bitstrm->pu1_buf_base - 8) <<
+ 3) + ps_bitstrm->u4_bit_ofst;
+
+    /* pu1_buf_max - pu1_buf_base is in bytes; convert it to bits */
+    u4_size_in_bits = (UWORD32)(ps_bitstrm->pu1_buf_max -
+                                ps_bitstrm->pu1_buf_base) << 3;
+    return (u4_size_in_bits - u4_bits_consumed);
+}
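+
+/* Worked example (illustrative): if pu4_buf is 12 bytes past pu1_buf_base
+ * and u4_bit_ofst = 5, then bits consumed = (12 - 8) * 8 + 5 = 37. With
+ * u4_numbytes = 100 at init the buffer spans 108 bytes including the 8 byte
+ * preload margin, i.e. 864 bits, so 864 - 37 = 827 bits are reported.
+ */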
+
+/**
+*******************************************************************************
+*
+* @brief
+* Returns the number of bits consumed in the bitstream
+*
+* @par Description:
+*  Compute the number of bits consumed from the current pointer, the buffer
+*  base and the current offset. Since 8 bytes are read into cur_word and
+*  nxt_word at the start without being consumed, 8 bytes are subtracted
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @returns Total number of bits consumed
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_bits_num_bits_consumed(bitstrm_t *ps_bitstrm)
+{
+ UWORD32 u4_bits_consumed;
+ /* 8 bytes are read in cur_word and nxt_word at the start. Hence */
+ /* subtract 8 bytes */
+
+ u4_bits_consumed = (UWORD32)(((UWORD8 *)ps_bitstrm->pu4_buf -
+ (UWORD8 *)ps_bitstrm->pu1_buf_base - 8) <<
+ 3) + ps_bitstrm->u4_bit_ofst;
+ return u4_bits_consumed;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reads an unsigned integer 0-th order exp-golomb-coded syntax element from
+*  the bitstream. Section: 9.2
+*
+* @par Description:
+* Extract required number of bits from cur_word & nxt_word return these
+* bits
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @returns UEV decoded syntax element
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+UWORD32 ihevcd_uev(bitstrm_t *ps_bitstrm)
+{
+ UWORD32 u4_bits_read;
+ UWORD32 u4_clz;
+
+
+ /***************************************************************/
+ /* Find leading zeros in next 32 bits */
+ /***************************************************************/
+ BITS_NXT32(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word);
+
+
+ u4_clz = CLZ(u4_bits_read);
+
+ BITS_FLUSH(ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ (u4_clz + 1));
+
+ u4_bits_read = 0;
+ if(u4_clz)
+ {
+ BITS_GET(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ u4_clz);
+ }
+ return ((1 << u4_clz) + u4_bits_read - 1);
+
+}
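+
+/* Worked example (illustrative): for the bit pattern 0001010..., the 32 bit
+ * snoop has u4_clz = 3 leading zeros; 4 bits (the zeros plus the leading 1)
+ * are flushed, the next 3 bits "010" = 2 are read, and the decoded value is
+ * (1 << 3) + 2 - 1 = 9, matching the Exp-Golomb code of section 9.2.
+ */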
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Reads a signed integer 0-th order exp-golomb-coded syntax element from the
+*  bitstream. Similar to ihevcd_uev(). Section: 9.2.1
+*
+* @par Description:
+* Extract required number of bits from cur_word & nxt_word return these
+* bits
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @returns SEV decoded syntax element
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_sev(bitstrm_t *ps_bitstrm)
+{
+ UWORD32 u4_bits_read;
+ UWORD32 u4_clz;
+ UWORD32 u4_abs_val;
+
+
+ /***************************************************************/
+ /* Find leading zeros in next 32 bits */
+ /***************************************************************/
+ BITS_NXT32(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word);
+
+
+ u4_clz = CLZ(u4_bits_read);
+
+ BITS_FLUSH(ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ (u4_clz + 1));
+
+ u4_bits_read = 0;
+ if(u4_clz)
+ {
+ BITS_GET(u4_bits_read,
+ ps_bitstrm->pu4_buf,
+ ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word,
+ ps_bitstrm->u4_nxt_word,
+ u4_clz);
+ }
+ u4_abs_val = ((1 << u4_clz) + u4_bits_read) >> 1;
+ if(u4_bits_read & 0x1)
+ return (-(WORD32)u4_abs_val);
+ else
+ return (u4_abs_val);
+}
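+
+/* Worked example (illustrative): the pattern "010" gives u4_clz = 1 and a
+ * suffix bit of 0, so u4_abs_val = (2 + 0) >> 1 = 1 and the even suffix
+ * yields +1; "011" has suffix 1, giving (2 + 1) >> 1 = 1 with an odd
+ * suffix, i.e. -1. This matches the codeNum mapping of section 9.2.1.
+ */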
+
+
+
+
+
+
diff --git a/decoder/ihevcd_bitstream.h b/decoder/ihevcd_bitstream.h
new file mode 100644
index 0000000..907c934
--- /dev/null
+++ b/decoder/ihevcd_bitstream.h
@@ -0,0 +1,226 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_bitstream.h
+*
+* @brief
+* Header for bitstream access functions
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_BITSTREAM_H_
+#define _IHEVCD_BITSTREAM_H_
+/**
+ * @brief defines the number of bits in a bitstream word
+ */
+#define WORD_SIZE 32
+/**
+ * @brief Twice the WORD_SIZE
+ */
+#define DBL_WORD_SIZE (2 * (WORD_SIZE))
+
+/**
+ * @brief WORD_SIZE - 1
+ */
+#define WORD_SIZE_MINUS1 (WORD_SIZE - 1)
+
+/**
+******************************************************************************
+* @brief Macro used to copy elements of the bitstream structure to local variables.
+******************************************************************************
+*/
+
+#define GET_STREAM(m_ps_bitstrm, m_pu4_buf, m_u4_bit_ofst, \
+ m_u4_cur_word, m_u4_nxt_word) \
+{ \
+ m_pu4_buf = m_ps_bitstrm->pu4_buf; \
+ m_u4_bit_ofst = m_ps_bitstrm->u4_bit_ofst; \
+ m_u4_cur_word = m_ps_bitstrm->u4_cur_word; \
+ m_u4_nxt_word = m_ps_bitstrm->u4_nxt_word; \
+}
+
+/**
+******************************************************************************
+* @brief Macro used to copy local variables back to the bitstream structure.
+******************************************************************************
+*/
+#define SET_STREAM(m_ps_bitstrm, m_pu4_buf, m_u4_bit_ofst, \
+ m_u4_cur_word, m_u4_nxt_word) \
+{ \
+ m_ps_bitstrm->pu4_buf = m_pu4_buf; \
+ m_ps_bitstrm->u4_bit_ofst = m_u4_bit_ofst; \
+ m_ps_bitstrm->u4_cur_word = m_u4_cur_word; \
+ m_ps_bitstrm->u4_nxt_word = m_u4_nxt_word; \
+}
+
+
+
+/**
+******************************************************************************
+* @brief Snoop the next m_cnt bits without updating the offset or buffer
+* pointer. Data is not consumed in this call
+******************************************************************************
+*/
+#define BITS_NXT(m_u4_bits, m_pu4_buf, m_u4_bit_ofst, \
+ m_u4_cur_word, m_u4_nxt_word, m_cnt) \
+{ \
+ m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst) >> \
+ (WORD_SIZE - m_cnt); \
+ \
+ if(m_u4_bit_ofst > (WORD_SIZE - m_cnt)) \
+ { \
+ m_u4_bits |= SHR(m_u4_nxt_word, \
+ (WORD_SIZE + WORD_SIZE - m_cnt \
+ - m_u4_bit_ofst)); \
+ } \
+}
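+
+/* Worked example (illustrative): with m_u4_bit_ofst = 30 and m_cnt = 4,
+ * (m_u4_cur_word << 30) >> 28 places the last 2 bits of cur_word in the
+ * upper half of the result, and since 30 > 32 - 4 the top 2 bits of
+ * nxt_word (SHR by 32 + 32 - 4 - 30 = 30) fill the lower half.
+ */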
+
+
+/**
+******************************************************************************
+* @brief Snoop the next 32 bits without updating the offset or buffer
+* pointer. Data is not consumed in this call
+******************************************************************************
+*/
+#define BITS_NXT32(m_u4_bits, m_pu4_buf, m_u4_bit_ofst, \
+ m_u4_cur_word, m_u4_nxt_word) \
+{ \
+ m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst); \
+ \
+ m_u4_bits |= SHR(m_u4_nxt_word, (WORD_SIZE - m_u4_bit_ofst)); \
+}
+
+
+/**
+******************************************************************************
+* @brief Flush m_cnt bits and update the offset and buffer pointer.
+* Data is consumed
+******************************************************************************
+*/
+#define BITS_FLUSH(m_pu4_buf, m_u4_bit_ofst, m_u4_cur_word, \
+ m_u4_nxt_word, m_cnt) \
+{ \
+ UWORD32 temp; \
+ \
+ m_u4_bit_ofst += m_cnt; \
+ if( m_u4_bit_ofst >= WORD_SIZE ) \
+ { \
+ m_u4_cur_word = m_u4_nxt_word; \
+ /* Getting the next word */ \
+ temp = *(m_pu4_buf++); \
+ \
+ m_u4_bit_ofst -= WORD_SIZE; \
+        /* Convert the fetched word from little to big endian */ \
+ m_u4_nxt_word = ITT_BIG_ENDIAN(temp); \
+ } \
+}
+/**
+******************************************************************************
+* @brief Get m_cnt bits and update the buffer pointer and offset.
+* Data is consumed
+******************************************************************************
+*/
+#define BITS_GET(m_u4_bits, m_pu4_buf, m_u4_bit_ofst, \
+ m_u4_cur_word,m_u4_nxt_word, m_cnt) \
+{ \
+ m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst) \
+ >> (WORD_SIZE - m_cnt); \
+ m_u4_bit_ofst += m_cnt; \
+ if(m_u4_bit_ofst > WORD_SIZE) \
+ { \
+ m_u4_bits |= SHR(m_u4_nxt_word, \
+ (DBL_WORD_SIZE - m_u4_bit_ofst)); \
+ } \
+ \
+ if( m_u4_bit_ofst >= WORD_SIZE ) \
+ { \
+ UWORD32 pu4_word_tmp; \
+ m_u4_cur_word = m_u4_nxt_word; \
+ /* Getting the next word */ \
+ pu4_word_tmp = *(m_pu4_buf++); \
+ \
+ m_u4_bit_ofst -= WORD_SIZE; \
+        /* Convert the fetched word from little to big endian */     \
+ m_u4_nxt_word = ITT_BIG_ENDIAN(pu4_word_tmp); \
+ } \
+}
+
+/**
+******************************************************************************
+* @brief Get 1 bit and update buffer pointers and offset.
+* Data is consumed
+******************************************************************************
+*/
+
+#define BIT_GET(m_u4_bits,m_pu4_buf,m_u4_bit_ofst, \
+ m_u4_cur_word,m_u4_nxt_word) \
+{ \
+ m_u4_bits = (m_u4_cur_word << m_u4_bit_ofst) \
+ >> (WORD_SIZE_MINUS1); \
+ m_u4_bit_ofst++; \
+ \
+ if(m_u4_bit_ofst == WORD_SIZE) \
+ { \
+ UWORD32 pu4_word_tmp; \
+ m_u4_cur_word = m_u4_nxt_word; \
+ /* Getting the next word */ \
+ pu4_word_tmp = *m_pu4_buf++; \
+ \
+ m_u4_bit_ofst = 0; \
+        /* Convert the fetched word from little to big endian */     \
+ m_u4_nxt_word = ITT_BIG_ENDIAN(pu4_word_tmp); \
+ } \
+}
+
+void ihevcd_bits_init(bitstrm_t *ps_bitstrm,
+ UWORD8 *pu1_buf,
+ UWORD32 u4_numbytes);
+void ihevcd_bits_flush(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+void ihevcd_bits_flush_to_byte_boundary(bitstrm_t *ps_bitstrm);
+
+UWORD32 ihevcd_bits_nxt(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+UWORD32 ihevcd_bits_nxt32(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+
+UWORD32 ihevcd_bits_get(bitstrm_t *ps_bitstrm, UWORD32 u4_numbits);
+
+UWORD32 ihevcd_bits_num_bits_remaining(bitstrm_t *ps_bitstrm);
+
+
+UWORD32 ihevcd_bits_num_bits_consumed(bitstrm_t *ps_bitstrm);
+
+UWORD32 ihevcd_uev(bitstrm_t *ps_bitstrm);
+
+WORD32 ihevcd_sev(bitstrm_t *ps_bitstrm);
+
+void ihevcd_bits_seek(bitstrm_t *ps_bitstrm, WORD32 numbits);
+
+#endif /* _IHEVCD_BITSTREAM_H_ */
diff --git a/decoder/ihevcd_boundary_strength.c b/decoder/ihevcd_boundary_strength.c
new file mode 100644
index 0000000..9451e70
--- /dev/null
+++ b/decoder/ihevcd_boundary_strength.c
@@ -0,0 +1,1008 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_boundary_strength.c
+ *
+ * @brief
+ * Contains functions for computing boundary strength
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_profile.h"
+
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+
+
+#define SET_NGBHR_ALL_AVAIL(avail) avail = 0x1F;
+
+#define SET_NGBHR_BOTLEFT_NOTAVAIL(avail) avail &= ~0x10;
+#define SET_NGBHR_LEFT_NOTAVAIL(avail) avail &= ~0x8;
+#define SET_NGBHR_TOPLEFT_NOTAVAIL(avail) avail &= ~0x4;
+#define SET_NGBHR_TOP_NOTAVAIL(avail) avail &= ~0x2;
+#define SET_NGBHR_TOPRIGHT_NOTAVAIL(avail) avail &= ~0x1;
+
+WORD32 ihevcd_pu_boundary_strength(pu_t *ps_pu,
+ pu_t *ps_ngbr_pu)
+{
+ WORD32 i4_bs;
+ UWORD32 l0_ref_pic_buf_id, l1_ref_pic_buf_id;
+ UWORD32 ngbr_l0_ref_pic_buf_id, ngbr_l1_ref_pic_buf_id;
+
+ WORD16 i2_mv_x0, i2_mv_y0, i2_mv_x1, i2_mv_y1;
+ WORD16 i2_ngbr_mv_x0, i2_ngbr_mv_y0, i2_ngbr_mv_x1, i2_ngbr_mv_y1;
+
+ WORD32 num_mv, ngbr_num_mv;
+
+ num_mv = (PRED_BI == ps_pu->b2_pred_mode) ? 2 : 1;
+ ngbr_num_mv = (PRED_BI == ps_ngbr_pu->b2_pred_mode) ? 2 : 1;
+
+ l0_ref_pic_buf_id = ps_pu->mv.i1_l0_ref_pic_buf_id;
+ l1_ref_pic_buf_id = ps_pu->mv.i1_l1_ref_pic_buf_id;
+ ngbr_l0_ref_pic_buf_id = ps_ngbr_pu->mv.i1_l0_ref_pic_buf_id;
+ ngbr_l1_ref_pic_buf_id = ps_ngbr_pu->mv.i1_l1_ref_pic_buf_id;
+
+
+ i2_mv_x0 = ps_pu->mv.s_l0_mv.i2_mvx;
+ i2_mv_y0 = ps_pu->mv.s_l0_mv.i2_mvy;
+ i2_mv_x1 = ps_pu->mv.s_l1_mv.i2_mvx;
+ i2_mv_y1 = ps_pu->mv.s_l1_mv.i2_mvy;
+
+ i2_ngbr_mv_x0 = ps_ngbr_pu->mv.s_l0_mv.i2_mvx;
+ i2_ngbr_mv_y0 = ps_ngbr_pu->mv.s_l0_mv.i2_mvy;
+ i2_ngbr_mv_x1 = ps_ngbr_pu->mv.s_l1_mv.i2_mvx;
+ i2_ngbr_mv_y1 = ps_ngbr_pu->mv.s_l1_mv.i2_mvy;
+
+
+ /* If two motion vectors are used */
+ if((2 == num_mv) &&
+ (2 == ngbr_num_mv))
+ {
+ if((l0_ref_pic_buf_id == ngbr_l0_ref_pic_buf_id && l1_ref_pic_buf_id == ngbr_l1_ref_pic_buf_id) ||
+ (l0_ref_pic_buf_id == ngbr_l1_ref_pic_buf_id && l1_ref_pic_buf_id == ngbr_l0_ref_pic_buf_id))
+ {
+ if(l0_ref_pic_buf_id != l1_ref_pic_buf_id) /* Different L0 and L1 */
+ {
+ if(l0_ref_pic_buf_id == ngbr_l0_ref_pic_buf_id)
+ {
+ i4_bs = (ABS(i2_mv_x0 - i2_ngbr_mv_x0) < 4) &&
+ (ABS(i2_mv_y0 - i2_ngbr_mv_y0) < 4) &&
+ (ABS(i2_mv_x1 - i2_ngbr_mv_x1) < 4) &&
+ (ABS(i2_mv_y1 - i2_ngbr_mv_y1) < 4) ? 0 : 1;
+ }
+ else
+ {
+ i4_bs = (ABS(i2_mv_x0 - i2_ngbr_mv_x1) < 4) &&
+ (ABS(i2_mv_y0 - i2_ngbr_mv_y1) < 4) &&
+ (ABS(i2_mv_x1 - i2_ngbr_mv_x0) < 4) &&
+ (ABS(i2_mv_y1 - i2_ngbr_mv_y0) < 4) ? 0 : 1;
+ }
+ }
+ else /* Same L0 and L1 */
+ {
+ i4_bs = ((ABS(i2_mv_x0 - i2_ngbr_mv_x0) >= 4) ||
+ (ABS(i2_mv_y0 - i2_ngbr_mv_y0) >= 4) ||
+ (ABS(i2_mv_x1 - i2_ngbr_mv_x1) >= 4) ||
+ (ABS(i2_mv_y1 - i2_ngbr_mv_y1) >= 4)) &&
+ ((ABS(i2_mv_x0 - i2_ngbr_mv_x1) >= 4) ||
+ (ABS(i2_mv_y0 - i2_ngbr_mv_y1) >= 4) ||
+ (ABS(i2_mv_x1 - i2_ngbr_mv_x0) >= 4) ||
+ (ABS(i2_mv_y1 - i2_ngbr_mv_y0) >= 4)) ? 1 : 0;
+ }
+ }
+ else /* If the reference pictures used are different */
+ {
+ i4_bs = 1;
+ }
+ }
+
+ /* If one motion vector is used in both PUs */
+ else if((1 == num_mv) &&
+ (1 == ngbr_num_mv))
+ {
+ WORD16 i2_mv_x, i2_mv_y;
+ WORD16 i2_ngbr_mv_x, i2_ngbr_mv_y;
+ UWORD32 ref_pic_buf_id, ngbr_ref_pic_buf_id;
+
+ if(PRED_L0 == ps_pu->b2_pred_mode)
+ {
+ i2_mv_x = i2_mv_x0;
+ i2_mv_y = i2_mv_y0;
+ ref_pic_buf_id = l0_ref_pic_buf_id;
+ }
+ else
+ {
+ i2_mv_x = i2_mv_x1;
+ i2_mv_y = i2_mv_y1;
+ ref_pic_buf_id = l1_ref_pic_buf_id;
+ }
+
+ if(PRED_L0 == ps_ngbr_pu->b2_pred_mode)
+ {
+ i2_ngbr_mv_x = i2_ngbr_mv_x0;
+ i2_ngbr_mv_y = i2_ngbr_mv_y0;
+ ngbr_ref_pic_buf_id = ngbr_l0_ref_pic_buf_id;
+ }
+ else
+ {
+ i2_ngbr_mv_x = i2_ngbr_mv_x1;
+ i2_ngbr_mv_y = i2_ngbr_mv_y1;
+ ngbr_ref_pic_buf_id = ngbr_l1_ref_pic_buf_id;
+ }
+
+ i4_bs = (ref_pic_buf_id == ngbr_ref_pic_buf_id) &&
+ (ABS(i2_mv_x - i2_ngbr_mv_x) < 4) &&
+ (ABS(i2_mv_y - i2_ngbr_mv_y) < 4) ? 0 : 1;
+ }
+
+ /* If the no. of motion vectors is not the same */
+ else
+ {
+ i4_bs = 1;
+ }
+
+
+ return i4_bs;
+}
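+
+/* Worked example (illustrative): two uni-predicted PUs referencing the same
+ * picture with MVs (4, 0) and (7, 0) differ by 3 quarter-pels, below the
+ * threshold of 4, so BS = 0; MVs (4, 0) and (8, 0) differ by a full pel
+ * and give BS = 1.
+ */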
+
+/* QP is also populated in the same function */
+WORD32 ihevcd_ctb_boundary_strength_islice(bs_ctxt_t *ps_bs_ctxt)
+{
+ pps_t *ps_pps;
+ sps_t *ps_sps;
+ tu_t *ps_tu;
+ UWORD32 *pu4_vert_bs;
+ UWORD32 *pu4_horz_bs;
+ WORD32 vert_bs_strd;
+ WORD32 horz_bs_strd;
+ WORD32 vert_bs0_tmp;
+ WORD32 horz_bs0_tmp;
+ UWORD8 *pu1_qp;
+ WORD32 qp_strd;
+ UWORD32 u4_qp_const_in_ctb;
+ WORD32 ctb_indx;
+ WORD32 i4_tu_cnt;
+ WORD32 log2_ctb_size;
+ WORD32 ctb_size;
+
+ WORD8 i1_loop_filter_across_tiles_enabled_flag;
+ WORD8 i1_loop_filter_across_slices_enabled_flag;
+
+ WORD32 i;
+
+ PROFILE_DISABLE_BOUNDARY_STRENGTH();
+
+ ps_pps = ps_bs_ctxt->ps_pps;
+ ps_sps = ps_bs_ctxt->ps_sps;
+ i1_loop_filter_across_tiles_enabled_flag = ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+ i1_loop_filter_across_slices_enabled_flag = ps_bs_ctxt->ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag;
+ i4_tu_cnt = ps_bs_ctxt->i4_ctb_tu_cnt;
+
+ log2_ctb_size = ps_sps->i1_log2_ctb_size;
+ ctb_size = (1 << log2_ctb_size);
+
+ /* strides are in units of number of bytes */
+ /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */
+ vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7);
+ horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7);
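+    /* e.g. for a 64x64 CTB (log2_ctb_size = 6): 8 vertical edges on the
+     * 8 pixel grid, each with 16 two-bit BS values = 4 bytes per edge,
+     * i.e. 32 bytes per CTB, matching the shift by 2 * 6 - 7 = 5 */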
+ pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_vert_bs +
+ (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+ ps_bs_ctxt->i4_ctb_y * vert_bs_strd);
+ pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_horz_bs +
+ (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+ ps_bs_ctxt->i4_ctb_y * horz_bs_strd);
+
+ /* ctb_size/8 elements per CTB */
+ qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+ pu1_qp = ps_bs_ctxt->pu1_pic_qp + ((ps_bs_ctxt->i4_ctb_x + ps_bs_ctxt->i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
+
+ ctb_indx = ps_bs_ctxt->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_bs_ctxt->i4_ctb_y;
+ u4_qp_const_in_ctb = ps_bs_ctxt->pu1_pic_qp_const_in_ctb[ctb_indx >> 3] & (1 << (ctb_indx & 7));
+
+ vert_bs0_tmp = pu4_vert_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+ horz_bs0_tmp = pu4_horz_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+
+ /* ctb_size/8 is the number of edges per CTB
+ * ctb_size/4 is the number of BS values needed per edge
+ * divided by 8 for the number of bytes
+ * 2 is the number of bits needed for each BS value */
+/*
+ memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2 );
+ memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2 );
+*/
+ memset(pu4_vert_bs, 0, (1 << (2 * log2_ctb_size - 7)) + ((ctb_size >> 5) << 1));
+ memset(pu4_horz_bs, 0, (1 << (2 * log2_ctb_size - 7)));
+
+ /* pu4_vert_bs[0] has information about the left CTB which is not required when ctb_x = 0 */
+ if(0 != ps_bs_ctxt->i4_ctb_x)
+ {
+ pu4_vert_bs[0] |= vert_bs0_tmp;
+ }
+
+ /* pu4_horz_bs[0] has information about the top CTB which is not required when ctb_y = 0 */
+ if(0 != ps_bs_ctxt->i4_ctb_y)
+ {
+ pu4_horz_bs[0] |= horz_bs0_tmp;
+ }
+
+ ps_tu = ps_bs_ctxt->ps_tu;
+
+ /* Populating the QP array - if const_qp_in_ctb flag is one, set only the first element */
+ if(u4_qp_const_in_ctb)
+ pu1_qp[0] = ps_tu->b7_qp;
+
+ for(i = 0; i < i4_tu_cnt; i++)
+ {
+ WORD32 start_pos_x;
+ WORD32 start_pos_y;
+ WORD32 tu_size;
+
+
+ UWORD32 u4_bs;
+ ps_tu = ps_bs_ctxt->ps_tu + i;
+
+ /* start_pos_x and start_pos_y are in units of min TU size (4x4) */
+ start_pos_x = ps_tu->b4_pos_x;
+ start_pos_y = ps_tu->b4_pos_y;
+
+ tu_size = 1 << (ps_tu->b3_size + 2);
+ tu_size >>= 2; /* TU size divided by 4 */
+
+ u4_bs = DUP_LSB_10(tu_size);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_x & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+ pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_y & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+ pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+
+ /* Populating the QP array */
+ if(0 == u4_qp_const_in_ctb)
+ {
+ if(0 == (start_pos_x & 1) && 0 == (start_pos_y & 1))
+ {
+ WORD32 row, col;
+ for(row = start_pos_y; row < start_pos_y + tu_size; row += 2)
+ {
+ for(col = start_pos_x; col < start_pos_x + tu_size; col += 2)
+ {
+ pu1_qp[(row >> 1) * qp_strd + (col >> 1)] = ps_tu->b7_qp;
+ }
+ }
+ }
+ }
+
+ }
+ {
+        /* Determine if the slice is dependent and whether its left neighbor belongs to the same slice but a different tile */
+ UWORD32 ctb_addr;
+ WORD32 slice_idx, left_slice_idx = -1, top_slice_idx = -1;
+ /* If left neighbor is not available, then set BS for entire first column to zero */
+ if(!ps_pps->i1_tiles_enabled_flag)
+ {
+ if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+ (0 == i1_loop_filter_across_slices_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) ||
+ (0 == ps_bs_ctxt->i4_ctb_x))
+ {
+ pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ }
+ else
+ {
+ //If across-tiles is disabled
+ if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x))
+ {
+ pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ else
+ {
+ ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ if(ps_bs_ctxt->i4_ctb_x)
+ {
+ ctb_addr = (ps_bs_ctxt->i4_ctb_x - 1) + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ left_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ }
+ /*If the 1st slice in a new tile is a dependent slice*/
+ if(!((ps_bs_ctxt->ps_slice_hdr->i1_dependent_slice_flag == 1) && (slice_idx == left_slice_idx)))
+ {
+ if((0 == i1_loop_filter_across_slices_enabled_flag && (
+ (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) || (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+ ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) ||
+ (0 == ps_bs_ctxt->i4_ctb_x))
+ {
+ pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ }
+ }
+ }
+
+ ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ if(ps_bs_ctxt->i4_ctb_y)
+ {
+ ctb_addr = (ps_bs_ctxt->i4_ctb_x) + ((ps_bs_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+ top_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ }
+
+ /* If top neighbor is not available, then set BS for entire first row to zero */
+ if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_y)
+ || (0 == i1_loop_filter_across_slices_enabled_flag && ((0 == ps_bs_ctxt->i4_ctb_slice_y) || (slice_idx != top_slice_idx)))
+ || (0 == ps_bs_ctxt->i4_ctb_y))
+ {
+ pu4_horz_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ }
+
+ /**
+ * Set BS of bottom and right frame boundaries to zero if it is an incomplete CTB
+ * (They might have been set to non zero values because of CBF of the current CTB)
+ * This block might not be needed for I slices*/
+ {
+ WORD32 num_rows_remaining = (ps_sps->i2_pic_height_in_luma_samples - (ps_bs_ctxt->i4_ctb_y << log2_ctb_size)) >> 3;
+ WORD32 num_cols_remaining = (ps_sps->i2_pic_width_in_luma_samples - (ps_bs_ctxt->i4_ctb_x << log2_ctb_size)) >> 3;
+ if(num_rows_remaining < (ctb_size >> 3))
+ {
+ /* WORD32 offset = (((num_rows_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ WORD32 offset;
+ offset = (num_rows_remaining >> (6 - log2_ctb_size)) << 2;
+ if(6 != log2_ctb_size)
+ offset += (num_rows_remaining & 1) << (log2_ctb_size - 4);
+
+ memset(((UWORD8 *)pu4_horz_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+ }
+
+ if(num_cols_remaining < (ctb_size >> 3))
+ {
+ /* WORD32 offset = (((num_cols_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+
+ WORD32 offset;
+ offset = (num_cols_remaining >> (6 - log2_ctb_size)) << 2;
+ if(6 != log2_ctb_size)
+ offset += (num_cols_remaining & 1) << (log2_ctb_size - 4);
+
+ memset(((UWORD8 *)pu4_vert_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+ }
+ }
+
+ return 0;
+}
+WORD32 ihevcd_ctb_boundary_strength_pbslice(bs_ctxt_t *ps_bs_ctxt)
+{
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ WORD32 cur_ctb_idx, next_ctb_idx = 0;
+ WORD32 i4_tu_cnt;
+ WORD32 i4_pu_cnt;
+ tu_t *ps_tu;
+
+ UWORD32 *pu4_vert_bs;
+ UWORD32 *pu4_horz_bs;
+ WORD32 vert_bs_strd;
+ WORD32 horz_bs_strd;
+ WORD32 vert_bs0_tmp;
+ WORD32 horz_bs0_tmp;
+ UWORD8 *pu1_qp;
+ WORD32 qp_strd;
+ UWORD32 u4_qp_const_in_ctb;
+ WORD32 ctb_indx;
+ WORD32 log2_ctb_size;
+ WORD32 ctb_size;
+
+ WORD32 i;
+ WORD8 i1_loop_filter_across_tiles_enabled_flag;
+ WORD8 i1_loop_filter_across_slices_enabled_flag;
+
+ PROFILE_DISABLE_BOUNDARY_STRENGTH();
+
+ ps_sps = ps_bs_ctxt->ps_sps;
+ ps_pps = ps_bs_ctxt->ps_pps;
+
+ log2_ctb_size = ps_sps->i1_log2_ctb_size;
+ ctb_size = (1 << log2_ctb_size);
+
+ /* strides are in units of number of bytes */
+ /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */
+ vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7);
+ horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7);
+ pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_vert_bs +
+ (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+ ps_bs_ctxt->i4_ctb_y * vert_bs_strd);
+ pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_bs_ctxt->pu4_pic_horz_bs +
+ (ps_bs_ctxt->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+ ps_bs_ctxt->i4_ctb_y * horz_bs_strd);
+
+ vert_bs0_tmp = pu4_vert_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+ horz_bs0_tmp = pu4_horz_bs[0] & (0xFFFFFFFF >> (sizeof(UWORD32) * 8 - ctb_size / 2));
+
+ ps_tu = ps_bs_ctxt->ps_tu;
+
+ /* ctb_size/8 elements per CTB */
+ qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+ pu1_qp = ps_bs_ctxt->pu1_pic_qp + ((ps_bs_ctxt->i4_ctb_x + ps_bs_ctxt->i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
+
+ ctb_indx = ps_bs_ctxt->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_bs_ctxt->i4_ctb_y;
+ u4_qp_const_in_ctb = ps_bs_ctxt->pu1_pic_qp_const_in_ctb[ctb_indx >> 3] & (1 << (ctb_indx & 7));
+
+ i1_loop_filter_across_tiles_enabled_flag = ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+ i1_loop_filter_across_slices_enabled_flag = ps_bs_ctxt->ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag;
+
+ /* ctb_size/8 is the number of edges per CTB
+ * ctb_size/4 is the number of BS values needed per edge
+ * divided by 8 for the number of bytes
+ * 2 is the number of bits needed for each BS value */
+/*
+ memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) * 2 / 8 );
+ memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) * 2 / 8 );
+*/
+ memset(pu4_vert_bs, 0, (1 << (2 * log2_ctb_size - 7)) + (ctb_size >> 4));
+ memset(pu4_horz_bs, 0, (1 << (2 * log2_ctb_size - 7)));
+
+ /* pu4_vert_bs[0] has information about the left CTB which is not required when ctb_x = 0 */
+ if(0 != ps_bs_ctxt->i4_ctb_x)
+ {
+ pu4_vert_bs[0] |= vert_bs0_tmp;
+ }
+
+ /* pu4_horz_bs[0] has information about the top CTB which is not required when ctb_y = 0 */
+ if(0 != ps_bs_ctxt->i4_ctb_y)
+ {
+ pu4_horz_bs[0] |= horz_bs0_tmp;
+ }
+ /* pu4_horz_bs[horz_bs_strd / 4] corresponds to pu4_horz_bs[0] of the bottom CTB */
+ *(UWORD32 *)((UWORD8 *)pu4_horz_bs + horz_bs_strd) = 0;
+
+ cur_ctb_idx = ps_bs_ctxt->i4_ctb_x
+ + ps_bs_ctxt->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ next_ctb_idx = ps_bs_ctxt->i4_next_tu_ctb_cnt;
+ if(1 == ps_bs_ctxt->ps_codec->i4_num_cores)
+ {
+ i4_tu_cnt = ps_bs_ctxt->pu4_pic_tu_idx[next_ctb_idx] - ps_bs_ctxt->pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+ }
+ else
+ {
+ i4_tu_cnt = ps_bs_ctxt->pu4_pic_tu_idx[next_ctb_idx] - ps_bs_ctxt->pu4_pic_tu_idx[cur_ctb_idx];
+ }
+
+ ps_tu = ps_bs_ctxt->ps_tu;
+ if(u4_qp_const_in_ctb)
+ pu1_qp[0] = ps_tu->b7_qp;
+
+ /* For all TUs in the CTB For left and top edges, check if there are coded coefficients on either sides of the edge */
+ for(i = 0; i < i4_tu_cnt; i++)
+ {
+ WORD32 start_pos_x;
+ WORD32 start_pos_y;
+ WORD32 end_pos_x;
+ WORD32 end_pos_y;
+ WORD32 tu_size;
+ UWORD32 u4_bs;
+ WORD32 intra_flag;
+ UWORD8 *pu1_pic_intra_flag;
+
+ ps_tu = ps_bs_ctxt->ps_tu + i;
+
+ start_pos_x = ps_tu->b4_pos_x;
+ start_pos_y = ps_tu->b4_pos_y;
+
+ tu_size = 1 << (ps_tu->b3_size + 2);
+ tu_size >>= 2;
+
+ end_pos_x = start_pos_x + tu_size;
+ end_pos_y = start_pos_y + tu_size;
+
+ {
+ WORD32 tu_abs_x = (ps_bs_ctxt->i4_ctb_x << log2_ctb_size) + (start_pos_x << 2);
+ WORD32 tu_abs_y = (ps_bs_ctxt->i4_ctb_y << log2_ctb_size) + (start_pos_y << 2);
+
+ WORD32 numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+
+ pu1_pic_intra_flag = ps_bs_ctxt->ps_codec->pu1_pic_intra_flag;
+ pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
+ pu1_pic_intra_flag += (tu_abs_x >> 6);
+
+ intra_flag = *pu1_pic_intra_flag;
+ intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
+ }
+ if(intra_flag)
+ {
+ u4_bs = DUP_LSB_10(tu_size);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_x & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+ pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_y & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+ pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+
+
+ /* If the current TU is coded then set both top edge and left edge BS to 1 and go to next TU */
+ if(ps_tu->b1_y_cbf)
+ {
+ u4_bs = DUP_LSB_01(tu_size);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_x & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+ pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_y & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+ pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (end_pos_x & 1))
+ {
+ if(!(ctb_size / 8 == (end_pos_x >> 1) && ps_bs_ctxt->i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ shift += (((end_pos_x >> 1) & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1));
+ pu4_vert_bs[end_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (end_pos_y & 1))
+ {
+ /* If end_pos_y corresponds to the bottom of the CTB, write to pu4_horz_bs[0] of the bottom CTB */
+ if(ctb_size / 8 == (end_pos_y >> 1))
+ {
+ *(UWORD32 *)((UWORD8 *)pu4_horz_bs + horz_bs_strd) |= (u4_bs << (start_pos_x * 2));
+ }
+ else
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ shift += (((end_pos_y >> 1) & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1));
+ pu4_horz_bs[end_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+ }
+
+ if(0 == u4_qp_const_in_ctb)
+ {
+ if(0 == (start_pos_x & 1) && 0 == (start_pos_y & 1))
+ {
+ WORD32 row, col;
+ for(row = start_pos_y; row < start_pos_y + tu_size; row += 2)
+ {
+ for(col = start_pos_x; col < start_pos_x + tu_size; col += 2)
+ {
+ pu1_qp[(row >> 1) * qp_strd + (col >> 1)] = ps_tu->b7_qp;
+ }
+ }
+ }
+ }
+ }
+
+ /* For all PUs in the CTB,
+ For left and top edges, compute BS */
+
+ cur_ctb_idx = ps_bs_ctxt->i4_ctb_x
+ + ps_bs_ctxt->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ {
+ WORD32 next_ctb_idx;
+ next_ctb_idx = ps_bs_ctxt->i4_next_pu_ctb_cnt;
+ i4_pu_cnt = ps_bs_ctxt->pu4_pic_pu_idx[next_ctb_idx] - ps_bs_ctxt->pu4_pic_pu_idx[cur_ctb_idx];
+ }
+
+ for(i = 0; i < i4_pu_cnt; i++)
+ {
+ WORD32 start_pos_x;
+ WORD32 start_pos_y;
+ WORD32 end_pos_x;
+ WORD32 end_pos_y;
+ WORD32 pu_wd, pu_ht;
+ UWORD32 u4_bs;
+ pu_t *ps_pu = ps_bs_ctxt->ps_pu + i;
+ pu_t *ps_ngbr_pu;
+ UWORD32 u4_ngbr_pu_indx;
+
+ start_pos_x = ps_pu->b4_pos_x;
+ start_pos_y = ps_pu->b4_pos_y;
+
+ pu_wd = (ps_pu->b4_wd + 1);
+ pu_ht = (ps_pu->b4_ht + 1);
+
+ end_pos_x = start_pos_x + pu_wd;
+ end_pos_y = start_pos_y + pu_ht;
+
+ /* If the current PU is intra, set Boundary strength as 2 for both top and left edge */
+ /* Need not mask the BS to zero even if it was set to 1 already since BS 2 and 3 are assumed to be the same in leaf level functions */
+ if(ps_pu->b1_intra_flag)
+ {
+ u4_bs = DUP_LSB_10(pu_ht);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_x & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+ pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+
+ u4_bs = DUP_LSB_10(pu_wd);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_y & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+ pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+
+ else
+ {
+ /* Vertical edge */
+ /* Process only if the edge is not a frame edge */
+ if(0 != ps_bs_ctxt->i4_ctb_x + start_pos_x)
+ {
+ do
+ {
+ WORD32 pu_ngbr_ht;
+ WORD32 min_pu_ht;
+ WORD32 ngbr_end_pos_y;
+ UWORD32 ngbr_pu_idx_strd;
+ ngbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+ u4_ngbr_pu_indx = ps_bs_ctxt->pu4_pic_pu_idx_map[(start_pos_y + 1) * ngbr_pu_idx_strd + (start_pos_x)];
+ ps_ngbr_pu = ps_bs_ctxt->ps_pic_pu + u4_ngbr_pu_indx;
+
+ pu_ngbr_ht = ps_ngbr_pu->b4_ht + 1;
+ ngbr_end_pos_y = ps_ngbr_pu->b4_pos_y + pu_ngbr_ht;
+
+ min_pu_ht = MIN(ngbr_end_pos_y, end_pos_y) - start_pos_y;
+
+ if(ps_ngbr_pu->b1_intra_flag)
+ {
+ u4_bs = DUP_LSB_10(min_pu_ht);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_x & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+ pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+ else
+ {
+ u4_bs = ihevcd_pu_boundary_strength(ps_pu, ps_ngbr_pu);
+ if(u4_bs)
+ {
+ u4_bs = DUP_LSB_01(min_pu_ht);
+ if(0 == (start_pos_x & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_y * 2;
+ /* shift += (((start_pos_x >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_x & 2) << (log2_ctb_size - 2));
+ pu4_vert_bs[start_pos_x >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+ }
+
+ pu_ht -= min_pu_ht;
+ start_pos_y += min_pu_ht;
+ }while(pu_ht > 0);
+
+ /* Reinitialising since the values are updated in the previous loop */
+ pu_ht = ps_pu->b4_ht + 1;
+ start_pos_y = ps_pu->b4_pos_y;
+ }
+
+ /* Horizontal edge */
+ /* Process only if the edge is not a frame edge */
+ if(0 != ps_bs_ctxt->i4_ctb_y + start_pos_y)
+ {
+ do
+ {
+ WORD32 pu_ngbr_wd;
+ WORD32 min_pu_wd;
+ WORD32 ngbr_end_pos_x;
+ UWORD32 ngbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+ u4_ngbr_pu_indx = ps_bs_ctxt->pu4_pic_pu_idx_map[(start_pos_y)*ngbr_pu_idx_strd + (start_pos_x + 1)];
+ ps_ngbr_pu = ps_bs_ctxt->ps_pic_pu + u4_ngbr_pu_indx;
+
+ pu_ngbr_wd = ps_ngbr_pu->b4_wd + 1;
+ ngbr_end_pos_x = ps_ngbr_pu->b4_pos_x + pu_ngbr_wd;
+
+ min_pu_wd = MIN(ngbr_end_pos_x, end_pos_x) - start_pos_x;
+
+ if(ps_ngbr_pu->b1_intra_flag)
+ {
+ u4_bs = DUP_LSB_10(min_pu_wd);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_y & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+ pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+ else
+ {
+ u4_bs = ihevcd_pu_boundary_strength(ps_pu, ps_ngbr_pu);
+ if(u4_bs)
+ {
+ u4_bs = DUP_LSB_01(min_pu_wd);
+
+ /* Only if the current edge falls on 8 pixel grid set BS */
+ if(0 == (start_pos_y & 1))
+ {
+ WORD32 shift;
+ shift = start_pos_x * 2;
+ /* shift += (((start_pos_y >> 1) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 1));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift += ((start_pos_y & 2) << (log2_ctb_size - 2));
+ pu4_horz_bs[start_pos_y >> (7 - log2_ctb_size)] |= (u4_bs << shift);
+ }
+ }
+ }
+
+ pu_wd -= min_pu_wd;
+ start_pos_x += min_pu_wd;
+ }while(pu_wd > 0);
+
+ /* Reinitialising since the values are updated in the previous loop */
+ pu_wd = ps_pu->b4_wd + 1;
+ start_pos_x = ps_pu->b4_pos_x;
+ }
+ }
+ }
+
+ {
+ /* If left neighbor is not available, then set BS for entire first column to zero */
+ UWORD32 ctb_addr;
+ WORD32 slice_idx, left_slice_idx = -1, top_slice_idx = -1;
+
+ if(!ps_pps->i1_tiles_enabled_flag)
+ {
+ if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x) ||
+ (0 == i1_loop_filter_across_slices_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) ||
+ (0 == ps_bs_ctxt->i4_ctb_x))
+ {
+ pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ }
+ else
+ {
+ if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_x))
+ {
+ pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ else
+ {
+
+ ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+
+ if(ps_bs_ctxt->i4_ctb_x)
+ {
+ ctb_addr = (ps_bs_ctxt->i4_ctb_x - 1) + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ left_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ }
+
+ if(!((ps_bs_ctxt->ps_slice_hdr->i1_dependent_slice_flag == 1) && (slice_idx == left_slice_idx)))
+ {
+ if((0 == i1_loop_filter_across_slices_enabled_flag && (
+ (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_slice_y) || (0 == ps_bs_ctxt->i4_ctb_slice_x && 0 == ps_bs_ctxt->i4_ctb_tile_x)
+ || ((0 == ps_bs_ctxt->i4_ctb_tile_x) && (slice_idx != left_slice_idx)))) || (0 == ps_bs_ctxt->i4_ctb_x))
+ {
+ pu4_vert_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ }
+ }
+ }
+
+ ctb_addr = ps_bs_ctxt->i4_ctb_x + (ps_bs_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ if(ps_bs_ctxt->i4_ctb_y)
+ {
+ ctb_addr = (ps_bs_ctxt->i4_ctb_x) + ((ps_bs_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+ top_slice_idx = ps_bs_ctxt->pu1_slice_idx[ctb_addr];
+ }
+ /* If top neighbor is not available, then set BS for entire first row to zero */
+ if((0 == i1_loop_filter_across_tiles_enabled_flag && 0 == ps_bs_ctxt->i4_ctb_tile_y)
+ || (0 == i1_loop_filter_across_slices_enabled_flag && ((0 == ps_bs_ctxt->i4_ctb_slice_y) || (slice_idx != top_slice_idx)))
+ || (0 == ps_bs_ctxt->i4_ctb_y))
+ {
+ pu4_horz_bs[0] &= (64 == ctb_size) ? 0 : ((UWORD32)0xFFFFFFFF) << (ctb_size / 2);
+ }
+ }
+
+    /**
+     * Set BS of bottom and right frame boundaries to zero if it is an incomplete CTB
+     * (They might have been set to non-zero values because of CBF of the current CTB)*/
+ {
+ WORD32 num_rows_remaining = (ps_sps->i2_pic_height_in_luma_samples - (ps_bs_ctxt->i4_ctb_y << log2_ctb_size)) >> 3;
+ WORD32 num_cols_remaining = (ps_sps->i2_pic_width_in_luma_samples - (ps_bs_ctxt->i4_ctb_x << log2_ctb_size)) >> 3;
+ if(num_rows_remaining < (ctb_size >> 3))
+ {
+ /* WORD32 offset = (((num_rows_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ WORD32 offset;
+ offset = (num_rows_remaining >> (6 - log2_ctb_size)) << 2;
+ if(6 != log2_ctb_size)
+ offset += (num_rows_remaining & 1) << (log2_ctb_size - 4);
+
+ memset(((UWORD8 *)pu4_horz_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+ }
+
+ if(num_cols_remaining < (ctb_size >> 3))
+ {
+ /* WORD32 offset = (((num_cols_remaining >> 3) & (MAX_CTB_SIZE / ctb_size - 1)) << (log2_ctb_size - 4));
+ * will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+
+ WORD32 offset;
+ offset = (num_cols_remaining >> (6 - log2_ctb_size)) << 2;
+ if(6 != log2_ctb_size)
+ offset += (num_cols_remaining & 1) << (log2_ctb_size - 4);
+
+ memset(((UWORD8 *)pu4_vert_bs) + offset, 0, 1 << (log2_ctb_size - 4));
+ }
+ }
+ return 0;
+}
diff --git a/decoder/ihevcd_boundary_strength.h b/decoder/ihevcd_boundary_strength.h
new file mode 100644
index 0000000..c2f3e16
--- /dev/null
+++ b/decoder/ihevcd_boundary_strength.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_boundary_strength.h
+*
+* @brief
+* Header for boundary strength computation functions
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_BOUNDARY_STRENGTH_H_
+#define IHEVCD_BOUNDARY_STRENGTH_H_
+
+WORD32 ihevcd_ctb_boundary_strength_islice(bs_ctxt_t *ps_bs_ctxt);
+
+WORD32 ihevcd_ctb_boundary_strength_pbslice(bs_ctxt_t *ps_bs_ctxt);
+
+WORD32 ihevcd_pu_boundary_strength(pu_t *ps_pu,
+ pu_t *ps_ngbr_pu);
+
+
+
+#endif /* IHEVCD_BOUNDARY_STRENGTH_H_ */
diff --git a/decoder/ihevcd_cabac.c b/decoder/ihevcd_cabac.c
new file mode 100644
index 0000000..07e9e54
--- /dev/null
+++ b/decoder/ihevcd_cabac.c
@@ -0,0 +1,845 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ ******************************************************************************
+ * @file ihevcd_cabac.c
+ *
+ * @brief
+ * This file contains function definitions related to CABAC parsing
+ *
+ * @author
+ * Ittiam
+ *
+ *
+ * List of Functions
+ *
+ * ihevcd_cabac_init()
+ * ihevcd_cabac_decode_bin()
+ * ihevcd_cabac_decode_bypass_bin()
+ * ihevcd_cabac_decode_bypass_bins_tunary()
+ * ihevcd_cabac_decode_terminate()
+ * ihevcd_cabac_decode_bin_tunary()
+ * ihevcd_cabac_decode_bypass_bins()
+ * ihevcd_cabac_decode_bypass_bins_egk()
+ * ihevcd_cabac_decode_trunc_rice()
+ * ihevcd_cabac_flush()
+ *
+ ******************************************************************************
+ */
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+
+#include "ihevc_debug.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_cabac.h"
+#include "ihevcd_trace.h"
+
+#ifdef TRACE
+extern trace_t g_trace;
+#endif
+#if DEBUG_CABAC_RANGE_OFST
+#if FULLRANGE
+#define DEBUG_RANGE_OFST(str, m_range, m_ofst ) \
+{\
+ UWORD32 m_clz, m_range_shift, m_ofst_shift; \
+ m_clz = CLZ(m_range); \
+ m_clz -= (32 - RANGE_NUMBITS); \
+ m_range_shift = m_range << m_clz; \
+ m_range_shift = m_range_shift >> RANGE_SHIFT; \
+ m_ofst_shift = m_ofst << m_clz; \
+ m_ofst_shift = m_ofst_shift >> RANGE_SHIFT; \
+ fprintf( g_trace.fp, "%-40s R: %3d O: %3d\n", str, m_range_shift, m_ofst_shift); \
+}
+
+#else
+#define DEBUG_RANGE_OFST(str, m_range, m_ofst) \
+ fprintf( g_trace.fp, "%-40s R: %3d O: %3d\n", str, m_range, m_ofst);
+#endif
+#else
+#define DEBUG_RANGE_OFST(str, m_range, m_ofst )
+#endif
+/*****************************************************************************/
+/* Function Definitions */
+/*****************************************************************************/
+
+/**
+ ******************************************************************************
+ *
+ * @brief Initializes the decoder cabac engine
+ *
+ * @par Description
+ * This routine needs to be called at start of slice/frame decode
+ *
+ * @param[in,out] ps_cabac_ctxt
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ * pointer to bitstream context (handle)
+ *
+ * @param[in] qp
+ * current slice Qp
+ *
+ * @param[in] cabac_init_idc
+ * current slice init idc (range [0 - 2])
+ *
+ * @param[in] pu1_init_ctxt
+ *  Pointer to the cabac context initialization table to be used
+ *
+ * @return success or failure error code
+ *
+ ******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_cabac_init(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 qp,
+ WORD32 cabac_init_idc,
+ const UWORD8 *pu1_init_ctxt)
+{
+ /* Sanity checks */
+ ASSERT(ps_cabac != NULL);
+ ASSERT(ps_bitstrm != NULL);
+ ASSERT((qp >= 0) && (qp < 52));
+ ASSERT((cabac_init_idc >= 0) && (cabac_init_idc < 3));
+ UNUSED(qp);
+ UNUSED(cabac_init_idc);
+ /* CABAC engine uses 32 bit range instead of 9 bits as specified by
+     * the spec. This is done to reduce the number of renormalizations
+ */
+ /* cabac engine initialization */
+#if FULLRANGE
+ ps_cabac->u4_range = (UWORD32)510 << RANGE_SHIFT;
+ BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, (9 + RANGE_SHIFT));
+
+#else
+ ps_cabac->u4_range = (UWORD32)510;
+ BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, 9);
+
+#endif
+
+ /* cabac context initialization based on init idc and slice qp */
+ memcpy(ps_cabac->au1_ctxt_models,
+ pu1_init_ctxt,
+ IHEVC_CAB_CTXT_END);
+ DEBUG_RANGE_OFST("init", ps_cabac->u4_range, ps_cabac->u4_ofst);
+ return ((IHEVCD_ERROR_T)IHEVCD_SUCCESS);
+}
+
+IHEVCD_ERROR_T ihevcd_cabac_reset(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm)
+{
+ /* Sanity checks */
+ ASSERT(ps_cabac != NULL);
+ ASSERT(ps_bitstrm != NULL);
+
+ /* CABAC engine uses 32 bit range instead of 9 bits as specified by
+     * the spec. This is done to reduce the number of renormalizations
+ */
+ /* cabac engine initialization */
+#if FULLRANGE
+ ps_cabac->u4_range = (UWORD32)510 << RANGE_SHIFT;
+ BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, (9 + RANGE_SHIFT));
+
+#else
+ ps_cabac->u4_range = (UWORD32)510;
+ BITS_GET(ps_cabac->u4_ofst, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, 9);
+
+#endif
+
+ return ((IHEVCD_ERROR_T)IHEVCD_SUCCESS);
+}
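+
+/* Illustrative usage sketch (not part of the decoder): slice_qp, init_idc and
+ * pu1_init_ctxt below are placeholders for values derived from the slice
+ * header and the context initialization tables.
+ */
+#if 0
+{
+    WORD32 slice_qp = 30;   /* placeholder slice QP */
+    WORD32 init_idc = 0;    /* placeholder cabac_init_idc */
+    ihevcd_cabac_init(ps_cabac, ps_bitstrm, slice_qp, init_idc, pu1_init_ctxt);
+    /* A dependent slice segment that continues a slice would instead call
+     * ihevcd_cabac_reset(ps_cabac, ps_bitstrm) and retain the context models */
+}
+#endif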
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a bin based on probability and mps packed context model
+ *
+ * @par Description
+ *  Decodes a bin as per Section 9.3.3.2.1 and calls renormalization if required
+ *  as per Section 9.3.3.2.2
+ *  1. Apart from decoding the bin, the context model is updated as per the state transition
+ *  2. Range and Low renormalization is done based on the bin and original state
+ *  3. After renorm, the bitstream is updated (if required)
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ctxt_index
+ * index of cabac context model containing pState[bits6-1] | MPS[bit0]
+ *
+ * @param[in] ps_bitstrm
+ * Bitstream context
+ *
+ * @return decoded bin (boolean)
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bin(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 ctxt_index
+
+ )
+{
+ UWORD32 u4_range = ps_cabac->u4_range;
+ UWORD32 u4_ofst = ps_cabac->u4_ofst;
+ UWORD32 u4_rlps;
+ UWORD32 u4_bin;
+ UWORD8 *pu1_ctxt_model = &ps_cabac->au1_ctxt_models[ctxt_index];
+ WORD32 state_mps = *pu1_ctxt_model;
+#if FULLRANGE
+ WORD32 clz;
+#endif
+ UWORD32 u4_qnt_range;
+
+ /* Sanity checks */
+ ASSERT(u4_range >= 256);
+ ASSERT((ctxt_index >= 0) && (ctxt_index < IHEVC_CAB_CTXT_END));
+ ASSERT(state_mps < 128);
+#if FULLRANGE
+ clz = CLZ(u4_range);
+ clz -= (32 - RANGE_NUMBITS);
+ u4_qnt_range = u4_range << clz;
+ u4_qnt_range = (u4_qnt_range >> (RANGE_SHIFT + 6)) & 0x3;
+#else
+ u4_qnt_range = (u4_range >> 6) & 0x3;
+#endif
+ /* Get the lps range from LUT based on quantized range and state */
+ u4_rlps = gau1_ihevc_cabac_rlps[state_mps >> 1][u4_qnt_range];
+#if FULLRANGE
+ u4_rlps = u4_rlps << (RANGE_SHIFT - clz);
+#endif
+ u4_range -= u4_rlps;
+
+ u4_bin = state_mps & 1;
+
+ if(u4_ofst >= u4_range)
+ {
+ u4_bin = 1 - u4_bin;
+ u4_ofst -= u4_range;
+ u4_range = u4_rlps;
+ }
+
+ *pu1_ctxt_model = gau1_ihevc_next_state[(state_mps << 1) | u4_bin];
+
+ /*****************************************************************/
+ /* Re-normalization; calculate bits generated based on range(R) */
+ /*****************************************************************/
+ if(u4_range < (1 << 8))
+ {
+ UWORD32 u4_bits;
+ WORD32 numbits;
+ numbits = CLZ(u4_range);
+ numbits -= (32 - RANGE_NUMBITS);
+#if !FULLRANGE
+ numbits -= RANGE_SHIFT;
+#endif
+ BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbits);
+
+ u4_ofst <<= numbits;
+ u4_ofst |= u4_bits;
+ u4_range <<= numbits;
+
+ }
+ /* Update the cabac context */
+ ps_cabac->u4_range = u4_range;
+ ps_cabac->u4_ofst = u4_ofst;
+ DEBUG_RANGE_OFST("bin", ps_cabac->u4_range, ps_cabac->u4_ofst);
+
+ return (u4_bin);
+
+
+}
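+
+/* Illustrative trace of the routine above (FULLRANGE = 0 case): with
+ * u4_range = 320 and *pu1_ctxt_model = 0x21 (pState 16, MPS 1),
+ * u4_qnt_range = (320 >> 6) & 0x3 = 1 picks u4_rlps from the LUT; if
+ * u4_ofst < 320 - u4_rlps the MPS (1) is decoded and the range shrinks to
+ * 320 - u4_rlps, otherwise the LPS (0) is decoded and the range becomes
+ * u4_rlps, which is below 256 and triggers renormalization.
+ */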
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a bypass bin (equi-probable 0 / 1)
+ *
+ * @par Description
+ *  Decodes a bypass bin as per Section 9.3.3.2.3
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ * Bitstream context
+ *
+ * @return Decoded bypass bin
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bin(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm)
+{
+
+ UWORD32 u4_bin;
+ UWORD32 u4_range = ps_cabac->u4_range;
+ UWORD32 u4_ofst = ps_cabac->u4_ofst;
+ UWORD32 u4_bits;
+
+ /* Sanity checks */
+ ASSERT(u4_range >= 256);
+
+ BIT_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word);
+
+ u4_ofst <<= 1;
+ u4_ofst |= u4_bits;
+
+ u4_bin = 0;
+ if(u4_ofst >= u4_range)
+ {
+ u4_bin = 1;
+ u4_ofst -= u4_range;
+ }
+
+ /* Update the cabac context */
+ ps_cabac->u4_ofst = u4_ofst;
+ DEBUG_RANGE_OFST("bypass end", ps_cabac->u4_range, ps_cabac->u4_ofst);
+ return (u4_bin);
+}
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a terminate bin (1:terminate 0:do not terminate)
+ *
+ * @par Description
+ * Decodes a terminate bin to be called for end_of_slice_flag and pcm_flag
+ * as per Section : 9.3.3.2.4
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ * Bitstream context
+ *
+ * @return Decoded Bin to indicate whether to terminate or not
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_terminate(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm)
+{
+ UWORD32 u4_range = ps_cabac->u4_range;
+ UWORD32 u4_ofst = ps_cabac->u4_ofst;
+ UWORD32 u4_bin;
+#if FULLRANGE
+ WORD32 clz;
+#endif
+ /* Sanity checks */
+ ASSERT(u4_range >= 256);
+#if FULLRANGE
+ clz = CLZ(u4_range);
+ clz -= (32 - RANGE_NUMBITS);
+ u4_range -= 2 << (RANGE_SHIFT - clz);
+#else
+ u4_range -= 2;
+#endif
+
+ if(u4_ofst >= u4_range)
+ {
+ u4_bin = 1;
+
+#if FULLRANGE
+        /* In case of FULLRANGE, extra bits read earlier need to be pushed back to the bitstream */
+ {
+ WORD32 clz;
+ WORD32 numbits;
+ clz = CLZ(ps_cabac->u4_range);
+
+ numbits = (32 - clz);
+ numbits -= 9;
+
+ ihevcd_bits_seek(ps_bitstrm, -numbits);
+ }
+#endif
+
+ }
+ else
+ {
+ u4_bin = 0;
+ }
+ if(0 == u4_bin)
+ {
+ UWORD32 u4_bits;
+ WORD32 numbits;
+ numbits = CLZ(u4_range);
+ numbits -= (32 - RANGE_NUMBITS);
+#if !FULLRANGE
+ numbits -= RANGE_SHIFT;
+#endif
+ /* Renormalize if required */
+ if(numbits)
+ {
+ BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbits);
+
+ u4_ofst <<= numbits;
+ u4_ofst |= u4_bits;
+ u4_range <<= numbits;
+ }
+ }
+    /* Update the cabac context */
+ ps_cabac->u4_range = u4_range;
+ ps_cabac->u4_ofst = u4_ofst;
+ DEBUG_RANGE_OFST("term", ps_cabac->u4_range, ps_cabac->u4_ofst);
+
+ return (u4_bin);
+}
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes multiple bypass bins (each equi-probable 0 / 1)
+ *
+ * @par Description
+ *  Decodes bypass bins as per Section 9.3.3.2.3
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ * Bitstream context
+ *
+ * @param[in] numbins
+ *  Number of bins to be decoded
+ *
+ * @return   Decoded bypass bins packed into a word
+ *
+ * @remarks Tested only for numbins less than 17
+ *
+ ******************************************************************************
+ */
+
+UWORD32 ihevcd_cabac_decode_bypass_bins(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 numbins)
+{
+ UWORD32 u4_bins;
+
+
+ UWORD32 u4_range = ps_cabac->u4_range;
+ UWORD32 u4_ofst = ps_cabac->u4_ofst;
+ UWORD32 u4_bits;
+ ASSERT(u4_range >= 256);
+ ASSERT(numbins > 0);
+
+ /* Sanity checks */
+ ASSERT(numbins < 17);
+
+ u4_bins = 0;
+
+ BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbins);
+
+ do
+ {
+ UWORD32 u4_bit;
+ numbins--;
+ u4_bit = (u4_bits >> numbins) & 1;
+ u4_ofst <<= 1;
+ u4_ofst |= u4_bit;
+
+ u4_bins <<= 1;
+ if(u4_ofst >= u4_range)
+ {
+ u4_bins += 1;
+ u4_ofst -= u4_range;
+ }
+ }while(numbins);
+
+ /* Update the cabac context */
+ ps_cabac->u4_ofst = u4_ofst;
+ DEBUG_RANGE_OFST("bypass", ps_cabac->u4_range, ps_cabac->u4_ofst);
+ return (u4_bins);
+}
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a truncated unary symbol associated with context model(s)
+ *
+ * @par Description
+ * Decodes symbols coded with TUnary binarization as per sec 9.3.2.2
+ * This is used for computing symbols like qp_delta,
+ * last_sig_coeff_prefix_x, last_sig_coeff_prefix_y.
+ *
+ *  The context model associated with each bin is computed as:
+ * current bin context = "base context idx" + (bin_idx >> shift)
+ * where
+ * 1. "base context idx" is the base index for the syntax element
+ * 2. "bin_idx" is the current bin index of the unary code
+ * 3. "shift" is the shift factor associated with this syntax element
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ * Bitstream context
+ *
+ * @param[in] c_max
+ * maximum value of sym (required for tunary binarization)
+ *
+ * @param[in] ctxt_index
+ * base context model index for this syntax element
+ *
+ * @param[in] ctxt_shift
+ * shift factor for context increments associated with this syntax element
+ *
+ * @param[in] ctxt_inc_max
+ * max value of context increment beyond which all bins will use same ctxt
+ *
+ * @return syntax element decoded
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bins_tunary(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 c_max,
+ WORD32 ctxt_index,
+ WORD32 ctxt_shift,
+ WORD32 ctxt_inc_max)
+{
+ UWORD32 u4_sym;
+ WORD32 bin;
+
+ /* Sanity checks */
+ ASSERT(c_max > 0);
+ ASSERT((ctxt_index >= 0) && (ctxt_index < IHEVC_CAB_CTXT_END));
+ ASSERT((ctxt_index + (c_max >> ctxt_shift)) < IHEVC_CAB_CTXT_END);
+
+ u4_sym = 0;
+ do
+ {
+ WORD32 bin_index;
+ bin_index = ctxt_index + MIN((u4_sym >> ctxt_shift), ctxt_inc_max);
+ IHEVCD_CABAC_DECODE_BIN(bin, ps_cabac, ps_bitstrm, bin_index);
+ u4_sym++;
+ }while(((WORD32)u4_sym < c_max) && bin);
+
+ u4_sym = u4_sym - 1 + bin;
+
+ return (u4_sym);
+}
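+
+/* Illustrative trace of the routine above: with c_max = 4, ctxt_shift = 0 and
+ * decoded bins 1, 1, 0 the loop exits with u4_sym = 3 and bin = 0, giving
+ * u4_sym = 3 - 1 + 0 = 2; if all four bins were 1 the loop would stop at
+ * c_max, giving u4_sym = 4 - 1 + 1 = 4.
+ */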
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a syntax element as truncated unary bypass bins
+ *
+ * @par Description
+ * Decodes symbols coded with TUnary binarization as per sec 9.3.2.2
+ * These symbols are coded as bypass bins
+ * This is used for computing symbols like merge_idx,
+ * mpm_idx etc
+ *
+ * @param[in,out]ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ * Bitstream context
+ *
+ * @param[in] c_max
+ * maximum value of sym (required for tunary binarization)
+ *
+ * @return syntax element decoded
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bins_tunary(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 c_max)
+{
+
+ UWORD32 u4_sym;
+ WORD32 bin;
+ UWORD32 u4_ofst = ps_cabac->u4_ofst;
+ UWORD32 u4_range = ps_cabac->u4_range;
+ UWORD32 u4_bits;
+ /* Sanity checks */
+ ASSERT(c_max > 0);
+ ASSERT(u4_range >= 256);
+ u4_sym = 0;
+ BITS_NXT(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, (UWORD32)c_max);
+ u4_bits <<= (32 - c_max);
+ do
+ {
+ u4_ofst <<= 1;
+ u4_ofst |= (u4_bits >> 31);
+ u4_bits <<= 1;
+
+ bin = 0;
+ if(u4_ofst >= u4_range)
+ {
+ bin = 1;
+ u4_ofst -= u4_range;
+ }
+ u4_sym++;
+ }while(((WORD32)u4_sym < c_max) && bin);
+ BITS_FLUSH(ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst,
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, u4_sym);
+
+ u4_sym = u4_sym - 1 + bin;
+ /* Update the cabac context */
+ ps_cabac->u4_ofst = u4_ofst;
+
+ return (u4_sym);
+}
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a syntax element as kth order Exp-Golomb code (EGK)
+ *
+ * @par Description
+ * Decodes a syntax element binarized as kth order Exp-Golomb code (EGK)
+ * Elements are coded as bypass bins
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ *  Bitstream context
+ *
+ * @param[in] k
+ * order of EGk
+ *
+ * @return      decoded syntax element
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bins_egk(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 k)
+{
+
+ UWORD32 u4_sym;
+ WORD32 numones;
+ WORD32 bin;
+
+ /* Sanity checks */
+ ASSERT((k >= 0));
+
+ numones = k;
+ bin = 1;
+ u4_sym = 0;
+ while(bin)
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BIN(bin, ps_cabac, ps_bitstrm);
+ u4_sym += bin << numones++;
+ }
+
+ numones -= 1;
+ numones = CLIP3(numones, 0, 16);
+
+ if(numones)
+ {
+ UWORD32 u4_suffix;
+
+ IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, numones);
+ u4_sym += u4_suffix;
+ }
+ return (u4_sym);
+}
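+
+/* Illustrative trace of the routine above for k = 0: prefix bins 1, 1, 0 give
+ * u4_sym = (1 << 0) + (1 << 1) = 3 and numones = 2 after the decrement, and a
+ * 2-bit suffix of binary 01 then yields u4_sym = 3 + 1 = 4.
+ */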
+
+/**
+ ******************************************************************************
+ *
+ * @brief Decodes a syntax element as truncated rice code (TR)
+ *
+ * @par Description
+ * Decodes a syntax element as truncated rice code (TR)
+ * Elements are coded as bypass bins
+ *  This function is used for coeff_abs_level_remaining decoding when
+ *  the level is less than c_rice_max
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @param[in] ps_bitstrm
+ *  Bitstream context
+ *
+ * @param[in] c_rice_param
+ * shift factor for truncated unary prefix coding of (u4_sym >> c_rice_param)
+ *
+ * @param[in] c_rice_max
+ * max symbol val below which a suffix is coded as (u4_sym%(1<<c_rice_param))
+ * This is currently (4 << c_rice_param) for coeff_abs_level_remaining
+ *
+ * @return      decoded syntax element
+ *
+ ******************************************************************************
+ */
+UWORD32 ihevcd_cabac_decode_bypass_bins_trunc_rice(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 c_rice_param,
+ WORD32 c_rice_max)
+{
+ UWORD32 u4_sym;
+ WORD32 bin;
+ WORD32 c_max;
+ UWORD32 u4_suffix;
+ /* Sanity checks */
+ ASSERT((c_rice_param >= 0));
+
+
+ /* Decode prefix coded as TUnary */
+ c_max = c_rice_max >> c_rice_param;
+ u4_sym = 0;
+ do
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BIN(bin, ps_cabac, ps_bitstrm);
+ u4_sym++;
+
+ }while(((WORD32)u4_sym < c_max) && bin);
+ u4_sym = u4_sym - 1 + bin;
+
+ /* If suffix is present, then decode c_rice_param number of bins */
+ if(c_rice_param)
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, c_rice_param);
+
+ u4_sym = (u4_sym << c_rice_param) | u4_suffix;
+ }
+ return (u4_sym);
+}
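+
+/* Illustrative trace of the routine above: with c_rice_param = 1 and
+ * c_rice_max = 8, c_max = 4; prefix bins 1, 0 decode a prefix of 1, and a
+ * 1-bit suffix of 1 yields u4_sym = (1 << 1) | 1 = 3.
+ */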
+#if 0
+/**
+ ******************************************************************************
+ *
+ * @brief Flushes the cabac decoder engine as per section 9.3.4 figure 9-12
+ *
+ * @par Description
+ *
+ *
+ * @param[in,out] ps_cabac
+ * pointer to cabac context (handle)
+ *
+ * @return success or failure error code
+ *
+ ******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_cabac_flush(cab_ctxt_t *ps_cabac)
+{
+ UWORD32 u4_ofst = ps_cabac->u4_ofst;
+ UWORD32 u4_bits_gen = ps_cabac->u4_bits_gen;
+
+ UWORD8 *pu1_strm_buf = ps_cabac->pu1_strm_buffer;
+ UWORD32 u4_strm_buf_offset = ps_cabac->u4_strm_buf_offset;
+ WORD32 zero_run = ps_cabac->i4_zero_bytes_run;
+ UWORD32 u4_out_standing_bytes = ps_cabac->u4_out_standing_bytes;
+
+ /************************************************************************/
+    /* Insert the carry (propagated in previous byte) along with            */
+ /* outstanding bytes (if any) and flush remaining bits */
+ /************************************************************************/
+ //TODO: Review this function
+ {
+        /* carry = 1 => putbit(1); carry propagated due to L renorm */
+ WORD32 carry = (u4_ofst >> (u4_bits_gen + CABAC_BITS)) & 0x1;
+ WORD32 last_byte;
+ WORD32 bits_left;
+ WORD32 rem_bits;
+
+ /*********************************************************************/
+ /* Bitstream overflow check */
+ /* NOTE: corner case of epb bytes (max 2 for 32bit word) not handled */
+ /*********************************************************************/
+ if((u4_strm_buf_offset + u4_out_standing_bytes + 1)
+ >= ps_cabac->u4_max_strm_size)
+ {
+ /* return without corrupting the buffer beyond its size */
+ return (IHEVCD_BITSTREAM_BUFFER_OVERFLOW);
+ }
+
+ if(carry)
+ {
+ /* previous byte carry add will not result in overflow to */
+ /* u4_strm_buf_offset - 2 as we track 0xff as outstanding bytes */
+ pu1_strm_buf[u4_strm_buf_offset - 1] += carry;
+ zero_run = 0;
+ }
+
+ /* Insert outstanding bytes (if any) */
+ while(u4_out_standing_bytes)
+ {
+ UWORD8 u1_0_or_ff = carry ? 0 : 0xFF;
+
+ PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, u1_0_or_ff, zero_run);
+
+ u4_out_standing_bytes--;
+ }
+
+ /* clear the carry in low */
+ u4_ofst &= ((1 << (u4_bits_gen + CABAC_BITS)) - 1);
+
+ /* extract the remaining bits; */
+        /* includes additional msb 2 bits of low as per Figure 9-12  */
+ bits_left = u4_bits_gen + 2;
+ rem_bits = (u4_ofst >> (u4_bits_gen + CABAC_BITS - bits_left));
+
+ if(bits_left >= 8)
+ {
+ last_byte = (rem_bits >> (bits_left - 8)) & 0xFF;
+ PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, last_byte, zero_run);
+ bits_left -= 8;
+ }
+
+ /* insert last byte along with rbsp stop bit(1) and 0's in the end */
+ last_byte = (rem_bits << (8 - bits_left)) | (1 << (bits_left - 1));
+ last_byte &= 0xFF;
+ PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, last_byte, zero_run);
+
+ /* update the state variables and return success */
+ ps_cabac->u4_strm_buf_offset = u4_strm_buf_offset;
+ ps_cabac->i4_zero_bytes_run = zero_run;
+ return (IHEVCD_SUCCESS);
+ }
+}
+#endif
diff --git a/decoder/ihevcd_cabac.h b/decoder/ihevcd_cabac.h
new file mode 100644
index 0000000..2c4a543
--- /dev/null
+++ b/decoder/ihevcd_cabac.h
@@ -0,0 +1,286 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+******************************************************************************
+* @file ihevcd_cabac.h
+*
+* @brief
+* This file contains decoder cabac engine related structures and
+* interface prototypes
+*
+* @author
+* Ittiam
+******************************************************************************
+*/
+
+#ifndef _IHEVCD_CABAC_H_
+#define _IHEVCD_CABAC_H_
+
+#include "ihevc_typedefs.h"
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+/**
+*******************************************************************************
+@brief  Constants and macros used by the decoder CABAC engine
+*******************************************************************************
+ */
+#define CABAC_BITS 9
+
+/**
+ * Following definitions control whether cabac functions are inlined as macros or
+ * are called as functions. Set these to 0 to debug cabac leaf level functions
+ * Note these macros assume FULLRANGE is 1.
+ */
+#define CABAC_DECODE_BIN 1
+#define CABAC_DECODE_BYPASS_BIN 1
+#define CABAC_DECODE_BYPASS_BINS 1
+
+/*****************************************************************************/
+/* Function Macros */
+/*****************************************************************************/
+#if CABAC_DECODE_BIN
+#define IHEVCD_CABAC_DECODE_BIN(u4_bin, ps_cabac, ps_bitstrm, ctxt_index) \
+{ \
+ UWORD32 u4_range = ps_cabac->u4_range; \
+ UWORD32 u4_ofst = ps_cabac->u4_ofst; \
+ UWORD32 u4_rlps; \
+ UWORD8 *pu1_ctxt_model = &ps_cabac->au1_ctxt_models[ctxt_index]; \
+ WORD32 state_mps = *pu1_ctxt_model; \
+ WORD32 clz; \
+ UWORD32 u4_qnt_range; \
+ \
+ /* Sanity checks */ \
+ ASSERT(FULLRANGE == 1); \
+ ASSERT(u4_range >= 256); \
+ ASSERT((ctxt_index >= 0) && (ctxt_index < IHEVC_CAB_CTXT_END)); \
+ ASSERT(state_mps < 128); \
+ clz = CLZ(u4_range); \
+ clz -= (32 - RANGE_NUMBITS); \
+ u4_qnt_range = u4_range << clz; \
+ u4_qnt_range = (u4_qnt_range >> (RANGE_SHIFT + 6)) & 0x3; \
+ /* Get the lps range from LUT based on quantized range and state */ \
+ u4_rlps = gau1_ihevc_cabac_rlps[state_mps >> 1][u4_qnt_range]; \
+ u4_rlps = u4_rlps << (RANGE_SHIFT - clz); \
+ u4_range -= u4_rlps; \
+ \
+ u4_bin = state_mps & 1; \
+ \
+ if(u4_ofst >= u4_range) \
+ { \
+ u4_bin = 1 - u4_bin; \
+ u4_ofst -= u4_range; \
+ u4_range = u4_rlps; \
+ } \
+ \
+ *pu1_ctxt_model = gau1_ihevc_next_state[(state_mps << 1) | u4_bin]; \
+ \
+ /*****************************************************************/ \
+ /* Re-normalization; calculate bits generated based on range(R) */ \
+ /*****************************************************************/ \
+ if(u4_range < (1 << 8)) \
+ { \
+ UWORD32 u4_bits; \
+ WORD32 numbits; \
+ numbits = CLZ(u4_range); \
+ numbits -= (32 - RANGE_NUMBITS); \
+ BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst, \
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbits); \
+ \
+ u4_ofst <<= numbits; \
+ u4_ofst |= u4_bits; \
+ u4_range <<= numbits; \
+ \
+ } \
+ /* Update the cabac context */ \
+ ps_cabac->u4_range = u4_range; \
+ ps_cabac->u4_ofst = u4_ofst; \
+ \
+}
+#else
+#define IHEVCD_CABAC_DECODE_BIN(u4_bin, ps_cabac, ps_bitstrm, ctxt_index) \
+ u4_bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_index);
+#endif
+
+#if CABAC_DECODE_BYPASS_BIN
+#define IHEVCD_CABAC_DECODE_BYPASS_BIN(u4_bin, ps_cabac, ps_bitstrm) \
+{ \
+ \
+ UWORD32 u4_range = ps_cabac->u4_range; \
+ UWORD32 u4_ofst = ps_cabac->u4_ofst; \
+ UWORD32 u4_bits; \
+ \
+ /* Sanity checks */ \
+ ASSERT(FULLRANGE == 1); \
+ ASSERT(u4_range >= 256); \
+ \
+ BIT_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst, \
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word); \
+ \
+ u4_ofst <<= 1; \
+ u4_ofst |= u4_bits; \
+ \
+ u4_bin = 0; \
+ if(u4_ofst >= u4_range) \
+ { \
+ u4_bin = 1; \
+ u4_ofst -= u4_range; \
+ } \
+ \
+ /* Update the cabac context */ \
+ ps_cabac->u4_ofst = u4_ofst; \
+}
+#else
+
+#define IHEVCD_CABAC_DECODE_BYPASS_BIN(u4_bin, ps_cabac, ps_bitstrm) \
+ u4_bin = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+#endif
+
+#if CABAC_DECODE_BYPASS_BINS
+#define IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_bins, ps_cabac, ps_bitstrm, numbins) \
+{ \
+ UWORD32 u4_range = ps_cabac->u4_range; \
+ UWORD32 u4_ofst = ps_cabac->u4_ofst; \
+ UWORD32 u4_bits; \
+ ASSERT(FULLRANGE == 1); \
+ ASSERT(u4_range >= 256); \
+ ASSERT(numbins > 0); \
+ { \
+ WORD32 numbins_tmp = numbins; \
+ /* Sanity checks */ \
+ ASSERT(numbins < 17); \
+ \
+ u4_bins = 0; \
+ \
+ BITS_GET(u4_bits, ps_bitstrm->pu4_buf, ps_bitstrm->u4_bit_ofst, \
+ ps_bitstrm->u4_cur_word, ps_bitstrm->u4_nxt_word, numbins); \
+ do \
+ { \
+ UWORD32 u4_bit; \
+ numbins_tmp--; \
+ u4_bit = (u4_bits >> numbins_tmp) & 1; \
+ u4_ofst <<= 1; \
+ u4_ofst |= u4_bit; \
+ \
+ u4_bins <<= 1; \
+ if(u4_ofst >= u4_range) \
+ { \
+ u4_bins += 1; \
+ u4_ofst -= u4_range; \
+ } \
+ }while(numbins_tmp); \
+ \
+ /* Update the cabac context */ \
+ ps_cabac->u4_ofst = u4_ofst; \
+ } \
+}
+
+
+#else
+
+#define IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_bins, ps_cabac, ps_bitstrm, numbins) \
+ u4_bins = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, numbins);
+
+#endif
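+
+/* Illustrative usage sketch: decoding a single flag with the inlined macro.
+ * SOME_CTXT_IDX is a placeholder, not a real context index; a valid index
+ * satisfies 0 <= idx < IHEVC_CAB_CTXT_END.
+ */
+#if 0
+{
+    UWORD32 u4_flag;
+    IHEVCD_CABAC_DECODE_BIN(u4_flag, ps_cabac, ps_bitstrm, SOME_CTXT_IDX);
+    if(u4_flag)
+    {
+        /* parse the dependent syntax elements */
+    }
+}
+#endif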
+/*****************************************************************************/
+/* Structures */
+/*****************************************************************************/
+
+
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+IHEVCD_ERROR_T ihevcd_cabac_init
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 slice_qp,
+ WORD32 cabac_init_idc,
+ const UWORD8 *pu1_init_ctxt
+);
+
+
+
+UWORD32 ihevcd_cabac_decode_bin
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 ctxt_index
+);
+
+UWORD32 ihevcd_cabac_decode_bypass_bin
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm
+);
+
+UWORD32 ihevcd_cabac_decode_terminate
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm
+);
+
+UWORD32 ihevcd_cabac_decode_bypass_bins
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 num_bins
+);
+
+UWORD32 ihevcd_cabac_decode_bins_tunary
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 c_max,
+ WORD32 ctxt_index,
+ WORD32 ctxt_shift,
+ WORD32 ctxt_inc_max
+
+);
+
+UWORD32 ihevcd_cabac_decode_bypass_bins_tunary
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 c_max
+
+);
+
+UWORD32 ihevcd_cabac_decode_bypass_bins_egk
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 k
+);
+
+UWORD32 ihevcd_cabac_decode_bypass_bins_trunc_rice
+(
+ cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm,
+ WORD32 c_rice_param,
+ WORD32 c_rice_max
+);
+
+IHEVCD_ERROR_T ihevcd_cabac_flush(cab_ctxt_t *ps_cabac);
+
+IHEVCD_ERROR_T ihevcd_cabac_reset(cab_ctxt_t *ps_cabac,
+ bitstrm_t *ps_bitstrm);
+
+#endif /* _IHEVCD_CABAC_H_ */
diff --git a/decoder/ihevcd_common_tables.c b/decoder/ihevcd_common_tables.c
new file mode 100644
index 0000000..1f6065b
--- /dev/null
+++ b/decoder/ihevcd_common_tables.c
@@ -0,0 +1,49 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_common_tables.c
+ *
+ * @brief
+ * Contains common global tables for decoder
+ *
+ * @author
+ * Naveen S R
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include "ihevc_typedefs.h"
+#include "ihevcd_common_tables.h"
+#include "ihevc_defs.h"
+
+WORD16 gai2_ihevcd_chroma_qp[] =
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29,
+ 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 };
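+
+/* The mapping above follows Table 8-10 of the HEVC spec: identity for
+ * qPi < 30, a compressed mapping for 30 <= qPi <= 43, and qPi - 6 above that */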
+
+const UWORD8 gau1_intra_pred_chroma_modes[] =
+ { INTRA_PLANAR, INTRA_ANGULAR(26), INTRA_ANGULAR(10), INTRA_DC };
+
diff --git a/decoder/ihevcd_common_tables.h b/decoder/ihevcd_common_tables.h
new file mode 100644
index 0000000..61bc93f
--- /dev/null
+++ b/decoder/ihevcd_common_tables.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/* */
+/* File Name : ihevcd_common_tables.h */
+/* */
+/* Description : Common tables */
+/* */
+/* List of Functions : None */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 07 09 2012 Harish Initial Version */
+/*****************************************************************************/
+
+#ifndef _IHEVCD_COMMON_TABLES_H_
+#define _IHEVCD_COMMON_TABLES_H_
+
+extern WORD16 gai2_ihevcd_chroma_qp[];
+
+extern const UWORD8 gau1_intra_pred_chroma_modes[];
+
+
+#endif /*_IHEVCD_COMMON_TABLES_H_*/
diff --git a/decoder/ihevcd_cxa.h b/decoder/ihevcd_cxa.h
new file mode 100644
index 0000000..be241c0
--- /dev/null
+++ b/decoder/ihevcd_cxa.h
@@ -0,0 +1,1098 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_cxa.h
+*
+* @brief
+* This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface(API) of the
+* Ittiam HEVC decoder on Cortex Ax
+*
+* @author
+* Harish
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef __IHEVCD_CXA_H__
+#define __IHEVCD_CXA_H__
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "iv.h"
+#include "ivd.h"
+
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Function Macros */
+/*****************************************************************************/
+#define IS_IVD_CONCEALMENT_APPLIED(x) (x & (1 << IVD_APPLIEDCONCEALMENT))
+#define IS_IVD_INSUFFICIENTDATA_ERROR(x) (x & (1 << IVD_INSUFFICIENTDATA))
+#define IS_IVD_CORRUPTEDDATA_ERROR(x) (x & (1 << IVD_CORRUPTEDDATA))
+#define IS_IVD_CORRUPTEDHEADER_ERROR(x) (x & (1 << IVD_CORRUPTEDHEADER))
+#define IS_IVD_UNSUPPORTEDINPUT_ERROR(x) (x & (1 << IVD_UNSUPPORTEDINPUT))
+#define IS_IVD_UNSUPPORTEDPARAM_ERROR(x) (x & (1 << IVD_UNSUPPORTEDPARAM))
+#define IS_IVD_FATAL_ERROR(x) (x & (1 << IVD_FATALERROR))
+#define IS_IVD_INVALID_BITSTREAM_ERROR(x) (x & (1 << IVD_INVALID_BITSTREAM))
+#define IS_IVD_INCOMPLETE_BITSTREAM_ERROR(x) (x & (1 << IVD_INCOMPLETE_BITSTREAM))
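+
+/* Illustrative usage sketch: s_video_decode_op is a placeholder output
+ * structure filled by a decode call; u4_error_code is its error field.
+ */
+#if 0
+if(IS_IVD_FATAL_ERROR(s_video_decode_op.s_ivd_video_decode_op_t.u4_error_code))
+{
+    /* tear down and re-create the decoder instance */
+}
+#endif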
+
+
+/*****************************************************************************/
+/* API Function Prototype */
+/*****************************************************************************/
+IV_API_CALL_STATUS_T ihevcd_cxa_api_function(iv_obj_t *ps_handle,
+ void *pv_api_ip,
+ void *pv_api_op);
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+/* Codec Error codes for HEVC Decoder */
+
+
+typedef enum {
+ /**
+ * No error
+ */
+ IHEVCD_SUCCESS = 0,
+
+ /**
+ * Codec calls done without successful init
+ */
+ IHEVCD_INIT_NOT_DONE = IVD_DUMMY_ELEMENT_FOR_CODEC_EXTENSIONS,
+
+
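+    /**
+     * Number of frame buffers is not sufficient
+     * (reported after decoding the video header)
+     */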
+ IHEVCD_CXA_VID_HDR_DEC_NUM_FRM_BUF_NOT_SUFFICIENT,
+
+ /**
+ * Unsupported level passed as an argument
+ */
+ IHEVCD_LEVEL_UNSUPPORTED,
+ /**
+ * Unsupported number of reference pictures passed as an argument
+ */
+ IHEVCD_NUM_REF_UNSUPPORTED,
+ /**
+ * Unsupported number of reorder pictures passed as an argument
+ */
+ IHEVCD_NUM_REORDER_UNSUPPORTED,
+ /**
+ * Unsupported number of extra display pictures passed as an argument
+ */
+ IHEVCD_NUM_EXTRA_DISP_UNSUPPORTED,
+ /**
+ * Invalid display stride requested.
+ */
+ IHEVCD_INVALID_DISP_STRD,
+
+ /**
+ * Reached end of sequence
+ */
+ IHEVCD_END_OF_SEQUENCE,
+
+ /**
+ * Width/height greater than max width and max height
+ */
+ IHEVCD_UNSUPPORTED_DIMENSIONS,
+
+ /**
+ * Buffer size to hold version string is not sufficient
+ * Allocate more to hold version string
+ */
+ IHEVCD_CXA_VERS_BUF_INSUFFICIENT,
+ /**
+ * Stream chroma format other than YUV420
+ */
+ IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC,
+
+ /**
+ * Generic failure
+ */
+ IHEVCD_FAIL = 0x7FFFFFFF
+
+
+}IHEVCD_CXA_ERROR_CODES_T;
+
+/*****************************************************************************/
+/* Extended Structures */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Get Number of Memory Records */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_num_mem_rec_ip_t
+ */
+ iv_num_mem_rec_ip_t s_ivd_num_mem_rec_ip_t;
+}ihevcd_cxa_num_mem_rec_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_num_mem_rec_op_t
+ */
+ iv_num_mem_rec_op_t s_ivd_num_mem_rec_op_t;
+}ihevcd_cxa_num_mem_rec_op_t;
+
+
+/*****************************************************************************/
+/* Fill Memory Records */
+/*****************************************************************************/
+
+
+typedef struct {
+ /**
+ * ivd_fill_mem_rec_ip_t
+ */
+ iv_fill_mem_rec_ip_t s_ivd_fill_mem_rec_ip_t;
+
+ /**
+ * level
+ */
+ WORD32 i4_level;
+
+ /**
+ * num_reorder_frames
+ */
+ UWORD32 u4_num_reorder_frames;
+
+ /**
+ * num_ref_frames
+ */
+ UWORD32 u4_num_ref_frames;
+
+ /**
+ * share_disp_buf
+ */
+ UWORD32 u4_share_disp_buf;
+
+ /**
+ * format in which codec has to give out frame data for display
+ */
+ IV_COLOR_FORMAT_T e_output_format;
+
+ /**
+ * Number of extra display buffers that will be allocated to handle display pipeline depth
+ */
+ UWORD32 u4_num_extra_disp_buf;
+
+}ihevcd_cxa_fill_mem_rec_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_fill_mem_rec_op_t
+ */
+
+ iv_fill_mem_rec_op_t s_ivd_fill_mem_rec_op_t;
+
+}ihevcd_cxa_fill_mem_rec_op_t;
+
+/*****************************************************************************/
+/* Retrieve Memory Records */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_retrieve_mem_rec_ip_t
+ */
+ iv_retrieve_mem_rec_ip_t s_ivd_retrieve_mem_rec_ip_t;
+}ihevcd_cxa_retrieve_mem_rec_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_retrieve_mem_rec_op_t
+ */
+ iv_retrieve_mem_rec_op_t s_ivd_retrieve_mem_rec_op_t;
+}ihevcd_cxa_retrieve_mem_rec_op_t;
+
+
+/*****************************************************************************/
+/* Initialize decoder */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_init_ip_t
+ */
+ ivd_init_ip_t s_ivd_init_ip_t;
+
+ /**
+ * level
+ */
+ WORD32 i4_level;
+
+ /**
+ * num_reorder_frames
+ */
+ UWORD32 u4_num_reorder_frames;
+
+ /**
+ * num_ref_frames
+ */
+ UWORD32 u4_num_ref_frames;
+
+ /**
+ * share_disp_buf
+ */
+ UWORD32 u4_share_disp_buf;
+
+ /**
+ * Number of extra display buffers that will be allocated to handle display pipeline depth
+ */
+ UWORD32 u4_num_extra_disp_buf;
+}ihevcd_cxa_init_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_init_op_t
+ */
+ ivd_init_op_t s_ivd_init_op_t;
+}ihevcd_cxa_init_op_t;
+
+
+/*****************************************************************************/
+/* Video Decode */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_video_decode_ip_t
+ */
+ ivd_video_decode_ip_t s_ivd_video_decode_ip_t;
+}ihevcd_cxa_video_decode_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_video_decode_op_t
+ */
+ ivd_video_decode_op_t s_ivd_video_decode_op_t;
+}ihevcd_cxa_video_decode_op_t;
+
+
+/*****************************************************************************/
+/* Get Display Frame */
+/*****************************************************************************/
+
+
+typedef struct
+{
+ /**
+ * ivd_get_display_frame_ip_t
+ */
+ ivd_get_display_frame_ip_t s_ivd_get_display_frame_ip_t;
+}ihevcd_cxa_get_display_frame_ip_t;
+
+
+typedef struct
+{
+ /**
+ * ivd_get_display_frame_op_t
+ */
+ ivd_get_display_frame_op_t s_ivd_get_display_frame_op_t;
+}ihevcd_cxa_get_display_frame_op_t;
+
+/*****************************************************************************/
+/* Set Display Frame */
+/*****************************************************************************/
+
+
+typedef struct
+{
+ /**
+ * ivd_set_display_frame_ip_t
+ */
+ ivd_set_display_frame_ip_t s_ivd_set_display_frame_ip_t;
+}ihevcd_cxa_set_display_frame_ip_t;
+
+
+typedef struct
+{
+ /**
+ * ivd_set_display_frame_op_t
+ */
+ ivd_set_display_frame_op_t s_ivd_set_display_frame_op_t;
+}ihevcd_cxa_set_display_frame_op_t;
+
+/*****************************************************************************/
+/* Release Display Buffers */
+/*****************************************************************************/
+
+
+typedef struct
+{
+ /**
+ * ivd_rel_display_frame_ip_t
+ */
+
+ ivd_rel_display_frame_ip_t s_ivd_rel_display_frame_ip_t;
+}ihevcd_cxa_rel_display_frame_ip_t;
+
+
+typedef struct
+{
+ /**
+ * ivd_rel_display_frame_op_t
+ */
+ ivd_rel_display_frame_op_t s_ivd_rel_display_frame_op_t;
+}ihevcd_cxa_rel_display_frame_op_t;
+
+
+typedef enum
+{
+ /** Set number of cores/threads to be used */
+ IHEVCD_CXA_CMD_CTL_SET_NUM_CORES = IVD_CMD_CTL_CODEC_SUBCMD_START,
+
+ /** Set processor details */
+ IHEVCD_CXA_CMD_CTL_SET_PROCESSOR = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x001,
+
+ /** Get display buffer dimensions */
+ IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x100,
+
+ /** Get VUI parameters */
+ IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x101,
+
+ /** Enable/disable GPU, supported on select platforms */
+ IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x200,
+
+ /** Set degrade level */
+ IHEVCD_CXA_CMD_CTL_DEGRADE = IVD_CMD_CTL_CODEC_SUBCMD_START + 0x300
+}IHEVCD_CXA_CMD_CTL_SUB_CMDS;
+/*****************************************************************************/
+/* Video control Flush */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_flush_ip_t
+ */
+ ivd_ctl_flush_ip_t s_ivd_ctl_flush_ip_t;
+}ihevcd_cxa_ctl_flush_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_flush_op_t
+ */
+ ivd_ctl_flush_op_t s_ivd_ctl_flush_op_t;
+}ihevcd_cxa_ctl_flush_op_t;
+
+/*****************************************************************************/
+/* Video control reset */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_reset_ip_t
+ */
+ ivd_ctl_reset_ip_t s_ivd_ctl_reset_ip_t;
+}ihevcd_cxa_ctl_reset_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_reset_op_t
+ */
+ ivd_ctl_reset_op_t s_ivd_ctl_reset_op_t;
+}ihevcd_cxa_ctl_reset_op_t;
+
+
+/*****************************************************************************/
+/* Video control Set Params */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_set_config_ip_t
+ */
+ ivd_ctl_set_config_ip_t s_ivd_ctl_set_config_ip_t;
+}ihevcd_cxa_ctl_set_config_ip_t;
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_set_config_op_t
+ */
+ ivd_ctl_set_config_op_t s_ivd_ctl_set_config_op_t;
+}ihevcd_cxa_ctl_set_config_op_t;
+
+/*****************************************************************************/
+/* Video control:Get Buf Info */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_getbufinfo_ip_t
+ */
+ ivd_ctl_getbufinfo_ip_t s_ivd_ctl_getbufinfo_ip_t;
+}ihevcd_cxa_ctl_getbufinfo_ip_t;
+
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_getbufinfo_op_t
+ */
+ ivd_ctl_getbufinfo_op_t s_ivd_ctl_getbufinfo_op_t;
+}ihevcd_cxa_ctl_getbufinfo_op_t;
+
+
+/*****************************************************************************/
+/* Video control:Getstatus Call */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_getstatus_ip_t
+ */
+ ivd_ctl_getstatus_ip_t s_ivd_ctl_getstatus_ip_t;
+}ihevcd_cxa_ctl_getstatus_ip_t;
+
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_getstatus_op_t
+ */
+ ivd_ctl_getstatus_op_t s_ivd_ctl_getstatus_op_t;
+}ihevcd_cxa_ctl_getstatus_op_t;
+
+
+/*****************************************************************************/
+/* Video control:Get Version Info */
+/*****************************************************************************/
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_getversioninfo_ip_t
+ */
+ ivd_ctl_getversioninfo_ip_t s_ivd_ctl_getversioninfo_ip_t;
+}ihevcd_cxa_ctl_getversioninfo_ip_t;
+
+
+
+typedef struct {
+
+ /**
+ * ivd_ctl_getversioninfo_op_t
+ */
+ ivd_ctl_getversioninfo_op_t s_ivd_ctl_getversioninfo_op_t;
+}ihevcd_cxa_ctl_getversioninfo_op_t;
+
+
+typedef struct {
+
+ /**
+ * u4_size
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+
+ /**
+     * Pictures that are degraded
+ * 0 : No degrade
+ * 1 : Only on non-reference frames
+ * 2 : Use interval specified by u4_nondegrade_interval
+ * 3 : All non-key frames
+ * 4 : All frames
+ */
+ WORD32 i4_degrade_pics;
+
+ /**
+ * Interval for pictures which are completely decoded without any degradation
+ */
+ WORD32 i4_nondegrade_interval;
+
+ /**
+ * bit position (lsb is zero): Type of degradation
+ * 0 : Disable SAO
+ * 1 : Disable deblocking
+ * 2 : Faster inter prediction filters
+ * 3 : Fastest inter prediction filters
+ */
+ WORD32 i4_degrade_type;
+
+}ihevcd_cxa_ctl_degrade_ip_t;
+
+typedef struct
+{
+ /**
+ * u4_size
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+}ihevcd_cxa_ctl_degrade_op_t;
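+
+/* Illustrative usage sketch: requesting degraded decode through the control
+ * API. ps_codec_obj is a placeholder decoder handle; IVD_CMD_VIDEO_CTL is
+ * assumed to come from ivd.h.
+ */
+#if 0
+{
+    ihevcd_cxa_ctl_degrade_ip_t s_ctl_ip;
+    ihevcd_cxa_ctl_degrade_op_t s_ctl_op;
+
+    s_ctl_ip.u4_size = sizeof(ihevcd_cxa_ctl_degrade_ip_t);
+    s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+    s_ctl_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_DEGRADE;
+    s_ctl_ip.i4_degrade_pics = 1;   /* degrade only non-reference frames */
+    s_ctl_ip.i4_degrade_type = 1;   /* bit 0: disable SAO */
+    s_ctl_op.u4_size = sizeof(ihevcd_cxa_ctl_degrade_op_t);
+
+    ihevcd_cxa_api_function(ps_codec_obj, &s_ctl_ip, &s_ctl_op);
+}
+#endif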
+
+typedef struct
+{
+
+ /**
+ * size
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub_cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+
+ /**
+ * num_cores
+ */
+ UWORD32 u4_num_cores;
+}ihevcd_cxa_ctl_set_num_cores_ip_t;
+
+typedef struct
+{
+
+ /**
+ * size
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+}ihevcd_cxa_ctl_set_num_cores_op_t;
+
+typedef struct
+{
+ /**
+ * size
+ */
+ UWORD32 u4_size;
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+ /**
+ * sub cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+ /**
+ * Processor type
+ */
+ UWORD32 u4_arch;
+ /**
+ * SOC type
+ */
+ UWORD32 u4_soc;
+
+ /**
+ * num_cores
+ */
+ UWORD32 u4_num_cores;
+
+}ihevcd_cxa_ctl_set_processor_ip_t;
+
+typedef struct
+{
+ /**
+ * size
+ */
+ UWORD32 u4_size;
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+}ihevcd_cxa_ctl_set_processor_op_t;
+
+typedef struct
+{
+
+ /**
+ * size
+ */
+ UWORD32 u4_size;
+
+ /**
+ * cmd
+ */
+ IVD_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * sub cmd
+ */
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+}ihevcd_cxa_ctl_get_frame_dimensions_ip_t;
+
+
+typedef struct {
+
+ /**
+ * size
+ */
+ UWORD32 u4_size;
+
+ /**
+ * error_code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * x_offset[3]
+ */
+ UWORD32 u4_x_offset[3];
+
+ /**
+ * y_offset[3]
+ */
+ UWORD32 u4_y_offset[3];
+
+ /**
+ * disp_wd[3]
+ */
+ UWORD32 u4_disp_wd[3];
+
+ /**
+ * disp_ht[3]
+ */
+ UWORD32 u4_disp_ht[3];
+
+ /**
+ * buffer_wd[3]
+ */
+ UWORD32 u4_buffer_wd[3];
+
+ /**
+ * buffer_ht[3]
+ */
+ UWORD32 u4_buffer_ht[3];
+}ihevcd_cxa_ctl_get_frame_dimensions_op_t;
+
+typedef struct {
+ UWORD32 u4_size;
+ IVD_API_COMMAND_TYPE_T e_cmd;
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+}ihevcd_cxa_ctl_get_vui_params_ip_t;
+
+typedef struct {
+ UWORD32 u4_size;
+ UWORD32 u4_error_code;
+
+ /**
+ * indicates the presence of aspect_ratio
+ */
+ UWORD8 u1_aspect_ratio_info_present_flag;
+
+ /**
+ * specifies the aspect ratio of the luma samples
+ */
+ UWORD8 u1_aspect_ratio_idc;
+
+ /**
+     * indicates the horizontal size of the sample aspect ratio (in arbitrary units)
+ */
+ UWORD16 u2_sar_width;
+
+ /**
+     * indicates the vertical size of the sample aspect ratio (in the same arbitrary units)
+ */
+ UWORD16 u2_sar_height;
+
+ /**
+ * if 1, specifies that the overscan_appropriate_flag is present
+ * if 0, the preferred display method for the video signal is unspecified
+ */
+ UWORD8 u1_overscan_info_present_flag;
+
+ /**
+ * if 1,indicates that the cropped decoded pictures output
+ * are suitable for display using overscan
+ */
+ UWORD8 u1_overscan_appropriate_flag;
+
+ /**
+ * if 1 specifies that video_format, video_full_range_flag and
+ * colour_description_present_flag are present
+ */
+ UWORD8 u1_video_signal_type_present_flag;
+
+ /**
+     * indicates the representation of the pictures as specified in Table E-2
+ */
+ UWORD8 u1_video_format;
+
+ /**
+ * indicates the black level and range of the luma and chroma signals
+ */
+ UWORD8 u1_video_full_range_flag;
+
+ /**
+     * if 1, specifies that colour_primaries, transfer_characteristics
+ * and matrix_coefficients are present
+ */
+ UWORD8 u1_colour_description_present_flag;
+
+ /**
+ * indicates the chromaticity coordinates of the source primaries
+ */
+ UWORD8 u1_colour_primaries;
+
+ /**
+ * indicates the opto-electronic transfer characteristic of the source picture
+ */
+ UWORD8 u1_transfer_characteristics;
+
+ /**
+ * the matrix coefficients used in deriving luma and chroma signals
+ * from the green, blue, and red primaries
+ */
+ UWORD8 u1_matrix_coefficients;
+
+ /**
+ * if 1, specifies that chroma_sample_loc_type_top_field and
+ * chroma_sample_loc_type_bottom_field are present
+ */
+ UWORD8 u1_chroma_loc_info_present_flag;
+
+ /**
+ * location of chroma samples
+ */
+ UWORD8 u1_chroma_sample_loc_type_top_field;
+
+ UWORD8 u1_chroma_sample_loc_type_bottom_field;
+
+ /**
+ * if 1, indicates that the value of all decoded chroma samples is
+ * equal to 1 << ( BitDepthC - 1 )
+ */
+ UWORD8 u1_neutral_chroma_indication_flag;
+
+ /**
+     * 1 indicates that the coded video sequence conveys pictures that represent fields
+     * 0 indicates that it conveys pictures that represent frames
+ */
+ UWORD8 u1_field_seq_flag;
+
+ /**
+ * specifies that picture timing SEI messages are present for every picture
+ */
+ UWORD8 u1_frame_field_info_present_flag;
+
+ /**
+ * 1 indicates that the default display window parameters follow next in the VUI
+ */
+ UWORD8 u1_default_display_window_flag;
+
+ /**
+ * specify the samples of the pictures in the coded video sequence
+ * that are within the default display window,
+ * in terms of a rectangular region specified in picture coordinates for display
+ */
+ UWORD32 u4_def_disp_win_left_offset;
+
+ UWORD32 u4_def_disp_win_right_offset;
+
+ UWORD32 u4_def_disp_win_top_offset;
+
+ UWORD32 u4_def_disp_win_bottom_offset;
+
+ /**
+     * if 1, specifies that the hrd_parameters syntax structure is present in the vui_parameters syntax structure
+ */
+ UWORD8 u1_vui_hrd_parameters_present_flag;
+
+ /**
+ * Indicates the presence of the
+ * num_units_in_ticks, time_scale flag
+ */
+ UWORD8 u1_vui_timing_info_present_flag;
+
+ /**
+ * Number of units that
+ * correspond to one increment of the
+ * clock. Indicates the resolution
+ */
+ UWORD32 u4_vui_num_units_in_tick;
+
+ /**
+ * The number of time units that pass in one second
+ */
+ UWORD32 u4_vui_time_scale;
+ /**
+ * if 1, indicates that the POC for each picture in the coded video sequence (cvs) (not the first picture), in decoding order,
+ * is proportional to the output time of the picture relative to that of the first picture in the cvs
+ */
+ UWORD8 u1_poc_proportional_to_timing_flag;
+
+ /**
+ * num_ticks_poc_diff_one_minus1 plus 1 specifies the number of clock ticks
+ * corresponding to a difference of poc values equal to 1
+ */
+ UWORD8 u1_num_ticks_poc_diff_one_minus1;
+
+ /**
+ * 1, specifies that the following cvs bitstream restriction parameters are present
+ */
+ UWORD8 u1_bitstream_restriction_flag;
+
+ /**
+ * if 1, indicates that each pps that is active in the cvs has
+ * the same value of the tile syntax elements
+ */
+ UWORD8 u1_tiles_fixed_structure_flag;
+
+ /**
+ * if 0, indicates that no pel outside the pic boundaries and
+ * no sub-pels derived using pels outside the pic boundaries is used for inter prediction
+ */
+ UWORD8 u1_motion_vectors_over_pic_boundaries_flag;
+
+ /**
+ * if 1, indicates
+ * all P/B slices belonging to the same pic have an identical refpic list0,
+ * all B slices that belong to the same picture have an identical refpic list1.
+ */
+ UWORD8 u1_restricted_ref_pic_lists_flag;
+
+ /**
+     * when not 0, establishes a bound on the maximum possible size of distinct
+     * coded spatial segmentation regions in the pictures of the CVS
+ */
+ UWORD8 u4_min_spatial_segmentation_idc;
+ /**
+ * Indicates a number of bytes not exceeded by the sum of the sizes of the VCL NAL units
+ * associated with any coded picture
+ */
+ UWORD8 u1_max_bytes_per_pic_denom;
+
+ /**
+ * Indicates an upper bound for the number of bits of coding_unit() data
+ */
+ UWORD8 u1_max_bits_per_mincu_denom;
+
+ /**
+ * Indicate the maximum absolute value of a decoded horizontal MV component
+ * in quarter-pel luma units
+ */
+ UWORD8 u1_log2_max_mv_length_horizontal;
+
+ /**
+ * Indicate the maximum absolute value of a decoded vertical MV component
+ * in quarter-pel luma units
+ */
+ UWORD8 u1_log2_max_mv_length_vertical;
+
+ /**
+ * HRD parameters
+ */
+
+
+ /**
+ * Indicates the presence of the
+ * num_units_in_ticks, time_scale flag
+ */
+ UWORD8 u1_timing_info_present_flag;
+
+ /**
+ * Number of units that
+ * correspond to one increment of the
+ * clock. Indicates the resolution
+ */
+ UWORD32 u4_num_units_in_tick;
+
+ /**
+ * The number of time units that pass in one second
+ */
+ UWORD32 u4_time_scale;
+
+ /**
+ * Nal- hrd parameters flag
+ */
+ UWORD8 u1_nal_hrd_parameters_present_flag;
+
+ /**
+ * VCL- hrd parameters flag
+ */
+ UWORD8 u1_vcl_hrd_parameters_present_flag;
+
+ /**
+ * Indicates the presence of NAL-HRD params or VCL_HRD params
+ * in the bitstream
+ */
+ UWORD8 u1_cpbdpb_delays_present_flag;
+
+ /**
+ * specifies that sub-picture level CPB removal delay parameters are
+ * present in picture timing SEI messages
+ */
+ UWORD8 u1_sub_pic_cpb_params_present_flag;
+
+ /**
+ * specify the clock sub-tick
+ * (the minimum interval of time that can be represented in the coded data when sub_pic_cpb_params_present_flag is equal to 1)
+ */
+ UWORD8 u1_tick_divisor_minus2;
+
+ /**
+ * specifies the length, in bits for the du cpb delay syntax in pt_sei
+ */
+ UWORD8 u1_du_cpb_removal_delay_increment_length_minus1;
+
+ /**
+ * Indicates presence of sub_pic_cpb_params in pic timing sei
+ */
+ UWORD8 u1_sub_pic_cpb_params_in_pic_timing_sei_flag;
+
+ /**
+ * Indicates dpb output delay for the du
+ */
+ UWORD8 u1_dpb_output_delay_du_length_minus1;
+
+ /**
+ * (together with bit_rate_value_minus1) specifies the
+ * maximum input bit rate of the i-th CPB
+ */
+ UWORD8 u4_bit_rate_scale;
+
+ /**
+     * (together with cpb_size_value_minus1) specifies
+ * CPB size of the i-th CPB when the CPB operates
+ * at the access unit level
+ */
+ UWORD8 u4_cpb_size_scale;
+
+ /**
+     * (together with cpb_size_du_value_minus1) specifies
+ * CPB size of the i-th CPB when the CPB operates
+ * at the sub-picture level
+ */
+ UWORD8 u4_cpb_size_du_scale;
+
+
+ /**
+     * specifies the length, in bits, of the initial cpb delay (nal/vcl) syntax in bp sei
+ */
+ UWORD8 u1_initial_cpb_removal_delay_length_minus1;
+
+ /**
+ * specifies the length, in bits for the au cpb delay syntax in pt_sei
+ */
+ UWORD8 u1_au_cpb_removal_delay_length_minus1;
+
+ /**
+ * specifies the length, in bits, of the pic_dpb_output_delay syntax element in the pt SEI message
+ */
+ UWORD8 u1_dpb_output_delay_length_minus1;
+
+ /**
+     * if 1, for the highest temporal sub-layers, the temporal distance between the HRD output times
+     * of consecutive pictures in output order is constrained; refer to Table E-6
+ */
+ UWORD8 au1_fixed_pic_rate_general_flag[6];
+
+ UWORD8 au1_fixed_pic_rate_within_cvs_flag[6];
+
+ /**
+     * if 1, for the highest temporal sub-layers, the temporal distance (in clock ticks) between the
+     * elemental units that specify HRD output times of consecutive pictures in output order is constrained;
+     * refer to Table E-6
+ */
+ UWORD8 au1_elemental_duration_in_tc_minus1[6];
+
+ /**
+ * specifies the HRD operational mode
+ */
+ UWORD8 au1_low_delay_hrd_flag[6];
+
+ /**
+     * Plus 1 specifies the number of alternative CPB specifications in the
+     * bitstream of the CVS when HighestTid is equal to i
+ */
+ UWORD8 au1_cpb_cnt_minus1[6];
+}ihevcd_cxa_ctl_get_vui_params_op_t;
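+
+/* Informative example: when u1_timing_info_present_flag is 1 and each picture
+ * advances the clock by one tick, the frame rate follows directly from the
+ * timing fields above:
+ *
+ *     frame_rate = (double)u4_time_scale / u4_num_units_in_tick;
+ *
+ * e.g. u4_time_scale = 30000 with u4_num_units_in_tick = 1001 gives ~29.97 fps.
+ */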
+
+#ifdef GPU_BUILD
+typedef struct {
+ UWORD32 u4_size;
+ IVD_API_COMMAND_TYPE_T e_cmd;
+ IVD_CONTROL_API_COMMAND_TYPE_T e_sub_cmd;
+    UWORD32 u4_gpu_enable_diable; // 1 - Enable, 0 - Disable
+}ihevcd_cxa_ctl_gpu_enable_diable_ip_t;
+
+typedef struct {
+ UWORD32 u4_size;
+ UWORD32 u4_error_code;
+}ihevcd_cxa_ctl_gpu_enable_diable_op_t;
+#endif
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif /* __IHEVCD_CXA_H__ */
diff --git a/decoder/ihevcd_deblk.c b/decoder/ihevcd_deblk.c
new file mode 100644
index 0000000..652bf8c
--- /dev/null
+++ b/decoder/ihevcd_deblk.c
@@ -0,0 +1,849 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_deblk.c
+*
+* @brief
+* Contains definition for the CTB-level deblocking function
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevcd_deblk_ctb()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_debug.h"
+
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevcd_profile.h"
+/**
+*******************************************************************************
+*
+* @brief
+* Deblock CTB level function.
+*
+* @par Description:
+* For a given CTB, deblocking on both vertical and
+* horizontal edges is done. Both the luma and chroma
+* blocks are processed
+*
+* @param[in] ps_deblk
+* Pointer to the deblock context
+*
+* @returns
+*  None
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+void ihevcd_deblk_ctb(deblk_ctxt_t *ps_deblk,
+ WORD32 i4_is_last_ctb_x,
+ WORD32 i4_is_last_ctb_y)
+{
+ WORD32 ctb_size;
+ WORD32 log2_ctb_size;
+ UWORD32 u4_bs;
+    WORD32 bs_tz; /*Trailing zeros in boundary strength*/
+ WORD32 qp_p, qp_q;
+
+ WORD32 filter_p, filter_q;
+
+ UWORD8 *pu1_src;
+ WORD32 qp_strd;
+ UWORD32 *pu4_vert_bs, *pu4_horz_bs;
+ UWORD32 *pu4_ctb_vert_bs, *pu4_ctb_horz_bs;
+ WORD32 vert_bs_strd, horz_bs_strd;
+ WORD32 src_strd;
+ UWORD8 *pu1_qp;
+ UWORD16 *pu2_ctb_no_loop_filter_flag;
+ UWORD16 au2_ctb_no_loop_filter_flag[9];
+
+ WORD32 col, row;
+
+ /* Flag to indicate if QP is constant in CTB
+ * 0 - top_left, 1 - top, 2 - left, 3 - current */
+ UWORD32 u4_qp_const_in_ctb[4] = { 0, 0, 0, 0 };
+ WORD32 ctb_indx;
+ WORD32 chroma_yuv420sp_vu = ps_deblk->is_chroma_yuv420sp_vu;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ codec_t *ps_codec;
+ slice_header_t *ps_slice_hdr;
+
+ PROFILE_DISABLE_DEBLK();
+
+ ps_sps = ps_deblk->ps_sps;
+ ps_pps = ps_deblk->ps_pps;
+ ps_codec = ps_deblk->ps_codec;
+ ps_slice_hdr = ps_deblk->ps_slice_hdr;
+
+ log2_ctb_size = ps_sps->i1_log2_ctb_size;
+ ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+ /* strides are in units of number of bytes */
+ /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */
+ vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7);
+ horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7);
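+    /* Worked example (informative): for a 64x64 CTB (log2_ctb_size = 6) there
+     * are 64/8 = 8 vertical edge columns per CTB, each holding 64/4 = 16
+     * two-bit BS values, i.e. 8 * 16 * 2 / 8 = 32 bytes = 1 << (2 * 6 - 7)
+     * per CTB; a 32x32 CTB similarly needs 8 bytes per CTB. */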
+ pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_deblk->s_bs_ctxt.pu4_pic_vert_bs +
+ (ps_deblk->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+ ps_deblk->i4_ctb_y * vert_bs_strd);
+ pu4_ctb_vert_bs = pu4_vert_bs;
+
+ pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_deblk->s_bs_ctxt.pu4_pic_horz_bs +
+ (ps_deblk->i4_ctb_x << (2 * log2_ctb_size - 7)) +
+ ps_deblk->i4_ctb_y * horz_bs_strd);
+ pu4_ctb_horz_bs = pu4_horz_bs;
+
+ qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3);
+ pu1_qp = ps_deblk->s_bs_ctxt.pu1_pic_qp + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * qp_strd) << (log2_ctb_size - 3));
+
+ pu2_ctb_no_loop_filter_flag = ps_deblk->au2_ctb_no_loop_filter_flag;
+
+ ctb_indx = ps_deblk->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_deblk->i4_ctb_y;
+ if(i4_is_last_ctb_y)
+ {
+ pu4_vert_bs = (UWORD32 *)((UWORD8 *)pu4_vert_bs + vert_bs_strd);
+ pu4_ctb_vert_bs = pu4_vert_bs;
+ /* ctb_size/8 is the number of edges per CTB
+ * ctb_size/4 is the number of BS values needed per edge
+ * divided by 8 for the number of bytes
+ * 2 is the number of bits needed for each BS value */
+ memset(pu4_vert_bs, 0, 1 << (2 * log2_ctb_size - 7));
+
+ pu1_qp += (qp_strd << (log2_ctb_size - 3));
+ pu2_ctb_no_loop_filter_flag += (ctb_size >> 3);
+ ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+ }
+
+ if(i4_is_last_ctb_x)
+ {
+ pu4_horz_bs = (UWORD32 *)((UWORD8 *)pu4_horz_bs + (1 << (2 * log2_ctb_size - 7)));
+ pu4_ctb_horz_bs = pu4_horz_bs;
+ memset(pu4_horz_bs, 0, 1 << (2 * log2_ctb_size - 7));
+
+ pu1_qp += (ctb_size >> 3);
+
+ for(row = 0; row < (ctb_size >> 3) + 1; row++)
+ au2_ctb_no_loop_filter_flag[row] = ps_deblk->au2_ctb_no_loop_filter_flag[row] >> (ctb_size >> 3);
+ pu2_ctb_no_loop_filter_flag = au2_ctb_no_loop_filter_flag;
+ ctb_indx += 1;
+ }
+
+ u4_qp_const_in_ctb[3] = ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx) >> 3] & (1 << (ctb_indx & 7));
+
+ if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x)
+ {
+ u4_qp_const_in_ctb[2] = ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - 1) >> 3] & (1 << ((ctb_indx - 1) & 7));
+ }
+
+ if((ps_deblk->i4_ctb_x || i4_is_last_ctb_x) && (ps_deblk->i4_ctb_y || i4_is_last_ctb_y))
+ {
+ u4_qp_const_in_ctb[0] =
+ ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - ps_sps->i2_pic_wd_in_ctb - 1) >> 3] &
+ (1 << ((ctb_indx - ps_sps->i2_pic_wd_in_ctb - 1) & 7));
+ }
+
+
+
+ if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y)
+ {
+ u4_qp_const_in_ctb[1] =
+ ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - ps_sps->i2_pic_wd_in_ctb) >> 3] &
+ (1 << ((ctb_indx - ps_sps->i2_pic_wd_in_ctb) & 7));
+ }
+
+ src_strd = ps_codec->i4_strd;
+
+ /* Luma Vertical Edge */
+
+ if(0 == i4_is_last_ctb_x)
+ {
+ /* Top CTB's slice header */
+ slice_header_t *ps_slice_hdr_top;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_y)
+ cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+ ps_slice_hdr_top = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+ }
+#else
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_y)
+ cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+ ps_slice_hdr_top = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+ }
+#endif
+
+ pu1_src = ps_deblk->pu1_cur_pic_luma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd) << (log2_ctb_size));
+ pu1_src += i4_is_last_ctb_y ? ps_deblk->ps_codec->i4_strd << log2_ctb_size : 0;
+
+ /** Deblocking is done on a shifted CTB -
+ * Vertical edge processing is done by shifting the CTB up by four pixels */
+ pu1_src -= 4 * src_strd;
+
+ for(col = 0; col < ctb_size / 8; col++)
+ {
+ WORD32 shift = 0;
+
+ /* downshift vert_bs by ctb_size/2 for each column
+ * shift = (col & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1);
+ * which will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift = (col & 1) << (log2_ctb_size - 1);
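+            /* Informative: each edge column holds (ctb_size / 4) two-bit BS
+             * values. For ctb_size 64 a column fills a full 32-bit word, so
+             * shift stays 0 and pu4_vert_bs advances every column; for
+             * ctb_size 32 two columns share one word (shift 0/16) and for
+             * ctb_size 16 both columns fit in 16 bits (shift 0/8). */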
+
+ /* BS for the column - Last row is excluded and the top row is included*/
+ u4_bs = (pu4_vert_bs[0] >> shift) << 2;
+
+ if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y)
+ {
+ /* Picking the last BS of the previous CTB corresponding to the same column */
+ UWORD32 *pu4_vert_bs_top = (UWORD32 *)((UWORD8 *)pu4_vert_bs - vert_bs_strd);
+ UWORD32 u4_top_bs = (*pu4_vert_bs_top) >> (shift + (1 << (log2_ctb_size - 1)) - 2);
+ u4_bs |= u4_top_bs & 3;
+ }
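+            /* Informative: the left shift by 2 above drops the BS of the
+             * CTB's bottom-most 4-pixel segment (processed with the next CTB
+             * in the shifted-CTB scheme) and frees bits 1:0, which are filled
+             * here with the bottom-most BS of the CTB above. */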
+
+ for(row = 0; row < ctb_size / 4;)
+ {
+ WORD8 i1_beta_offset_div2 = ps_slice_hdr->i1_beta_offset_div2;
+ WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+ /* Trailing zeros are computed and the corresponding rows are not processed */
+ bs_tz = CTZ(u4_bs) >> 1;
+ if(0 != bs_tz)
+ {
+ u4_bs = u4_bs >> (bs_tz << 1);
+ if((row + bs_tz) >= (ctb_size / 4))
+ pu1_src += 4 * (ctb_size / 4 - row) * src_strd;
+ else
+ pu1_src += 4 * bs_tz * src_strd;
+
+ row += bs_tz;
+ continue;
+ }
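+                /* Informative example: each two-bit field of u4_bs covers a
+                 * 4-pixel segment of the edge. If u4_bs == 0x100, then
+                 * CTZ(u4_bs) = 8, bs_tz = 4, and four segments (16 rows of
+                 * pixels) are skipped without invoking the filter. */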
+
+ if(0 == row)
+ {
+ i1_beta_offset_div2 = ps_slice_hdr_top->i1_beta_offset_div2;
+ i1_tc_offset_div2 = ps_slice_hdr_top->i1_tc_offset_div2;
+
+ if(0 == col)
+ {
+ qp_p = u4_qp_const_in_ctb[0] ?
+ pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+ pu1_qp[-qp_strd - 1];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[1] ?
+ pu1_qp[-ctb_size / 8 * qp_strd] :
+ pu1_qp[col - 1 - qp_strd];
+ }
+
+ qp_q = u4_qp_const_in_ctb[1] ?
+ pu1_qp[-ctb_size / 8 * qp_strd] :
+ pu1_qp[col - qp_strd];
+ }
+ else
+ {
+ if(0 == col)
+ {
+ qp_p = u4_qp_const_in_ctb[2] ?
+ pu1_qp[-ctb_size / 8] :
+ pu1_qp[((row - 1) >> 1) * qp_strd - 1];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[((row - 1) >> 1) * qp_strd + col - 1];
+ }
+
+ qp_q = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[((row - 1) >> 1) * qp_strd + col];
+ }
+
+ filter_p = (pu2_ctb_no_loop_filter_flag[(row + 1) >> 1] >> col) & 1;
+ filter_q = (pu2_ctb_no_loop_filter_flag[(row + 1) >> 1] >> col) & 2;
+ /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+ filter_p = !filter_p;
+ filter_q = !filter_q;
+
+ if(filter_p || filter_q)
+ {
+#if DEBUG_DEBLK_LEAF_LEVEL
+ {
+ DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd,
+ u4_bs & 3, qp_p, qp_q,
+ ps_slice_hdr->i1_beta_offset_div2,
+ ps_slice_hdr->i1_tc_offset_div2,
+ filter_p, filter_q);
+ }
+#endif
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr(pu1_src, src_strd,
+ u4_bs & 3, qp_p, qp_q,
+ i1_beta_offset_div2,
+ i1_tc_offset_div2,
+ filter_p, filter_q);
+ }
+
+ pu1_src += 4 * src_strd;
+ u4_bs = u4_bs >> 2;
+ row++;
+ }
+
+ if((64 == ctb_size) ||
+ ((32 == ctb_size) && (col & 1)))
+ {
+ pu4_vert_bs++;
+ }
+ pu1_src -= (src_strd << log2_ctb_size);
+ pu1_src += 8;
+ }
+ pu4_vert_bs = pu4_ctb_vert_bs;
+ }
+
+
+ /* Luma Horizontal Edge */
+
+ if(0 == i4_is_last_ctb_y)
+ {
+
+ /* Left CTB's slice header */
+ slice_header_t *ps_slice_hdr_left;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_x)
+ cur_ctb_indx += 1;
+ ps_slice_hdr_left = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+ }
+#else
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_x)
+ cur_ctb_indx += 1;
+ ps_slice_hdr_left = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+ }
+#endif
+ pu1_src = ps_deblk->pu1_cur_pic_luma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd) << log2_ctb_size);
+ pu1_src += i4_is_last_ctb_x ? ctb_size : 0;
+
+ /** Deblocking is done on a shifted CTB -
+ * Horizontal edge processing is done by shifting the CTB left by four pixels */
+ pu1_src -= 4;
+ for(row = 0; row < ctb_size / 8; row++)
+ {
+ WORD32 shift = 0;
+
+ /* downshift vert_bs by ctb_size/2 for each column
+ * shift = (row & (MAX_CTB_SIZE / ctb_size - 1)) * ctb_size / 2;
+ * which will reduce to the following assuming ctb size is one of 16, 32 and 64
+ * and deblocking is done on 8x8 grid
+ */
+ if(6 != log2_ctb_size)
+ shift = (row & 1) << (log2_ctb_size - 1);
+
+ /* BS for the row - Last column is excluded and the left column is included*/
+ u4_bs = (pu4_horz_bs[0] >> shift) << 2;
+
+ if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x)
+ {
+ /** Picking the last BS of the previous CTB corresponding to the same row
+ * UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+ */
+ UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (1 << (2 * log2_ctb_size - 7)));
+ UWORD32 u4_left_bs = (*pu4_horz_bs_left) >> (shift + (1 << (log2_ctb_size - 1)) - 2);
+ u4_bs |= u4_left_bs & 3;
+ }
+
+ for(col = 0; col < ctb_size / 4;)
+ {
+ WORD8 i1_beta_offset_div2 = ps_slice_hdr->i1_beta_offset_div2;
+ WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+ bs_tz = CTZ(u4_bs) >> 1;
+ if(0 != bs_tz)
+ {
+ u4_bs = u4_bs >> (bs_tz << 1);
+
+ if((col + bs_tz) >= (ctb_size / 4))
+ pu1_src += 4 * (ctb_size / 4 - col);
+ else
+ pu1_src += 4 * bs_tz;
+
+ col += bs_tz;
+ continue;
+ }
+
+ if(0 == col)
+ {
+ i1_beta_offset_div2 = ps_slice_hdr_left->i1_beta_offset_div2;
+ i1_tc_offset_div2 = ps_slice_hdr_left->i1_tc_offset_div2;
+
+ if(0 == row)
+ {
+ qp_p = u4_qp_const_in_ctb[0] ?
+ pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+ pu1_qp[-qp_strd - 1];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[2] ?
+ pu1_qp[-ctb_size / 8] :
+ pu1_qp[(row - 1) * qp_strd - 1];
+ }
+
+ qp_q = u4_qp_const_in_ctb[2] ?
+ pu1_qp[-ctb_size / 8] :
+ pu1_qp[row * qp_strd - 1];
+ }
+ else
+ {
+ if(0 == row)
+ {
+ qp_p = u4_qp_const_in_ctb[1] ?
+ pu1_qp[-ctb_size / 8 * qp_strd] :
+ pu1_qp[((col - 1) >> 1) - qp_strd];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[((col - 1) >> 1) + (row - 1) * qp_strd];
+ }
+
+ qp_q = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[((col - 1) >> 1) + row * qp_strd];
+ }
+
+ filter_p = (pu2_ctb_no_loop_filter_flag[row] >> ((col + 1) >> 1)) & 1;
+ filter_q = (pu2_ctb_no_loop_filter_flag[row + 1] >> ((col + 1) >> 1)) & 1;
+ /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+ filter_p = !filter_p;
+ filter_q = !filter_q;
+
+ if(filter_p || filter_q)
+ {
+#if DEBUG_DEBLK_LEAF_LEVEL
+ {
+ DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd,
+ u4_bs & 3, qp_p, qp_q,
+ ps_slice_hdr->i1_beta_offset_div2,
+ ps_slice_hdr->i1_tc_offset_div2,
+ filter_p, filter_q);
+ }
+#endif
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr(pu1_src, src_strd,
+ u4_bs & 3, qp_p, qp_q,
+ i1_beta_offset_div2,
+ i1_tc_offset_div2, filter_p, filter_q);
+ }
+
+ pu1_src += 4;
+ u4_bs = u4_bs >> 2;
+ col++;
+ }
+
+ if((64 == ctb_size) ||
+ ((32 == ctb_size) && (row & 1)))
+ {
+ pu4_horz_bs++;
+ }
+ pu1_src -= ctb_size;
+ pu1_src += (src_strd << 3);
+ }
+ pu4_horz_bs = pu4_ctb_horz_bs;
+ }
+
+
+    /* Chroma Vertical Edge */
+
+ if(0 == i4_is_last_ctb_x)
+ {
+
+ /* Top CTB's slice header */
+ slice_header_t *ps_slice_hdr_top;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_y)
+ cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+ ps_slice_hdr_top = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+ }
+#else
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_y)
+ cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb;
+ ps_slice_hdr_top = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb];
+ }
+#endif
+
+ pu1_src = ps_deblk->pu1_cur_pic_chroma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size);
+ pu1_src += i4_is_last_ctb_y ? (ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size : 0;
+
+ /** Deblocking is done on a shifted CTB -
+ * Vertical edge processing is done by shifting the CTB up by four pixels */
+ pu1_src -= 4 * src_strd;
+
+ for(col = 0; col < ctb_size / 16; col++)
+ {
+
+ /* BS for the column - Last row is excluded and the top row is included*/
+ u4_bs = pu4_vert_bs[0] << 2;
+
+ if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y)
+ {
+ /* Picking the last BS of the previous CTB corresponding to the same column */
+ UWORD32 *pu4_vert_bs_top = (UWORD32 *)((UWORD8 *)pu4_vert_bs - vert_bs_strd);
+ UWORD32 u4_top_bs = (*pu4_vert_bs_top) >> ((1 << (log2_ctb_size - 1)) - 2);
+ u4_bs |= u4_top_bs & 3;
+ }
+
+ /* Every alternate boundary strength value is used for chroma */
+ u4_bs &= 0x22222222;
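+            /* Informative: 0x22222222 keeps only bit 1 of every two-bit BS
+             * field, so chroma is filtered only across BS == 2 (intra) edges;
+             * the 4-bit downshift per iteration below then consumes every
+             * other BS value to match the coarser chroma edge grid. */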
+
+ for(row = 0; row < ctb_size / 8;)
+ {
+ WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+ bs_tz = CTZ(u4_bs) >> 2;
+ if(0 != bs_tz)
+ {
+ if((row + bs_tz) >= (ctb_size / 8))
+ pu1_src += 4 * (ctb_size / 8 - row) * src_strd;
+ else
+ pu1_src += 4 * bs_tz * src_strd;
+ row += bs_tz;
+ u4_bs = u4_bs >> (bs_tz << 2);
+ continue;
+ }
+
+ if(0 == row)
+ {
+ i1_tc_offset_div2 = ps_slice_hdr_top->i1_tc_offset_div2;
+
+ if(0 == col)
+ {
+ qp_p = u4_qp_const_in_ctb[0] ?
+ pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+ pu1_qp[-qp_strd - 1];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[1] ?
+ pu1_qp[-ctb_size / 8 * qp_strd] :
+ pu1_qp[2 * col - 1 - qp_strd];
+ }
+
+ qp_q = u4_qp_const_in_ctb[1] ?
+ pu1_qp[-ctb_size / 8 * qp_strd] :
+ pu1_qp[2 * col - qp_strd];
+ }
+ else
+ {
+ if(0 == col)
+ {
+ qp_p = u4_qp_const_in_ctb[2] ?
+ pu1_qp[-ctb_size / 8] :
+ pu1_qp[(row - 1) * qp_strd - 1];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[(row - 1) * qp_strd + 2 * col - 1];
+ }
+
+ qp_q = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[(row - 1) * qp_strd + 2 * col];
+ }
+
+ filter_p = (pu2_ctb_no_loop_filter_flag[row] >> (col << 1)) & 1;
+ filter_q = (pu2_ctb_no_loop_filter_flag[row] >> (col << 1)) & 2;
+ /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+ filter_p = !filter_p;
+ filter_q = !filter_q;
+
+ if(filter_p || filter_q)
+ {
+ ASSERT(1 == ((u4_bs & 3) >> 1));
+#if DEBUG_DEBLK_LEAF_LEVEL
+ {
+ DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd,
+ u4_bs & 3, qp_p, qp_q,
+ ps_pps->i1_pic_cb_qp_offset,
+ ps_pps->i1_pic_cr_qp_offset,
+ ps_slice_hdr->i1_tc_offset_div2,
+ filter_p, filter_q);
+ }
+#endif
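+                    /* Informative: for VU-interleaved (NV21-style) chroma the
+                     * Cb/Cr roles are swapped, so QP values, QP offsets and
+                     * filter flags are passed to the kernel in reverse order. */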
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr(pu1_src,
+ src_strd,
+ qp_q,
+ qp_p,
+ ps_pps->i1_pic_cr_qp_offset,
+ ps_pps->i1_pic_cb_qp_offset,
+ i1_tc_offset_div2,
+ filter_q,
+ filter_p);
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr(pu1_src,
+ src_strd,
+ qp_p,
+ qp_q,
+ ps_pps->i1_pic_cb_qp_offset,
+ ps_pps->i1_pic_cr_qp_offset,
+ i1_tc_offset_div2,
+ filter_p,
+ filter_q);
+ }
+ }
+
+ pu1_src += 4 * src_strd;
+ u4_bs = u4_bs >> 4;
+ row++;
+ }
+
+ pu4_vert_bs += (64 == ctb_size) ? 2 : 1;
+ pu1_src -= ((src_strd / 2) << log2_ctb_size);
+ pu1_src += 16;
+ }
+ }
+
+ /* Chroma Horizontal Edge */
+
+ if(0 == i4_is_last_ctb_y)
+ {
+
+ /* Left CTB's slice header */
+ slice_header_t *ps_slice_hdr_left;
+#ifdef GPU_BUILD
+//TODO GPU : Later define it for ARM only version as well
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_x)
+ cur_ctb_indx += 1;
+ ps_slice_hdr_left = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+ }
+#else
+ {
+ WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ if(i4_is_last_ctb_x)
+ cur_ctb_indx += 1;
+ ps_slice_hdr_left = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1];
+ }
+#endif
+
+ pu1_src = ps_deblk->pu1_cur_pic_chroma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size);
+ pu1_src += i4_is_last_ctb_x ? ctb_size : 0;
+
+        /** Deblocking is done on a shifted CTB -
+         * Horizontal edge processing is done by shifting the CTB left by four pixels (8 bytes here because UV samples are interleaved) */
+ pu1_src -= 8;
+ for(row = 0; row < ctb_size / 16; row++)
+ {
+ /* BS for the row - Last column is excluded and the left column is included*/
+ u4_bs = pu4_horz_bs[0] << 2;
+
+ if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x)
+ {
+ /** Picking the last BS of the previous CTB corresponding to the same row
+ * UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+ */
+ UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (1 << (2 * log2_ctb_size - 7)));
+ UWORD32 u4_left_bs = (*pu4_horz_bs_left) >> ((1 << (log2_ctb_size - 1)) - 2);
+ u4_bs |= u4_left_bs & 3;
+ }
+
+ /* Every alternate boundary strength value is used for chroma */
+ u4_bs &= 0x22222222;
+
+ for(col = 0; col < ctb_size / 8;)
+ {
+ WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2;
+
+ bs_tz = CTZ(u4_bs) >> 2;
+ if(0 != bs_tz)
+ {
+ u4_bs = u4_bs >> (bs_tz << 2);
+
+ if((col + bs_tz) >= (ctb_size / 8))
+ pu1_src += 8 * (ctb_size / 8 - col);
+ else
+ pu1_src += 8 * bs_tz;
+
+ col += bs_tz;
+ continue;
+ }
+
+ if(0 == col)
+ {
+ i1_tc_offset_div2 = ps_slice_hdr_left->i1_tc_offset_div2;
+
+ if(0 == row)
+ {
+ qp_p = u4_qp_const_in_ctb[0] ?
+ pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] :
+ pu1_qp[-qp_strd - 1];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[2] ?
+ pu1_qp[-ctb_size / 8] :
+ pu1_qp[(2 * row - 1) * qp_strd - 1];
+ }
+
+ qp_q = u4_qp_const_in_ctb[2] ?
+ pu1_qp[-ctb_size / 8] :
+ pu1_qp[(2 * row) * qp_strd - 1];
+ }
+ else
+ {
+ if(0 == row)
+ {
+ qp_p = u4_qp_const_in_ctb[1] ?
+ pu1_qp[-ctb_size / 8 * qp_strd] :
+ pu1_qp[col - 1 - qp_strd];
+ }
+ else
+ {
+ qp_p = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[(col - 1) + (2 * row - 1) * qp_strd];
+ }
+
+ qp_q = u4_qp_const_in_ctb[3] ?
+ pu1_qp[0] :
+ pu1_qp[(col - 1) + 2 * row * qp_strd];
+ }
+
+ filter_p = (pu2_ctb_no_loop_filter_flag[row << 1] >> col) & 1;
+ filter_q = (pu2_ctb_no_loop_filter_flag[(row << 1) + 1] >> col) & 1;
+ /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */
+ filter_p = !filter_p;
+ filter_q = !filter_q;
+
+ if(filter_p || filter_q)
+ {
+ ASSERT(1 == ((u4_bs & 3) >> 1));
+#if DEBUG_DEBLK_LEAF_LEVEL
+ {
+ DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd,
+ u4_bs & 3, qp_p, qp_q,
+ ps_pps->i1_pic_cb_qp_offset,
+ ps_pps->i1_pic_cr_qp_offset,
+ ps_slice_hdr->i1_tc_offset_div2,
+ filter_p, filter_q);
+ }
+#endif
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr(pu1_src,
+ src_strd,
+ qp_q,
+ qp_p,
+ ps_pps->i1_pic_cr_qp_offset,
+ ps_pps->i1_pic_cb_qp_offset,
+ i1_tc_offset_div2,
+ filter_q,
+ filter_p);
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr(pu1_src,
+ src_strd,
+ qp_p,
+ qp_q,
+ ps_pps->i1_pic_cb_qp_offset,
+ ps_pps->i1_pic_cr_qp_offset,
+ i1_tc_offset_div2,
+ filter_p,
+ filter_q);
+ }
+ }
+
+ pu1_src += 8;
+ u4_bs = u4_bs >> 4;
+ col++;
+ }
+
+ pu4_horz_bs += (64 == ctb_size) ? 2 : 1;
+ pu1_src -= ctb_size;
+ pu1_src += 8 * src_strd;
+
+ }
+ }
+}
diff --git a/decoder/ihevcd_deblk.h b/decoder/ihevcd_deblk.h
new file mode 100644
index 0000000..1c9f7c8
--- /dev/null
+++ b/decoder/ihevcd_deblk.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_deblk.h
+*
+* @brief
+*
+* Declarations for the CTB-level deblocking function
+*
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_DEBLK_H_
+#define _IHEVCD_DEBLK_H_
+
+void ihevcd_deblk_ctb(deblk_ctxt_t *ps_deblk,
+ WORD32 i4_is_last_ctb_x,
+ WORD32 i4_is_last_ctb_y);
+
+
+#endif /*_IHEVCD_DEBLK_H_*/
diff --git a/decoder/ihevcd_debug.c b/decoder/ihevcd_debug.c
new file mode 100644
index 0000000..8e6a79f
--- /dev/null
+++ b/decoder/ihevcd_debug.c
@@ -0,0 +1,1090 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_debug.c
+*
+* @brief
+* Functions used for codec debugging
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_error.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_debug.h"
+
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#if DEBUG_CODE
+
+void ihevcd_debug_dump_mv_map(codec_t *ps_codec)
+{
+
+ process_ctxt_t *ps_proc;
+ sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+ WORD32 num_minpu_in_ctb, ctb_size, x, y, cur_pu_idx, cur_ctb_idx, pu_idx_start_ctb;
+ UWORD8 *pu1_pic_pu_map_ctb;
+ pu_t *ps_pu;
+ WORD32 i;
+ FILE *fp_mv_map, *fp_pu_idx_map, *fp_pu, *fp_mv_print, *fp_mv_print_1;
+ char l0_mvx[50], l0_mvy[50], l1_mvx[50], l1_mvy[50];
+ UWORD32 *pu4_pu_done, num_pu_done = 0, is_pu_done;
+
+ pu4_pu_done = malloc(MAX_HT * MAX_WD / 4 / 4 * sizeof(UWORD32));
+
+ ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+ num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+
+ ps_pu = &ps_codec->s_parse.ps_pic_pu[0];
+ fp_mv_map = fopen("d:\\dump\\mv_map.txt", "a");
+ fp_mv_print = fopen("d:\\dump\\mv_print.txt", "a");
+ fp_mv_print_1 = fopen("d:\\dump\\mv_print_1.txt", "a");
+ if((NULL == fp_mv_map) || (NULL == fp_mv_print) || (NULL == fp_mv_print_1))
+    {
+        printf("\n Couldn't open mv dump files");
+        free(pu4_pu_done);
+        return;
+    }
+ else
+ {
+#if 0
+ fp_pu_idx_map = fopen("d:\\dump\\pu_idx_map.txt", "ab");
+ fp_pu = fopen("d:\\dump\\pu.txt", "ab");
+ {
+ WORD32 last_ctb_idx, last_pu_idx;
+ last_ctb_idx = ps_sps->i2_pic_ht_in_ctb * ps_sps->i2_pic_wd_in_ctb * num_minpu_in_ctb;
+ fwrite(ps_codec->s_parse.pu1_pic_pu_map,last_ctb_idx,sizeof(UWORD32),fp_pu_idx_map );
+ fwrite(ps_codec->s_parse.pu4_pic_pu_idx,last_ctb_idx * num_minpu_in_ctb, sizeof(UWORD8),fp_pu_idx_map );
+
+ last_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[last_ctb_idx];
+ fwrite(ps_codec->s_parse.ps_pic_pu,last_pu_idx , sizeof(pu_t),fp_pu );
+ }
+#endif
+ fprintf(fp_mv_map, "\nPOC=%d\n", ps_codec->ps_slice_hdr_base[0].i4_abs_pic_order_cnt);
+
+ {
+ WORD32 last_ctb_idx, last_ctb_pu_idx, last_pu_idx;
+ last_ctb_idx = ps_sps->i2_pic_ht_in_ctb * ps_sps->i2_pic_wd_in_ctb;
+ last_ctb_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[last_ctb_idx];
+
+ pu1_pic_pu_map_ctb = ps_codec->s_parse.pu1_pic_pu_map
+ + last_ctb_idx * num_minpu_in_ctb;
+
+ last_pu_idx = last_ctb_pu_idx + pu1_pic_pu_map_ctb[(((ps_sps->i2_pic_wd_in_ctb * ctb_size - 1) & (ctb_size - 1)) >> 2) + ((((ps_sps->i2_pic_ht_in_ctb * ctb_size - 1) & (ctb_size - 1))) >> 2) * (ctb_size >> 2)];
+
+ for(i = 0; i < last_pu_idx; i++)
+ {
+ ps_pu = &ps_codec->s_parse.ps_pic_pu[i];
+
+ fprintf(fp_mv_print_1, "\n-----------------------");
+
+ fprintf(fp_mv_print_1, "\n pu_x = %d, pu_y = %d",
+ (ps_pu->b4_pos_x << 2), (ps_pu->b4_pos_y << 2));
+ fprintf(fp_mv_print_1, "\n pu_wd = %d, pu_ht = %d", ((ps_pu->b4_wd + 1) << 2), ((ps_pu->b4_ht + 1) << 2));
+ if(ps_pu->b2_pred_mode == PRED_L0)
+ fprintf(fp_mv_print_1, "\n Pred = 0,Ref_idx = %d, MV l0 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.s_l0_mv.i2_mvx,
+ ps_pu->mv.s_l0_mv.i2_mvy);
+ else if(ps_pu->b2_pred_mode == PRED_L1)
+ fprintf(fp_mv_print_1, "\n Pred = 1,Ref_idx = %d, MV l1 = %4d %4d", ps_pu->mv.i1_l1_ref_idx, ps_pu->mv.s_l1_mv.i2_mvx,
+ ps_pu->mv.s_l1_mv.i2_mvy);
+ else
+ fprintf(fp_mv_print_1, "\n Pred = 2,Ref_idx = %d,Ref_idx = %d, MV l0 = %4d %4d, MV l1 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.i1_l1_ref_idx,
+ ps_pu->mv.s_l0_mv.i2_mvx, ps_pu->mv.s_l0_mv.i2_mvy,
+ ps_pu->mv.s_l1_mv.i2_mvx, ps_pu->mv.s_l1_mv.i2_mvy);
+ }
+ }
+ for(y = 0; y < (ps_sps->i2_pic_height_in_luma_samples / MIN_PU_SIZE); y++)
+ {
+ for(x = 0; x < (ps_sps->i2_pic_width_in_luma_samples / MIN_PU_SIZE); x++)
+ {
+ cur_ctb_idx = (x * MIN_PU_SIZE / ctb_size) + (y * MIN_PU_SIZE / ctb_size) * ps_sps->i2_pic_wd_in_ctb;
+ pu_idx_start_ctb = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+
+ pu1_pic_pu_map_ctb = ps_codec->s_parse.pu1_pic_pu_map
+ + cur_ctb_idx * num_minpu_in_ctb;
+
+ cur_pu_idx = pu_idx_start_ctb + pu1_pic_pu_map_ctb[(((x * 4) & (ctb_size - 1)) >> 2) + ((((y * 4) & (ctb_size - 1))) >> 2) * (ctb_size >> 2)];
+
+ ps_pu = &ps_codec->s_parse.ps_pic_pu[cur_pu_idx];
+
+ is_pu_done = 0;
+ for(i = 0; i < num_pu_done; i++)
+ {
+ if(pu4_pu_done[num_pu_done - i - 1] == cur_pu_idx)
+ {
+ is_pu_done = 1;
+ break;
+ }
+ }
+
+ if(is_pu_done)
+ {
+ fprintf(fp_mv_map, ",");
+ }
+ else
+ {
+ sprintf(l0_mvx, "%d", ps_pu->mv.s_l0_mv.i2_mvx);
+ sprintf(l0_mvy, "%d", ps_pu->mv.s_l0_mv.i2_mvy);
+ sprintf(l1_mvx, "%d", ps_pu->mv.s_l1_mv.i2_mvx);
+ sprintf(l1_mvy, "%d", ps_pu->mv.s_l1_mv.i2_mvy);
+ fprintf(fp_mv_map, "(%s:%s)(%s:%s),", l0_mvx, l0_mvy, l1_mvx, l1_mvy);
+
+ fprintf(fp_mv_print, "\n-----------------------");
+
+/*
+ printf("\n CTB X = %d, Y = %d",
+ (x*MIN_PU_SIZE / ctb_size), (y*MIN_PU_SIZE / ctb_size));
+*/
+
+ fprintf(fp_mv_print, "\n pu_x = %d, pu_y = %d",
+ (ps_pu->b4_pos_x << 2), (ps_pu->b4_pos_y << 2));
+ fprintf(fp_mv_print, "\n pu_wd = %d, pu_ht = %d", ((ps_pu->b4_wd + 1) << 2), ((ps_pu->b4_ht + 1) << 2));
+ if(ps_pu->b2_pred_mode == PRED_L0)
+ fprintf(fp_mv_print, "\n Pred = 0,Ref_idx = %d, MV l0 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.s_l0_mv.i2_mvx,
+ ps_pu->mv.s_l0_mv.i2_mvy);
+ else if(ps_pu->b2_pred_mode == PRED_L1)
+ fprintf(fp_mv_print, "\n Pred = 1,Ref_idx = %d, MV l1 = %4d %4d", ps_pu->mv.i1_l1_ref_idx, ps_pu->mv.s_l1_mv.i2_mvx,
+ ps_pu->mv.s_l1_mv.i2_mvy);
+ else
+ fprintf(fp_mv_print, "\n Pred = 2,Ref_idx = %d,Ref_idx = %d, MV l0 = %4d %4d, MV l1 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.i1_l1_ref_idx,
+ ps_pu->mv.s_l0_mv.i2_mvx, ps_pu->mv.s_l0_mv.i2_mvy,
+ ps_pu->mv.s_l1_mv.i2_mvx, ps_pu->mv.s_l1_mv.i2_mvy);
+
+ pu4_pu_done[num_pu_done] = cur_pu_idx;
+ num_pu_done++;
+ }
+ }
+ fprintf(fp_mv_map, "\n");
+ }
+ }
+ fclose(fp_mv_map);
+ fclose(fp_mv_print);
+ fclose(fp_mv_print_1);
+// fclose(fp_pu_idx_map);
+// fclose(fp_pu);
+ free(pu4_pu_done);
+}
+
+void ihevcd_debug_assert(WORD32 x)
+{
+ if(!x)
+ {
+ printf("Assert failed.. Exiting \n");
+ exit(-1);
+ }
+}
+
+void ihevcd_debug_dump_pic_buffers(codec_t *ps_codec)
+{
+ FILE *fp_pic, *fp_pic_b;
+ sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+ static WORD32 file_open = 0;
+ WORD32 vert_bs_size, horz_bs_size;
+ WORD32 qp_size;
+ WORD32 qp_const_flag_size;
+ WORD32 loop_filter_size;
+ WORD32 loop_filter_buffer;
+ WORD32 pic_intra_flag_size;
+
+    /* Max Number of vertical edges */
+    vert_bs_size = ps_codec->i4_max_wd / 8 + MAX_CTB_SIZE / 8;
+
+ /* Max Number of horizontal edges - extra MAX_CTB_SIZE / 8 to handle the last 4 rows separately(shifted CTB processing) */
+ vert_bs_size *= (ps_codec->i4_max_ht + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+ /* Number of bytes */
+ vert_bs_size /= 8;
+
+ /* Two bits per edge */
+ vert_bs_size *= 2;
+
+ /* Max Number of horizontal edges */
+ horz_bs_size = ps_codec->i4_max_ht / 8 + MAX_CTB_SIZE / 8;
+
+ /* Max Number of vertical edges - extra MAX_CTB_SIZE / 8 to handle the last 4 columns separately(shifted CTB processing) */
+ horz_bs_size *= (ps_codec->i4_max_wd + MAX_CTB_SIZE) / MIN_TU_SIZE;
+
+ /* Number of bytes */
+ horz_bs_size /= 8;
+
+ /* Two bits per edge */
+ horz_bs_size *= 2;
+
+ qp_size = (ps_codec->i4_max_ht * ps_codec->i4_max_wd) / (MIN_CU_SIZE * MIN_CU_SIZE);
+
+ /* Max CTBs in a row */
+ qp_const_flag_size = ps_codec->i4_max_wd / MIN_CTB_SIZE;
+
+ /* Max CTBs in a column */
+ qp_const_flag_size *= ps_codec->i4_max_ht / MIN_CTB_SIZE;
+
+ /* Number of bytes */
+ qp_const_flag_size /= 8;
+
+ loop_filter_size = ((ps_codec->i4_max_wd + 64) / MIN_CU_SIZE) * ((ps_codec->i4_max_ht + 64) / MIN_CU_SIZE) / 8;
+
+ loop_filter_buffer = (ps_codec->i4_max_wd + 63) >> 6;
+ loop_filter_buffer += 1;
+
+ loop_filter_size -= loop_filter_buffer;
+
+ pic_intra_flag_size = (ps_codec->i4_max_wd / MIN_CU_SIZE) * (ps_codec->i4_max_ht / MIN_CU_SIZE) / 8;
+
+ if(0 == file_open)
+ {
+ fp_pic = fopen("D:\\dump\\pic_dump.txt", "w");
+ fp_pic_b = fopen("D:\\dump\\pic_dump_b.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp_pic = fopen("D:\\dump\\pic_dump.txt", "a");
+ fp_pic_b = fopen("D:\\dump\\pic_dump_b.txt", "ab");
+ }
+
+ {
+ WORD32 i, j;
+
+ fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp, 1, qp_size, fp_pic_b);
+
+ fprintf(fp_pic, " Frame num :%d \n", ps_codec->u4_pic_cnt);
+
+ for(i = 0; i < ps_codec->i4_max_ht / MIN_CU_SIZE; i++)
+ {
+ for(j = 0; j < ps_codec->i4_max_wd / MIN_CU_SIZE; j++)
+ {
+ UWORD8 u1_qp;
+ WORD32 qp_strd;
+ qp_strd = ps_codec->i4_max_wd / MIN_CU_SIZE;
+ u1_qp = ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp[j + i * qp_strd];
+ fprintf(fp_pic, "%d \t", u1_qp);
+ }
+ fprintf(fp_pic, "\n");
+ }
+ }
+/*
+ fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs, 1, vert_bs_size, fp_pic);
+ fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs, 1, horz_bs_size, fp_pic);
+ fwrite(ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb, 1, qp_const_flag_size, fp_pic);
+ fwrite(ps_codec->s_parse.s_deblk_ctxt.pu1_pic_no_loop_filter_flag, 1, loop_filter_size, fp_pic);
+ fwrite(ps_codec->s_parse.pu1_pic_intra_flag, 1, pic_intra_flag_size, fp_pic);
+*/
+
+ //fwrite(au1_pic_avail_ctb_flags, 1, ps_sps->i2_pic_wd_in_ctb * ps_sps->i2_pic_ht_in_ctb, fp_pic);
+ //fwrite(au4_pic_ctb_slice_xy, 4, ps_sps->i2_pic_wd_in_ctb * ps_sps->i2_pic_ht_in_ctb, fp_pic);
+
+ fclose(fp_pic);
+ fclose(fp_pic_b);
+
+}
+
+
+void ihevcd_debug_dump_pic_pu(codec_t *ps_codec)
+{
+ FILE *fp_pic_pu;
+ FILE *fp_pic_pu_idx;
+ static WORD32 file_open = 0;
+ WORD32 num_pu_in_frame;
+ sps_t *ps_sps;
+
+ if(0 == file_open)
+ {
+ fp_pic_pu = fopen("D:\\dump\\pic_pu.txt", "wb");
+ fp_pic_pu_idx = fopen("D:\\dump\\pic_pu_idx.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+        return; /* Dump only the first picture; remove this return to append subsequent pictures */
+ fp_pic_pu = fopen("D:\\dump\\pic_pu.txt", "ab");
+ fp_pic_pu_idx = fopen("D:\\dump\\pic_pu_idx.txt", "ab");
+ }
+ ps_sps = ps_codec->s_parse.ps_sps;
+ num_pu_in_frame = ps_codec->s_parse.pu4_pic_pu_idx[ps_sps->i4_pic_size_in_ctb];
+
+ fwrite(ps_codec->s_parse.ps_pic_pu, sizeof(pu_t), num_pu_in_frame, fp_pic_pu);
+ fwrite(ps_codec->s_parse.pu4_pic_pu_idx, sizeof(UWORD32), ps_sps->i4_pic_size_in_ctb + 1, fp_pic_pu_idx);
+
+    fclose(fp_pic_pu);
+    fclose(fp_pic_pu_idx);
+
+}
+
+
+void ihevcd_debug_init_tmp_buf(UWORD8 *pu1_buf_luma, UWORD8 *pu1_buf_chroma)
+{
+ memset(pu1_buf_luma, 0, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8));
+ memset(pu1_buf_chroma, 0, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE * sizeof(UWORD8));
+}
+
+void ihevcd_debug_process_tmp_buf(UWORD8 *pu1_buf_luma, UWORD8 *pu1_buf_chroma)
+{
+ WORD32 row, col;
+ UWORD8 *pu1_tmp_buf_luma;
+ UWORD8 *pu1_tmp_buf_chroma;
+
+ FILE *fp_luma, *fp_chroma;
+
+ pu1_tmp_buf_luma = (UWORD8 *)calloc(4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1);
+ pu1_tmp_buf_chroma = (UWORD8 *)calloc(4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1);
+
+ for(row = 0; row < 2 * MAX_CTB_SIZE; row++)
+ {
+ for(col = 0; col < 2 * MAX_CTB_SIZE; col++)
+ {
+ if(0 != pu1_buf_luma[row * 2 * MAX_CTB_SIZE + col])
+ pu1_tmp_buf_luma[row * 2 * MAX_CTB_SIZE + col] = 0xFF;
+ if(0 != pu1_buf_chroma[row * 2 * MAX_CTB_SIZE + col])
+ pu1_tmp_buf_chroma[row * 2 * MAX_CTB_SIZE + col] = 0xFF;
+ }
+ }
+
+ fp_luma = fopen("D:\\dump\\win_sao_tmp_buf_luma.yuv", "wb");
+ fp_chroma = fopen("D:\\dump\\win_sao_tmp_buf_chroma.yuv", "wb");
+
+ fwrite(pu1_tmp_buf_luma, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1, fp_luma);
+ fwrite(pu1_tmp_buf_chroma, 4 * MAX_CTB_SIZE * MAX_CTB_SIZE, 1, fp_chroma);
+
+ fclose(fp_luma);
+ fclose(fp_chroma);
+}
+
+void ihevcd_debug_print_struct_sizes(void)
+{
+    printf("sizeof(tu_t) %d\n", (WORD32)sizeof(tu_t));
+    printf("sizeof(pu_t) %d\n", (WORD32)sizeof(pu_t));
+    printf("sizeof(pu_mv_t) %d\n", (WORD32)sizeof(pu_mv_t));
+    printf("sizeof(vps_t) %d\n", (WORD32)sizeof(vps_t));
+    printf("sizeof(sps_t) %d\n", (WORD32)sizeof(sps_t));
+    printf("sizeof(pps_t) %d\n", (WORD32)sizeof(pps_t));
+    printf("sizeof(slice_header_t) %d\n", (WORD32)sizeof(slice_header_t));
+
+    printf("sizeof(codec_t) %d\n", (WORD32)sizeof(codec_t));
+    printf("sizeof(parse_ctxt_t) %d\n", (WORD32)sizeof(parse_ctxt_t));
+    printf("sizeof(process_ctxt_t) %d\n", (WORD32)sizeof(process_ctxt_t));
+    printf("sizeof(cab_ctxt_t) %d\n", (WORD32)sizeof(cab_ctxt_t));
+ return;
+}
+
+void ihevcd_debug_dump_pic(UWORD8 *pu1_cur_pic_luma,
+ UWORD8 *pu1_cur_pic_chroma,
+ WORD32 pic_wd,
+ WORD32 pic_ht,
+ WORD32 pic_strd)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+ WORD32 row;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_pre_ilf_dec_order.yuv", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_pre_ilf_dec_order.yuv", "ab");
+ }
+
+ for(row = 0; row < pic_ht; row++)
+ {
+ fwrite(pu1_cur_pic_luma, sizeof(UWORD8), pic_wd, fp);
+ pu1_cur_pic_luma += pic_strd;
+ }
+ for(row = 0; row < pic_ht / 2; row++)
+ {
+ fwrite(pu1_cur_pic_chroma, sizeof(UWORD8), pic_wd, fp);
+ pu1_cur_pic_chroma += pic_strd;
+ }
+
+ fclose(fp);
+}
+
+void ihevcd_debug_dump_bs(UWORD32 *pu4_pic_vert_bs,
+ UWORD32 *pu4_pic_horz_bs,
+ WORD32 vert_size_in_bytes,
+ WORD32 horz_size_in_bytes)
+{
+ FILE *fp_vert, *fp_horz;
+ static WORD32 files_open = 0;
+
+ if(files_open == 0)
+ {
+ fp_vert = fopen("D:\\dump\\win_vert_bs_dec_order.txt", "wb");
+ fp_horz = fopen("D:\\dump\\win_horz_bs_dec_order.txt", "wb");
+ files_open = 1;
+ }
+ else
+ {
+ fp_vert = fopen("D:\\dump\\win_vert_bs_dec_order.txt", "ab");
+ fp_horz = fopen("D:\\dump\\win_horz_bs_dec_order.txt", "ab");
+ }
+
+ fwrite(pu4_pic_vert_bs, sizeof(UWORD8), vert_size_in_bytes, fp_vert);
+ fwrite(pu4_pic_horz_bs, sizeof(UWORD8), horz_size_in_bytes, fp_horz);
+
+ fclose(fp_vert);
+ fclose(fp_horz);
+}
+
+void ihevcd_debug_dump_qp(UWORD8 *pu1_qp, WORD32 size_in_bytes)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_qp_dec_order.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_qp_dec_order.txt", "ab");
+ }
+
+ fwrite(pu1_qp, sizeof(UWORD8), size_in_bytes, fp);
+
+ fclose(fp);
+}
+
+void ihevcs_dump_qp_const_in_ctb(UWORD8 *pu1_qp_const_in_ctb, WORD32 size_in_bytes)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_qp_const_ctb_dec_order.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_qp_const_ctb_dec_order.txt", "ab");
+ }
+
+ fwrite(pu1_qp_const_in_ctb, sizeof(UWORD8), size_in_bytes, fp);
+
+ fclose(fp);
+}
+
+
+void ihevcd_debug_dump_no_loop_filter(UWORD8 *pu1_pic_no_loop_filter, WORD32 size_in_bytes)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_no_loop_filter_dec_order.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_no_loop_filter_dec_order.txt", "ab");
+ }
+
+ fwrite(pu1_pic_no_loop_filter, sizeof(UWORD8), size_in_bytes, fp);
+
+ fclose(fp);
+}
+
+void ihevcd_debug_dump_offsets(WORD32 beta_offset_div_2, WORD32 tc_offset_div_2, WORD32 qp_offset_u, WORD32 qp_offset_v)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_offsets.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_offsets.txt", "ab");
+ }
+
+ fwrite(&beta_offset_div_2, sizeof(WORD32), 1, fp);
+ fwrite(&tc_offset_div_2, sizeof(WORD32), 1, fp);
+ fwrite(&qp_offset_u, sizeof(WORD32), 1, fp);
+ fwrite(&qp_offset_v, sizeof(WORD32), 1, fp);
+
+ fclose(fp);
+
+}
+
+/* Debugging POC values */
+void ihevcd_debug_print_ref_list_pocs(WORD32 i4_pic_order_cnt_val,
+ slice_header_t *ps_slice_hdr,
+ dpb_mgr_t *ps_dpb_mgr,
+ UWORD32 u4_num_st_curr_before,
+ UWORD32 u4_num_st_curr_after,
+ UWORD32 u4_num_st_foll,
+ UWORD32 u4_num_lt_curr,
+ UWORD32 u4_num_lt_foll,
+ WORD32 *pi4_poc_st_curr_before,
+ WORD32 *pi4_poc_st_curr_after,
+ WORD32 *pi4_poc_st_foll,
+ WORD32 *pi4_poc_lt_curr,
+ WORD32 *pi4_poc_lt_foll)
+{
+ WORD32 i, j;
+ pic_buf_t *ps_pic_buf;
+ printf("\n------------------------\nCurrent POC: %d\n", i4_pic_order_cnt_val);
+ printf("\nPOCs present in Reference List L0:\n");
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ ps_pic_buf = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf));
+ printf("POC: %d\n", ps_pic_buf->i4_abs_poc);
+ printf("Longterm Reference = %d\n", ps_pic_buf->u1_used_as_ref);
+ }
+
+ if(ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+ printf("\nPOCs present in Reference List L1:\n");
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ ps_pic_buf = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf));
+ printf("POC: %d\n", ps_pic_buf->i4_abs_poc);
+ printf("POC LSB: %d\n", ps_pic_buf->i4_poc_lsb);
+ printf("Longterm Reference = %d\n", ps_pic_buf->u1_used_as_ref);
+ }
+ }
+
+ printf("\nPOCs that are to be released from DPB:\n");
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+ {
+ WORD32 poc_found = 0;
+ ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+
+ for(j = 0; j < u4_num_st_curr_before && 0 == poc_found; j++)
+ {
+ if(ps_pic_buf->i4_abs_poc == pi4_poc_st_curr_before[j])
+ {
+ poc_found++;
+ break;
+ }
+ }
+ for(j = 0; j < u4_num_st_curr_after && 0 == poc_found; j++)
+ {
+ if(ps_pic_buf->i4_abs_poc == pi4_poc_st_curr_after[j])
+ {
+ poc_found++;
+ break;
+ }
+ }
+ for(j = 0; j < u4_num_st_foll && 0 == poc_found; j++)
+ {
+ if(ps_pic_buf->i4_abs_poc == pi4_poc_st_foll[j])
+ {
+ poc_found++;
+ break;
+ }
+ }
+ for(j = 0; j < u4_num_lt_curr && 0 == poc_found; j++)
+ {
+ if(ps_pic_buf->i4_abs_poc == pi4_poc_lt_curr[j])
+ {
+ poc_found++;
+ break;
+ }
+ }
+ for(j = 0; j < u4_num_lt_foll && 0 == poc_found; j++)
+ {
+ if(ps_pic_buf->i4_abs_poc == pi4_poc_lt_foll[j])
+ {
+ poc_found++;
+ break;
+ }
+ }
+
+ if(0 == poc_found)
+ printf("POC: %d\n", ps_pic_buf->i4_abs_poc);
+ }
+ }
+}
+
+void ihevcd_debug_validate_padded_region(process_ctxt_t *ps_proc)
+{
+ sps_t *ps_sps;
+ codec_t *ps_codec;
+ UWORD8 *pu1_src;
+ UWORD16 *pu2_src;
+ UWORD8 *pu1_validate;
+ UWORD16 *pu2_validate;
+ WORD32 i, j;
+ WORD32 pic_ht, pic_wd;
+ WORD32 src_strd;
+
+ FILE *fp;
+ static WORD32 file_open = 0;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\debug_padding.yuv", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\debug_padding.yuv", "ab");
+ }
+
+
+ if(NULL == fp)
+ {
+ printf("\nCannot Open file\n\n");
+ return;
+ }
+
+ /* pu2_src and pu2_validate are for chroma */
+
+ ps_sps = ps_proc->ps_sps;
+ ps_codec = ps_proc->ps_codec;
+ pu1_src = ps_proc->pu1_cur_pic_luma;
+ pu2_src = (UWORD16 *)ps_proc->pu1_cur_pic_chroma;
+ pic_ht = ps_sps->i2_pic_height_in_luma_samples;
+ pic_wd = ps_sps->i2_pic_width_in_luma_samples;
+ src_strd = ps_codec->i4_strd;
+
+ pu1_validate = (UWORD8 *)calloc((pic_wd + PAD_LEFT + PAD_RIGHT) * (pic_ht + PAD_TOP + PAD_BOT) * 3 / 2, 1);
+ pu2_validate = (UWORD16 *)(pu1_validate + (pic_wd + PAD_LEFT + PAD_RIGHT) * (pic_ht + PAD_TOP + PAD_BOT));
+
+ for(i = 0; i < pic_ht; i++)
+ {
+ for(j = 0; j < PAD_LEFT; j++)
+ {
+ if(pu1_src[j - PAD_LEFT] != pu1_src[0])
+ {
+ pu1_validate[j + (PAD_TOP + i) * src_strd] = 255;
+ }
+ }
+
+ for(j = 0; j < PAD_RIGHT; j++)
+ {
+ if(pu1_src[pic_wd + j] != pu1_src[pic_wd - 1])
+ {
+ pu1_validate[pic_wd + j + PAD_LEFT + (PAD_TOP + i) * src_strd] = 255;
+ }
+ }
+
+ pu1_src += src_strd;
+ }
+
+ pu1_src = ps_proc->pu1_cur_pic_luma - PAD_LEFT;
+ for(i = 0; i < pic_wd + PAD_LEFT + PAD_RIGHT; i++)
+ {
+ for(j = 0; j < PAD_TOP; j++)
+ {
+ if(pu1_src[(j - PAD_TOP) * src_strd] != pu1_src[0])
+ {
+ pu1_validate[i + j * src_strd] = 255;
+ }
+ }
+
+ for(j = 0; j < PAD_BOT; j++)
+ {
+ if(pu1_src[(pic_ht + j) * src_strd] != pu1_src[(pic_ht - 1) * src_strd])
+ {
+ pu1_validate[i + (j + pic_ht + PAD_TOP) * src_strd] = 255;
+ }
+ }
+
+ pu1_src += 1;
+ }
+
+ for(i = 0; i < pic_ht / 2; i++)
+ {
+ for(j = 0; j < PAD_LEFT / 2; j++)
+ {
+ if(pu2_src[j - PAD_LEFT / 2] != pu2_src[0])
+ {
+ pu2_validate[j + (PAD_TOP / 2 + i) * src_strd / 2] = 0xFFFF;
+ }
+ }
+
+ for(j = 0; j < PAD_RIGHT / 2; j++)
+ {
+ if(pu2_src[pic_wd / 2 + j] != pu2_src[pic_wd / 2 - 1])
+ {
+ pu2_validate[pic_wd / 2 + j + PAD_LEFT / 2 + (PAD_TOP / 2 + i) * src_strd / 2] = 0xFFFF;
+ }
+ }
+
+ pu2_src += src_strd / 2;
+ }
+ fwrite(pu1_validate, 1, (pic_wd + PAD_LEFT + PAD_RIGHT) * (pic_ht + PAD_TOP + PAD_BOT) * 3 / 2, fp);
+
+ free(pu1_validate);
+ fclose(fp);
+}
+
+void ihevcd_debug_print_nal_info(codec_t *ps_codec, WORD32 nal_type)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+ slice_header_t *ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+ WORD32 frame_start_flag = 0;
+ WORD32 frame_poc = 0;
+
+ if(0 == file_open)
+ {
+ fp = fopen("nal_info.txt", "w");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("nal_info.txt", "a");
+ }
+
+ if(NULL == fp)
+ {
+ printf("Cannot open NAL info file.. Exiting\n");
+ exit(-1);
+ }
+
+ /* If slice NAL, update start of frame flag */
+ switch(nal_type)
+ {
+ case NAL_BLA_W_LP :
+ case NAL_BLA_W_DLP :
+ case NAL_BLA_N_LP :
+ case NAL_IDR_W_LP :
+ case NAL_IDR_N_LP :
+ case NAL_CRA :
+ case NAL_TRAIL_N :
+ case NAL_TRAIL_R :
+ case NAL_TSA_N :
+ case NAL_TSA_R :
+ case NAL_STSA_N :
+ case NAL_STSA_R :
+ case NAL_RADL_N :
+ case NAL_RADL_R :
+ case NAL_RASL_N :
+ case NAL_RASL_R :
+ frame_start_flag = ps_slice_hdr->i1_first_slice_in_pic_flag;
+ frame_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+ ps_codec->i4_first_pic_done = 1;
+ break;
+
+ default:
+ frame_start_flag = 0;
+ frame_poc = 0;
+ break;
+ }
+ fprintf(fp, "NALType=%d;NumBytes=%d;POC=%d;FrameStart=%d\n",
+ nal_type,
+ ps_codec->i4_nal_ofst + ps_codec->i4_nal_len,
+ frame_poc,
+ frame_start_flag);
+
+ fclose(fp);
+}
+
+typedef struct
+{
+ UWORD8 au1_src[8 * 4];
+ WORD32 src_strd;
+ WORD32 bs;
+ WORD32 qp_p;
+ WORD32 qp_q;
+ WORD32 beta_offset_div_2;
+ WORD32 tc_offset_div_2;
+ WORD32 filter_p;
+ WORD32 filter_q;
+}deblk_luma_t;
+
+typedef struct
+{
+ UWORD8 au1_src[8 * 4];
+ WORD32 src_strd;
+ WORD32 bs;
+ WORD32 qp_p;
+ WORD32 qp_q;
+ WORD32 qp_offset_u;
+ WORD32 qp_offset_v;
+ WORD32 tc_offset_div_2;
+ WORD32 filter_p;
+ WORD32 filter_q;
+}deblk_chroma_t;
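+
+/* Informative: each dump routine below appends one fixed-size binary record
+ * of the corresponding struct to its file. A reader built with the same
+ * compiler padding can replay the records, e.g. (sketch; process() is a
+ * placeholder for the actual comparison step):
+ *
+ *     deblk_luma_t s_rec;
+ *     FILE *fp = fopen("win_deblk_luma_vert.txt", "rb");
+ *     while(1 == fread(&s_rec, sizeof(s_rec), 1, fp))
+ *         process(&s_rec);
+ *     fclose(fp);
+ */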
+
+
+void ihevcd_debug_deblk_luma_vert(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+ WORD32 row, col;
+ deblk_luma_t s_deblk_luma;
+
+ pu1_src -= 4;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_deblk_luma_vert.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_deblk_luma_vert.txt", "ab");
+ }
+
+ for(row = 0; row < 4; row++)
+ {
+ for(col = 0; col < 8; col++)
+ {
+ s_deblk_luma.au1_src[row * 8 + col] = pu1_src[row * src_strd + col];
+ }
+ }
+ s_deblk_luma.src_strd = src_strd;
+ s_deblk_luma.bs = bs;
+ s_deblk_luma.qp_p = quant_param_p;
+ s_deblk_luma.qp_q = quant_param_q;
+ s_deblk_luma.beta_offset_div_2 = beta_offset_div2;
+ s_deblk_luma.tc_offset_div_2 = tc_offset_div2;
+ s_deblk_luma.filter_p = filter_flag_p;
+ s_deblk_luma.filter_q = filter_flag_q;
+
+ fwrite(&s_deblk_luma, sizeof(deblk_luma_t), 1, fp);
+
+ fclose(fp);
+}
+
+void ihevcd_debug_deblk_luma_horz(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 beta_offset_div2,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+ WORD32 row, col;
+ deblk_luma_t s_deblk_luma;
+
+ pu1_src -= 4 * src_strd;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_deblk_luma_horz.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_deblk_luma_horz.txt", "ab");
+ }
+
+ for(row = 0; row < 8; row++)
+ {
+ for(col = 0; col < 4; col++)
+ {
+ s_deblk_luma.au1_src[row * 4 + col] = pu1_src[row * src_strd + col];
+ }
+ }
+ s_deblk_luma.src_strd = src_strd;
+ s_deblk_luma.bs = bs;
+ s_deblk_luma.qp_p = quant_param_p;
+ s_deblk_luma.qp_q = quant_param_q;
+ s_deblk_luma.beta_offset_div_2 = beta_offset_div2;
+ s_deblk_luma.tc_offset_div_2 = tc_offset_div2;
+ s_deblk_luma.filter_p = filter_flag_p;
+ s_deblk_luma.filter_q = filter_flag_q;
+
+ fwrite(&s_deblk_luma, sizeof(deblk_luma_t), 1, fp);
+
+ fclose(fp);
+}
+
+void ihevcd_debug_deblk_chroma_vert(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+ WORD32 row, col;
+ deblk_chroma_t s_deblk_chroma;
+
+ pu1_src -= 4;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_deblk_chroma_vert.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_deblk_chroma_vert.txt", "ab");
+ }
+
+ for(row = 0; row < 4; row++)
+ {
+ for(col = 0; col < 8; col++)
+ {
+ s_deblk_chroma.au1_src[row * 8 + col] = pu1_src[row * src_strd + col];
+ }
+ }
+ s_deblk_chroma.src_strd = src_strd;
+ s_deblk_chroma.bs = bs;
+ s_deblk_chroma.qp_p = quant_param_p;
+ s_deblk_chroma.qp_q = quant_param_q;
+ s_deblk_chroma.qp_offset_u = qp_offset_u;
+ s_deblk_chroma.qp_offset_v = qp_offset_v;
+ s_deblk_chroma.tc_offset_div_2 = tc_offset_div2;
+ s_deblk_chroma.filter_p = filter_flag_p;
+ s_deblk_chroma.filter_q = filter_flag_q;
+
+ fwrite(&s_deblk_chroma, sizeof(deblk_chroma_t), 1, fp);
+
+ fclose(fp);
+}
+
+
+void ihevcd_debug_deblk_chroma_horz(UWORD8 *pu1_src,
+ WORD32 src_strd,
+ WORD32 bs,
+ WORD32 quant_param_p,
+ WORD32 quant_param_q,
+ WORD32 qp_offset_u,
+ WORD32 qp_offset_v,
+ WORD32 tc_offset_div2,
+ WORD32 filter_flag_p,
+ WORD32 filter_flag_q)
+{
+ FILE *fp;
+ static WORD32 file_open = 0;
+ WORD32 row, col;
+ deblk_chroma_t s_deblk_chroma;
+
+ pu1_src -= 2 * src_strd;
+
+ if(file_open == 0)
+ {
+ fp = fopen("D:\\dump\\win_deblk_chroma_horz.txt", "wb");
+ file_open = 1;
+ }
+ else
+ {
+ fp = fopen("D:\\dump\\win_deblk_chroma_horz.txt", "ab");
+ }
+
+ for(row = 0; row < 4; row++)
+ {
+ for(col = 0; col < 8; col++)
+ {
+ s_deblk_chroma.au1_src[row * 8 + col] = pu1_src[row * src_strd + col];
+ }
+ }
+ s_deblk_chroma.src_strd = src_strd;
+ s_deblk_chroma.bs = bs;
+ s_deblk_chroma.qp_p = quant_param_p;
+ s_deblk_chroma.qp_q = quant_param_q;
+ s_deblk_chroma.qp_offset_u = qp_offset_u;
+ s_deblk_chroma.qp_offset_v = qp_offset_v;
+ s_deblk_chroma.tc_offset_div_2 = tc_offset_div2;
+ s_deblk_chroma.filter_p = filter_flag_p;
+ s_deblk_chroma.filter_q = filter_flag_q;
+
+ fwrite(&s_deblk_chroma, sizeof(deblk_chroma_t), 1, fp);
+
+ fclose(fp);
+}
+
+#if DEBUG_PRINT_IQ_IT_RECON
+void print_coeff(WORD16 *pi2_tu_coeff, WORD32 trans_size)
+{
+ WORD32 row, col;
+ for(row = 0; row < trans_size; row++)
+ {
+ for(col = 0; col < trans_size; col++)
+ {
+ printf("%d\t", pi2_tu_coeff[row * trans_size + col]);
+ }
+ printf("\n");
+ }
+}
+
+void print_dst(UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 trans_size,
+ WORD32 is_luma)
+{
+ WORD32 row, col;
+ WORD32 inc;
+ inc = is_luma == 1 ? 1 : 2;
+
+ for(row = 0; row < trans_size; row++)
+ {
+ for(col = 0; col < trans_size; col++)
+ {
+ printf("%d\t", pu1_dst[row * dst_strd + inc * col]);
+ }
+ printf("\n");
+ }
+}
+#endif
+#endif
diff --git a/decoder/ihevcd_debug.h b/decoder/ihevcd_debug.h
new file mode 100644
index 0000000..af6a0d0
--- /dev/null
+++ b/decoder/ihevcd_debug.h
@@ -0,0 +1,176 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_debug.h
+*
+* @brief
+* Debug defs
+*
+* @author
+* Naveen S R
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_DEBUG_H_
+#define _IHEVCD_DEBUG_H_
+
+#define DEBUG_REF_LIST 0
+#define DEBUG_PADDED_REGION 0
+#define DEBUG_DUMP_PRE_ILF 0
+#define DEBUG_PRINT_IQ_IT_RECON 0
+#define DEBUG_PRINT_MV 0
+#define DEBUG_DEBLK_LEAF_LEVEL 0
+#define DEBUG_NAL_TYPE 0
+#define DEBUG_SAO_TMP_BUF 0
+#define DEBUG_BREAK_AFTER_SLICE_NAL 0
+#define DEBUG_DUMP_FRAME_BUFFERS_INFO 0
+#define DEBUG_DUMP_FRAME_PU_INFO 0
+#define DEBUG_MV_MAP 0
+
+#if (DEBUG_REF_LIST||DEBUG_PADDED_REGION||DEBUG_DUMP_PRE_ILF||DEBUG_PRINT_IQ_IT_RECON||DEBUG_PRINT_MV||DEBUG_DEBLK_LEAF_LEVEL||DEBUG_NAL_TYPE||DEBUG_SAO_TMP_BUF||DEBUG_BREAK_AFTER_SLICE_NAL || DEBUG_DUMP_FRAME_BUFFERS_INFO || DEBUG_DUMP_FRAME_PU_INFO || DEBUG_MV_MAP)
+#define DEBUG_CODE 1
+#else
+#define DEBUG_CODE 0
+#endif
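+
+/* Usage (informative): set any of the flags above to 1 and rebuild to enable
+ * the corresponding dump; DEBUG_CODE then compiles the helpers in
+ * ihevcd_debug.c. For example, DEBUG_NAL_TYPE 1 logs one line per NAL unit
+ * to nal_info.txt, and DEBUG_DEBLK_LEAF_LEVEL 1 records every deblocking
+ * kernel invocation via the DUMP_DEBLK_* macros below. */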
+
+
+#if DEBUG_DUMP_FRAME_PU_INFO
+#define DEBUG_DUMP_PIC_PU(ps_codec) ihevcd_debug_dump_pic_pu(ps_codec);
+#else
+#define DEBUG_DUMP_PIC_PU(ps_codec)
+#endif
+
+
+#if DEBUG_DUMP_FRAME_BUFFERS_INFO
+UWORD8 au1_pic_avail_ctb_flags[MAX_WD * MAX_HT / MIN_CTB_SIZE / MIN_CTB_SIZE];
+UWORD32 au4_pic_ctb_slice_xy[MAX_WD * MAX_HT / MIN_CTB_SIZE / MIN_CTB_SIZE];
+
+#define DEBUG_DUMP_PIC_BUFFERS(ps_codec) ihevcd_debug_dump_pic_buffers(ps_codec);
+#else
+#define DEBUG_DUMP_PIC_BUFFERS(ps_codec)
+#endif
+
+#if DEBUG_BREAK_AFTER_SLICE_NAL
+#define BREAK_AFTER_SLICE_NAL() \
+ if(ps_codec->i4_header_done) \
+ break;
+#else
+#define BREAK_AFTER_SLICE_NAL() ;
+#endif
+
+
+#if DEBUG_SAO_TMP_BUF
+#define DEBUG_INIT_TMP_BUF(pu1_buf_luma, pu1_buf_chroma) ihevcd_debug_init_tmp_buf(pu1_buf_luma, pu1_buf_chroma)
+#define DEBUG_PROCESS_TMP_BUF(pu1_buf_luma, pu1_buf_chroma) ihevcd_debug_process_tmp_buf(pu1_buf_luma, pu1_buf_chroma)
+#else
+#define DEBUG_INIT_TMP_BUF(pu1_buf_luma, pu1_buf_chroma)
+#define DEBUG_PROCESS_TMP_BUF(pu1_buf_luma, pu1_buf_chroma)
+#endif
+
+#if DEBUG_NAL_TYPE
+
+#define DEBUG_PRINT_NAL_INFO(ps_codec, nal_type) ihevcd_debug_print_nal_info(ps_codec, nal_type); \
+ break;
+#define RETURN_IF_NAL_INFO return;
+
+#else
+
+#define DEBUG_PRINT_NAL_INFO(ps_codec, nal_type)
+#define RETURN_IF_NAL_INFO
+
+#endif
+
+#if DEBUG_REF_LIST
+
+#define DEBUG_PRINT_REF_LIST_POCS(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll) \
+ ihevcd_debug_print_ref_list_pocs(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll);
+
+#else
+
+#define DEBUG_PRINT_REF_LIST_POCS(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr, u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll, u4_num_lt_curr, u4_num_lt_foll, ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll, ai4_poc_lt_curr, ai4_poc_lt_foll)
+
+#endif
+
+#if DEBUG_PADDED_REGION
+
+#define DEBUG_VALIDATE_PADDED_REGION(ps_proc) ihevcd_debug_validate_padded_region(ps_proc);
+
+#else
+
+#define DEBUG_VALIDATE_PADDED_REGION(ps_proc)
+
+#endif
+
+#if DEBUG_DUMP_PRE_ILF
+
+#define DUMP_PRE_ILF(pu1_cur_pic_luma, pu1_cur_pic_chroma, pic_wd, pic_ht, pic_strd) ihevcd_debug_dump_pic(pu1_cur_pic_luma, pu1_cur_pic_chroma, pic_wd, pic_ht, pic_strd)
+#define DUMP_BS(pu4_pic_vert_bs, pu4_pic_horz_bs, vert_size_in_bytes, horz_size_in_bytes) ihevcd_debug_dump_bs(pu4_pic_vert_bs, pu4_pic_horz_bs, vert_size_in_bytes, horz_size_in_bytes)
+#define DUMP_QP(pu1_qp, size_in_bytes) ihevcd_debug_dump_qp(pu1_qp, size_in_bytes)
+#define DUMP_QP_CONST_IN_CTB(pu1_qp_const_in_ctb, size_in_bytes) ihevcs_dump_qp_const_in_ctb(pu1_qp_const_in_ctb, size_in_bytes)
+#define DUMP_NO_LOOP_FILTER(pu1_pic_no_loop_filter, size_in_bytes) ihevcd_debug_dump_no_loop_filter(pu1_pic_no_loop_filter, size_in_bytes)
+#define DUMP_OFFSETS(beta_offset_div_2, tc_offset_div_2, qp_offset_u, qp_offset_v) ihevcd_debug_dump_offsets(beta_offset_div_2, tc_offset_div_2, qp_offset_u, qp_offset_v)
+
+#else
+
+#define DUMP_PRE_ILF(pu1_cur_pic_luma, pu1_cur_pic_chroma, pic_wd, pic_ht, pic_strd)
+#define DUMP_BS(pu4_pic_vert_bs, pu4_pic_horz_bs, vert_size_in_bytes, horz_size_in_bytes)
+#define DUMP_QP(pu1_qp, size_in_bytes)
+#define DUMP_QP_CONST_IN_CTB(pu1_qp_const_in_ctb, size_in_bytes)
+#define DUMP_NO_LOOP_FILTER(pu1_pic_no_loop_filter, size_in_bytes)
+#define DUMP_OFFSETS(beta_offset_div_2, tc_offset_div_2, qp_offset_u, qp_offset_v)
+
+#endif
+
+
+#if DEBUG_DEBLK_LEAF_LEVEL
+
+#define DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_luma_vert(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q);
+#define DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_luma_horz(pu1_src, src_strd, u4_bs, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q);
+#define DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_chroma_vert(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q) ihevcd_debug_deblk_chroma_horz(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+
+#else
+
+#define DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd, u4_bs3, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd, u4_bs3, qp_p, qp_q, beta_offset_div2, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+#define DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd, u4_bs, qp_p, qp_q, qp_offset_u, qp_offset_v, tc_offset_div2, filter_p, filter_q)
+
+#endif
+
+#if DEBUG_MV_MAP
+#define DEBUG_DUMP_MV_MAP(ps_codec) ihevcd_debug_dump_mv_map(ps_codec);
+#else
+#define DEBUG_DUMP_MV_MAP(ps_codec)
+#endif
+void print_coeff(WORD16 *pi2_tu_coeff, WORD32 trans_size);
+
+void print_dst(UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 trans_size,
+ WORD32 is_luma);
+
+#endif /* _IHEVCD_DEBUG_H_ */
diff --git a/decoder/ihevcd_decode.c b/decoder/ihevcd_decode.c
new file mode 100644
index 0000000..b2a834a
--- /dev/null
+++ b/decoder/ihevcd_decode.c
@@ -0,0 +1,859 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_decode.c
+ *
+ * @brief
+ * Contains the codec's main decode function
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ * - ihevcd_fill_outargs()
+ * - ihevcd_decode()
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_process_slice.h"
+#include "ihevcd_ittiam_logo.h"
+#include "ihevcd_profile.h"
+
+#define NUM_FRAMES_LIMIT_ENABLED 0
+
+#if NUM_FRAMES_LIMIT_ENABLED
+#define NUM_FRAMES_LIMIT 3600
+#else
+#define NUM_FRAMES_LIMIT 0x7FFFFFFF
+#endif
+
+IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
+ process_ctxt_t *ps_proc,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_u_dst,
+ UWORD8 *pu1_v_dst,
+ WORD32 cur_row,
+ WORD32 num_rows);
+WORD32 ihevcd_init(codec_t *ps_codec);
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief Maps codec error codes to API error codes
+ *
+ * @par Description
+ * Sets the fatal error bit for unrecoverable errors and returns the error
+ * code to be reported at the API level
+ *
+ * @param[in] e_error
+ *  Error code generated inside the codec
+ *
+ * @returns  Mapped error code
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+static UWORD32 ihevcd_map_error(IHEVCD_ERROR_T e_error)
+{
+ UWORD32 error_code = 0;
+ error_code = e_error;
+ switch(error_code)
+ {
+ case IHEVCD_SUCCESS :
+ break;
+ case IHEVCD_INIT_NOT_DONE:
+ case IHEVCD_LEVEL_UNSUPPORTED:
+ case IHEVCD_NUM_REF_UNSUPPORTED:
+ case IHEVCD_NUM_REORDER_UNSUPPORTED:
+ case IHEVCD_NUM_EXTRA_DISP_UNSUPPORTED:
+ case IHEVCD_INSUFFICIENT_MEM_MVBANK:
+ case IHEVCD_INSUFFICIENT_MEM_PICBUF:
+ error_code |= 1 << IVD_FATALERROR;
+ break;
+ case IHEVCD_INVALID_DISP_STRD:
+ case IHEVCD_CXA_VERS_BUF_INSUFFICIENT:
+ case IHEVCD_UNSUPPORTED_VPS_ID:
+ case IHEVCD_UNSUPPORTED_SPS_ID:
+ case IHEVCD_UNSUPPORTED_PPS_ID:
+ case IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC:
+ case IHEVCD_UNSUPPORTED_BIT_DEPTH:
+ case IHEVCD_BUF_MGR_ERROR:
+ case IHEVCD_NO_FREE_MVBANK:
+ case IHEVCD_NO_FREE_PICBUF:
+ case IHEVCD_SLICE_IN_HEADER_MODE:
+ case IHEVCD_END_OF_SEQUENCE:
+ break;
+ default:
+ break;
+ }
+ return error_code;
+}
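At the API boundary, callers typically test the fatal bit of the mapped code rather than compare full values. A minimal sketch of that check (the helper name is hypothetical; only the 1 << IVD_FATALERROR convention used above is assumed):

/* Fatal errors require a decoder reset; other non-zero codes allow
 * decoding to continue. Mirrors the bit convention used in
 * ihevcd_map_error() above. */
static WORD32 ihevcd_is_fatal_error(UWORD32 u4_error_code)
{
    return (u4_error_code & (1 << IVD_FATALERROR)) ? 1 : 0;
}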
+/**
+ *******************************************************************************
+ *
+ * @brief Fills output arguments for decode process
+ *
+ * @par Description
+ * Fills elements in the output structure based on the current state
+ *
+ * @param[in] ps_codec
+ * Codec context
+ *
+ * @param[in] ps_dec_ip
+ * Pointer to input structure
+ *
+ * @param[in] ps_dec_op
+ * Pointer to output structure
+ *
+ * @returns none
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+static void ihevcd_fill_outargs(codec_t *ps_codec,
+ ivd_video_decode_ip_t *ps_dec_ip,
+ ivd_video_decode_op_t *ps_dec_op)
+{
+
+ ps_dec_op->u4_error_code = ihevcd_map_error((IHEVCD_ERROR_T)ps_codec->i4_error_code);
+ ps_dec_op->u4_num_bytes_consumed = ps_dec_ip->u4_num_Bytes
+ - ps_codec->i4_bytes_remaining;
+ if(ps_codec->i4_sps_done)
+ {
+ ps_dec_op->u4_pic_wd = ps_codec->i4_disp_wd;
+ ps_dec_op->u4_pic_ht = ps_codec->i4_disp_ht;
+ }
+ else
+ {
+ ps_dec_op->u4_pic_wd = 0;
+ ps_dec_op->u4_pic_ht = 0;
+ }
+
+ ps_dec_op->e_pic_type = ps_codec->e_dec_pic_type;
+ ps_dec_op->u4_frame_decoded_flag = ps_codec->i4_pic_present;
+ ps_dec_op->u4_new_seq = 0;
+
+ ps_dec_op->u4_output_present = 0;
+ ps_dec_op->u4_progressive_frame_flag = 1;
+ ps_dec_op->u4_is_ref_flag = 1;
+ ps_dec_op->e_output_format = ps_codec->e_chroma_fmt;
+
+ ps_dec_op->e4_fld_type = IV_FLD_TYPE_DEFAULT;
+
+ ps_dec_op->u4_ts = (UWORD32)(-1);
+ ps_dec_op->u4_disp_buf_id = ps_codec->i4_disp_buf_id;
+ if(ps_codec->i4_flush_mode)
+ {
+ ps_dec_op->u4_num_bytes_consumed = 0;
+        /* In the case of flush, since no frame is decoded, set pic type as invalid */
+ ps_dec_op->u4_is_ref_flag = 0;
+ ps_dec_op->e_pic_type = IV_NA_FRAME;
+ ps_dec_op->u4_frame_decoded_flag = 0;
+
+ }
+ /* If there is a display buffer */
+ if(ps_codec->ps_disp_buf)
+ {
+ pic_buf_t *ps_disp_buf = ps_codec->ps_disp_buf;
+
+ ps_dec_op->u4_output_present = 1;
+ PROFILE_DIS_PROCESS_CTB_SET_NOOUTPUT();
+ ps_dec_op->u4_ts = ps_disp_buf->u4_ts;
+ if((ps_codec->i4_flush_mode == 0) && (ps_codec->s_parse.i4_end_of_frame == 0))
+ ps_dec_op->u4_output_present = 0;
+ ps_dec_op->s_disp_frm_buf.u4_y_wd = ps_codec->i4_disp_wd;
+ ps_dec_op->s_disp_frm_buf.u4_y_ht = ps_codec->i4_disp_ht;
+
+ if(ps_codec->i4_share_disp_buf)
+ {
+ ps_dec_op->s_disp_frm_buf.pv_y_buf = ps_disp_buf->pu1_luma;
+ if(ps_codec->e_chroma_fmt != IV_YUV_420P)
+ {
+ ps_dec_op->s_disp_frm_buf.pv_u_buf = ps_disp_buf->pu1_chroma;
+ ps_dec_op->s_disp_frm_buf.pv_v_buf = NULL;
+ }
+ else
+ {
+ ps_dec_op->s_disp_frm_buf.pv_u_buf =
+ ps_dec_ip->s_out_buffer.pu1_bufs[1];
+ ps_dec_op->s_disp_frm_buf.pv_v_buf =
+ ps_dec_ip->s_out_buffer.pu1_bufs[2];
+
+ }
+ ps_dec_op->s_disp_frm_buf.u4_y_strd = ps_codec->i4_strd;
+
+ }
+ else
+ {
+ ps_dec_op->s_disp_frm_buf.pv_y_buf =
+ ps_dec_ip->s_out_buffer.pu1_bufs[0];
+ ps_dec_op->s_disp_frm_buf.pv_u_buf =
+ ps_dec_ip->s_out_buffer.pu1_bufs[1];
+ ps_dec_op->s_disp_frm_buf.pv_v_buf =
+ ps_dec_ip->s_out_buffer.pu1_bufs[2];
+ ps_dec_op->s_disp_frm_buf.u4_y_strd = ps_codec->i4_disp_strd;
+ }
+
+ if((IV_YUV_420SP_VU == ps_codec->e_chroma_fmt)
+ || (IV_YUV_420SP_UV == ps_codec->e_chroma_fmt))
+ {
+ ps_dec_op->s_disp_frm_buf.u4_u_strd =
+ ps_dec_op->s_disp_frm_buf.u4_y_strd;
+ ps_dec_op->s_disp_frm_buf.u4_v_strd = 0;
+ ps_dec_op->s_disp_frm_buf.u4_u_wd =
+ ps_dec_op->s_disp_frm_buf.u4_y_wd;
+ ps_dec_op->s_disp_frm_buf.u4_v_wd = 0;
+ ps_dec_op->s_disp_frm_buf.u4_u_ht =
+ ps_dec_op->s_disp_frm_buf.u4_y_ht / 2;
+ ps_dec_op->s_disp_frm_buf.u4_v_ht = 0;
+ }
+ else if(IV_YUV_420P == ps_codec->e_chroma_fmt)
+ {
+ ps_dec_op->s_disp_frm_buf.u4_u_strd =
+ ps_dec_op->s_disp_frm_buf.u4_y_strd / 2;
+ ps_dec_op->s_disp_frm_buf.u4_v_strd =
+ ps_dec_op->s_disp_frm_buf.u4_y_strd / 2;
+ ps_dec_op->s_disp_frm_buf.u4_u_wd =
+ ps_dec_op->s_disp_frm_buf.u4_y_wd / 2;
+ ps_dec_op->s_disp_frm_buf.u4_v_wd =
+ ps_dec_op->s_disp_frm_buf.u4_y_wd / 2;
+ ps_dec_op->s_disp_frm_buf.u4_u_ht =
+ ps_dec_op->s_disp_frm_buf.u4_y_ht / 2;
+ ps_dec_op->s_disp_frm_buf.u4_v_ht =
+ ps_dec_op->s_disp_frm_buf.u4_y_ht / 2;
+ }
+
+ }
+ else if(ps_codec->i4_flush_mode)
+ {
+ ps_dec_op->u4_error_code = IHEVCD_END_OF_SEQUENCE;
+ /* Come out of flush mode */
+ ps_codec->i4_flush_mode = 0;
+ }
+
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Codec process call
+ *
+ * @par Description:
+ *  Codec process call. Performs basic error checks, handles flush and
+ *  decode-header modes, parses the bitstream for start codes and calls
+ *  the decode NAL function for each NAL unit. Once a complete frame is
+ *  decoded (in frame decode mode), fills the output arguments and returns
+ *
+ * @param[in] ps_codec_obj
+ * Pointer to codec object at API level
+ *
+ * @param[in] pv_api_ip
+ * Pointer to input argument structure
+ *
+ * @param[in] pv_api_op
+ * Pointer to output argument structure
+ *
+ * @returns Status
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+WORD32 ihevcd_decode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op)
+{
+ WORD32 ret = IV_SUCCESS;
+ codec_t *ps_codec = (codec_t *)(ps_codec_obj->pv_codec_handle);
+ ivd_video_decode_ip_t *ps_dec_ip;
+ ivd_video_decode_op_t *ps_dec_op;
+
+ WORD32 proc_idx = 0;
+ WORD32 prev_proc_idx = 0;
+
+ /* Initialize error code */
+ ps_codec->i4_error_code = 0;
+
+ ps_dec_ip = (ivd_video_decode_ip_t *)pv_api_ip;
+ ps_dec_op = (ivd_video_decode_op_t *)pv_api_op;
+
+ memset(ps_dec_op, 0, sizeof(ivd_video_decode_op_t));
+ if(ps_codec->i4_init_done != 1)
+ {
+ ps_dec_op->u4_error_code |= 1 << IVD_FATALERROR;
+ ps_dec_op->u4_error_code |= IHEVCD_INIT_NOT_DONE;
+ return IV_FAIL;
+ }
+
+ if(ps_codec->u4_pic_cnt >= NUM_FRAMES_LIMIT)
+ {
+ ps_dec_op->u4_error_code |= 1 << IVD_FATALERROR;
+ ps_dec_op->u4_error_code |= IHEVCD_NUM_FRAMES_LIMIT_REACHED;
+ return IV_FAIL;
+ }
+
+ /* If reset flag is set, flush the existing buffers */
+ if(ps_codec->i4_reset_flag)
+ {
+ ps_codec->i4_flush_mode = 1;
+ }
+
+    /* Data memory barrier instruction, so that the bitstream write by the application is complete */
+ //arm_dsb();
+ /* In case the decoder is not in flush mode check for input buffer validity */
+ if(0 == ps_codec->i4_flush_mode)
+ {
+ if(ps_dec_ip->pv_stream_buffer == NULL)
+ {
+ ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_dec_op->u4_error_code |= IVD_DEC_FRM_BS_BUF_NULL;
+ return IV_FAIL;
+ }
+ if(ps_dec_ip->u4_num_Bytes <= MIN_START_CODE_LEN)
+ {
+ if((WORD32)ps_dec_ip->u4_num_Bytes > 0)
+ ps_dec_op->u4_num_bytes_consumed = ps_dec_ip->u4_num_Bytes;
+ else
+ ps_dec_op->u4_num_bytes_consumed = 0;
+
+ ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_dec_op->u4_error_code |= IVD_DEC_NUMBYTES_INV;
+ return IV_FAIL;
+
+ }
+ }
+
+#ifdef APPLY_CONCEALMENT
+ {
+ WORD32 num_mbs;
+
+ num_mbs = (ps_codec->i4_wd * ps_codec->i4_ht + 255) >> 8;
+ /* Reset MB Count at the beginning of every process call */
+ ps_codec->mb_count = 0;
+ memset(ps_codec->mb_map, 0, ((num_mbs + 7) >> 3));
+ }
+#endif
+
+ if(0 == ps_codec->i4_share_disp_buf && ps_codec->i4_header_mode == 0)
+ {
+ UWORD32 i;
+ if(ps_dec_ip->s_out_buffer.u4_num_bufs == 0)
+ {
+ ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_dec_op->u4_error_code |= IVD_DISP_FRM_ZERO_OP_BUFS;
+ return IV_FAIL;
+ }
+
+ for(i = 0; i < ps_dec_ip->s_out_buffer.u4_num_bufs; i++)
+ {
+ if(ps_dec_ip->s_out_buffer.pu1_bufs[i] == NULL)
+ {
+ ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_dec_op->u4_error_code |= IVD_DISP_FRM_OP_BUF_NULL;
+ return IV_FAIL;
+ }
+
+ if(ps_dec_ip->s_out_buffer.u4_min_out_buf_size[i] == 0)
+ {
+ ps_dec_op->u4_error_code |= 1 << IVD_UNSUPPORTEDPARAM;
+ ps_dec_op->u4_error_code |= IVD_DISP_FRM_ZERO_OP_BUF_SIZE;
+ return IV_FAIL;
+ }
+ }
+ }
+
+ ps_codec->ps_out_buffer = &ps_dec_ip->s_out_buffer;
+ ps_codec->u4_ts = ps_dec_ip->u4_ts;
+ if(ps_codec->i4_flush_mode)
+ {
+
+ ps_dec_op->u4_pic_wd = ps_codec->i4_disp_wd;
+ ps_dec_op->u4_pic_ht = ps_codec->i4_disp_ht;
+
+ ps_dec_op->u4_new_seq = 0;
+
+ ps_codec->ps_disp_buf = (pic_buf_t *)ihevc_disp_mgr_get(
+ (disp_mgr_t *)ps_codec->pv_disp_buf_mgr, &ps_codec->i4_disp_buf_id);
+        /* Convert/copy the frame to the output buffer, but only if the codec
+         * is in non-shared mode, or in shared mode but needing 420P output */
+ if((ps_codec->ps_disp_buf)
+ && ((0 == ps_codec->i4_share_disp_buf)
+ || (IV_YUV_420P
+ == ps_codec->e_chroma_fmt)))
+ {
+
+ process_ctxt_t *ps_proc = &ps_codec->as_process[prev_proc_idx];
+ if(0 == ps_proc->i4_init_done)
+ {
+ ihevcd_init_proc_ctxt(ps_proc, 0);
+ }
+
+ /* Set remaining number of rows to be processed */
+ ret = ihevcd_fmt_conv(ps_codec, &ps_codec->as_process[prev_proc_idx],
+ ps_dec_ip->s_out_buffer.pu1_bufs[0],
+ ps_dec_ip->s_out_buffer.pu1_bufs[1],
+ ps_dec_ip->s_out_buffer.pu1_bufs[2], 0,
+ ps_codec->i4_disp_ht);
+
+ ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+ ps_codec->i4_disp_buf_id, BUF_MGR_DISP);
+ }
+
+ ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
+
+ if(1 == ps_dec_op->u4_output_present)
+ {
+ WORD32 xpos = ps_codec->i4_disp_wd - 32 - LOGO_WD;
+ WORD32 ypos = ps_codec->i4_disp_ht - 32 - LOGO_HT;
+
+ if(ypos < 0)
+ ypos = 0;
+
+ if(xpos < 0)
+ xpos = 0;
+
+ INSERT_LOGO(ps_dec_ip->s_out_buffer.pu1_bufs[0],
+ ps_dec_ip->s_out_buffer.pu1_bufs[1],
+ ps_dec_ip->s_out_buffer.pu1_bufs[2], ps_codec->i4_disp_strd,
+ xpos,
+ ypos,
+ ps_codec->e_chroma_fmt,
+ ps_codec->i4_disp_wd,
+ ps_codec->i4_disp_ht);
+ }
+
+
+ if(NULL == ps_codec->ps_disp_buf)
+ {
+ /* If in flush mode and there are no more buffers to flush,
+ * check for the reset flag and reset the decoder */
+ if(ps_codec->i4_reset_flag)
+ {
+ ihevcd_init(ps_codec);
+ }
+ return (IV_FAIL);
+ }
+
+ return (IV_SUCCESS);
+
+ }
+ /* In case of shared mode, check if there is a free buffer for reconstruction */
+ if((0 == ps_codec->i4_header_mode) && (1 == ps_codec->i4_share_disp_buf))
+ {
+ WORD32 buf_status;
+ buf_status = 1;
+ if(ps_codec->pv_pic_buf_mgr)
+ buf_status = ihevc_buf_mgr_check_free((buf_mgr_t *)ps_codec->pv_pic_buf_mgr);
+
+ /* If there is no free buffer, then return with an error code */
+ if(0 == buf_status)
+ {
+ ps_dec_op->u4_error_code = IVD_DEC_REF_BUF_NULL;
+ ps_dec_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM);
+ return IV_FAIL;
+ }
+ }
+ ps_codec->i4_bytes_remaining = ps_dec_ip->u4_num_Bytes;
+ ps_codec->pu1_inp_bitsbuf = (UWORD8 *)ps_dec_ip->pv_stream_buffer;
+ ps_codec->s_parse.i4_end_of_frame = 0;
+
+ ps_codec->i4_pic_present = 0;
+ ps_codec->i4_slice_error = 0;
+ ps_codec->ps_disp_buf = NULL;
+
+ if(ps_codec->i4_num_cores > 1)
+ {
+ ithread_set_affinity(0);
+ }
+ while(MIN_START_CODE_LEN < ps_codec->i4_bytes_remaining)
+ {
+ WORD32 nal_len;
+ WORD32 nal_ofst;
+ WORD32 bits_len;
+
+ if(ps_codec->i4_slice_error)
+ {
+ slice_header_t *ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+ WORD32 next_slice_addr = ps_slice_hdr_next->i2_ctb_x +
+ ps_slice_hdr_next->i2_ctb_y * ps_codec->s_parse.ps_sps->i2_pic_wd_in_ctb;
+ if(ps_codec->s_parse.i4_next_ctb_indx == next_slice_addr)
+ ps_codec->i4_slice_error = 0;
+ }
+
+ nal_ofst = ihevcd_nal_search_start_code(ps_codec->pu1_inp_bitsbuf,
+ ps_codec->i4_bytes_remaining);
+
+ ps_codec->i4_nal_ofst = nal_ofst;
+ {
+ WORD32 bytes_remaining = ps_codec->i4_bytes_remaining - nal_ofst;
+
+ bytes_remaining = MIN(bytes_remaining, ps_codec->u4_bitsbuf_size);
+ ihevcd_nal_remv_emuln_bytes(ps_codec->pu1_inp_bitsbuf + nal_ofst,
+ ps_codec->pu1_bitsbuf,
+ bytes_remaining,
+ &nal_len, &bits_len);
+ }
+ /* This may be used to update the offsets for tiles and entropy sync row offsets */
+ ps_codec->i4_num_emln_bytes = nal_len - bits_len;
+ ps_codec->i4_nal_len = nal_len;
+
+ ihevcd_bits_init(&ps_codec->s_parse.s_bitstrm, ps_codec->pu1_bitsbuf,
+ bits_len);
+
+ ret = ihevcd_nal_unit(ps_codec);
+
+ /* If the frame is incomplete and
+         * the bytes remaining are zero or a header is received,
+         * complete the frame, treating it as being in error */
+ if(ps_codec->i4_pic_present &&
+ (ps_codec->s_parse.i4_next_ctb_indx != ps_codec->s_parse.ps_sps->i4_pic_size_in_ctb))
+ {
+ if((ps_codec->i4_bytes_remaining - (nal_len + nal_ofst) <= MIN_START_CODE_LEN) ||
+ (ps_codec->i4_header_in_slice_mode))
+ {
+ slice_header_t *ps_slice_hdr_next;
+
+ ps_codec->s_parse.i4_cur_slice_idx--;
+ if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+ ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+ ps_slice_hdr_next->i2_ctb_x = 0;
+ ps_slice_hdr_next->i2_ctb_y = ps_codec->s_parse.ps_sps->i2_pic_ht_in_ctb;
+ ps_codec->i4_slice_error = 1;
+ continue;
+ }
+ }
+
+ if(IHEVCD_IGNORE_SLICE == ret)
+ {
+ ps_codec->pu1_inp_bitsbuf += (nal_ofst + nal_len);
+ ps_codec->i4_bytes_remaining -= (nal_ofst + nal_len);
+
+ continue;
+ }
+
+ if((IHEVCD_FAIL == ret) &&
+ (ps_codec->i4_error_code == IVD_RES_CHANGED))
+ {
+ break;
+ }
+
+ /* Update bytes remaining and bytes consumed and input bitstream pointer */
+ /* Do not consume the NAL in the following cases */
+ /* Slice header reached during header decode mode */
+ /* TODO: Next picture's slice reached */
+ if(ret != IHEVCD_SLICE_IN_HEADER_MODE)
+ {
+ if((0 == ps_codec->i4_slice_error) ||
+ (ps_codec->i4_bytes_remaining - (nal_len + nal_ofst) <= MIN_START_CODE_LEN))
+ {
+ ps_codec->pu1_inp_bitsbuf += (nal_ofst + nal_len);
+ ps_codec->i4_bytes_remaining -= (nal_ofst + nal_len);
+ }
+ if(ret != IHEVCD_SUCCESS)
+ break;
+
+ if(ps_codec->s_parse.i4_end_of_frame)
+ break;
+ }
+ else
+ {
+ ret = IHEVCD_SUCCESS;
+ break;
+ }
+
+ BREAK_AFTER_SLICE_NAL();
+ }
+
+ if((ps_codec->u4_pic_cnt == 0) && (ret != IHEVCD_SUCCESS))
+ {
+ ps_codec->i4_error_code = ret;
+
+ ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
+ return IV_FAIL;
+ }
+
+ if(1 == ps_codec->i4_pic_present)
+ {
+ WORD32 i;
+ sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+ ps_codec->i4_first_pic_done = 1;
+
+        /* TODO: temporary fix - end_of_frame is checked before adding format conversion to the job queue */
+ if(ps_codec->i4_num_cores > 1 && ps_codec->s_parse.i4_end_of_frame)
+ {
+
+ /* Add job queue for format conversion / frame copy for each ctb row */
+ /* Only if the codec is in non-shared mode or in shared mode but needs 420P output */
+ process_ctxt_t *ps_proc;
+
+ /* i4_num_cores - 1 contexts are currently being used by other threads */
+ ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+
+ if((ps_codec->ps_disp_buf) &&
+ ((0 == ps_codec->i4_share_disp_buf) || (IV_YUV_420P == ps_codec->e_chroma_fmt)))
+ {
+ /* If format conversion jobs were not issued in pic_init() add them here */
+ if((0 == ps_codec->u4_enable_fmt_conv_ahead) ||
+ (ps_codec->i4_disp_buf_id == ps_proc->i4_cur_pic_buf_id))
+ for(i = 0; i < ps_sps->i2_pic_ht_in_ctb; i++)
+ {
+ proc_job_t s_job;
+ IHEVCD_ERROR_T ret;
+ s_job.i4_cmd = CMD_FMTCONV;
+ s_job.i2_ctb_cnt = 0;
+ s_job.i2_ctb_x = 0;
+ s_job.i2_ctb_y = i;
+ s_job.i2_slice_idx = 0;
+ s_job.i4_tu_coeff_data_ofst = 0;
+ ret = ihevcd_jobq_queue((jobq_t *)ps_codec->s_parse.pv_proc_jobq,
+ &s_job, sizeof(proc_job_t), 1);
+ if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ return (WORD32)ret;
+ }
+ }
+ /* Reached end of frame : Signal terminate */
+ /* The terminate flag is checked only after all the jobs are dequeued */
+ ret = ihevcd_jobq_terminate((jobq_t *)ps_codec->s_parse.pv_proc_jobq);
+
+ while(1)
+ {
+ IHEVCD_ERROR_T ret;
+ proc_job_t s_job;
+ process_ctxt_t *ps_proc;
+
+ /* i4_num_cores - 1 contexts are currently being used by other threads */
+ ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+
+ ret = ihevcd_jobq_dequeue((jobq_t *)ps_proc->pv_proc_jobq, &s_job,
+ sizeof(proc_job_t), 1);
+ if((IHEVCD_ERROR_T)IHEVCD_SUCCESS != ret)
+ break;
+
+ ps_proc->i4_ctb_cnt = s_job.i2_ctb_cnt;
+ ps_proc->i4_ctb_x = s_job.i2_ctb_x;
+ ps_proc->i4_ctb_y = s_job.i2_ctb_y;
+ ps_proc->i4_cur_slice_idx = s_job.i2_slice_idx;
+
+ if(CMD_PROCESS == s_job.i4_cmd)
+ {
+ ihevcd_init_proc_ctxt(ps_proc, s_job.i4_tu_coeff_data_ofst);
+#ifdef GPU_BUILD
+ if(s_job.i2_wait)
+ {
+ ihevcd_gpu_mc_wait(ps_proc, s_job.i2_granularity_idx);
+ }
+
+#endif
+
+ ihevcd_process(ps_proc);
+ }
+ else if(CMD_FMTCONV == s_job.i4_cmd)
+ {
+ sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+ WORD32 num_rows = 1 << ps_sps->i1_log2_ctb_size;
+ if(0 == ps_proc->i4_init_done)
+ {
+ ihevcd_init_proc_ctxt(ps_proc, 0);
+ }
+
+ num_rows = MIN(num_rows, (ps_codec->i4_disp_ht - (s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size)));
+ if(num_rows < 0)
+ num_rows = 0;
+
+ ihevcd_fmt_conv(ps_codec, ps_proc,
+ ps_dec_ip->s_out_buffer.pu1_bufs[0],
+ ps_dec_ip->s_out_buffer.pu1_bufs[1],
+ ps_dec_ip->s_out_buffer.pu1_bufs[2],
+ s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size,
+ num_rows);
+ }
+ }
+ }
+    /* While running in single core mode, convert/copy the frame to the output
+     * buffer, but only if the codec is in non-shared mode, or in shared mode
+     * but needing 420P output */
+ else if((ps_codec->ps_disp_buf) && ((0 == ps_codec->i4_share_disp_buf) ||
+ (IV_YUV_420P == ps_codec->e_chroma_fmt)) &&
+ (ps_codec->s_parse.i4_end_of_frame))
+ {
+ process_ctxt_t *ps_proc = &ps_codec->as_process[proc_idx];
+ /* Set remaining number of rows to be processed */
+ ps_codec->s_fmt_conv.i4_num_rows = ps_codec->i4_disp_ht
+ - ps_codec->s_fmt_conv.i4_cur_row;
+ if(0 == ps_proc->i4_init_done)
+ {
+ ihevcd_init_proc_ctxt(ps_proc, 0);
+ }
+
+ if(ps_codec->s_fmt_conv.i4_num_rows < 0)
+ ps_codec->s_fmt_conv.i4_num_rows = 0;
+
+ ret = ihevcd_fmt_conv(ps_codec, ps_proc,
+ ps_dec_ip->s_out_buffer.pu1_bufs[0],
+ ps_dec_ip->s_out_buffer.pu1_bufs[1],
+ ps_dec_ip->s_out_buffer.pu1_bufs[2],
+ ps_codec->s_fmt_conv.i4_cur_row,
+ ps_codec->s_fmt_conv.i4_num_rows);
+ ps_codec->s_fmt_conv.i4_cur_row += ps_codec->s_fmt_conv.i4_num_rows;
+
+ }
+#ifdef GPU_BUILD
+ {
+ /*
+ * Add the buffer to the display buffer. Free mv buffer.
+ */
+ {
+
+ ihevc_disp_mgr_add(ps_codec->pv_disp_buf_mgr,
+ ps_codec->as_process[proc_idx].i4_cur_pic_buf_id,
+ ps_codec->as_process[proc_idx].ps_slice_hdr->i4_abs_pic_order_cnt,
+ ps_codec->as_process[proc_idx].ps_cur_pic);
+ }
+ ihevcd_free_ref_mv_buffers(ps_codec);
+ ihevcd_gpu_mc_pic_deinit(ps_codec);
+
+ }
+#endif
+
+
+ DEBUG_DUMP_MV_MAP(ps_codec);
+
+ /* Mark MV Buf as needed for reference */
+ ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_mv_buf_mgr,
+ ps_codec->as_process[proc_idx].i4_cur_mv_bank_buf_id,
+ BUF_MGR_REF);
+
+ /* Mark pic buf as needed for reference */
+ ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+ ps_codec->as_process[proc_idx].i4_cur_pic_buf_id,
+ BUF_MGR_REF);
+
+ /* Mark pic buf as needed for display */
+ ihevc_buf_mgr_set_status((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+ ps_codec->as_process[proc_idx].i4_cur_pic_buf_id,
+ BUF_MGR_DISP);
+
+ /* Insert the current picture as short term reference */
+ ihevc_dpb_mgr_insert_ref((dpb_mgr_t *)ps_codec->pv_dpb_mgr,
+ ps_codec->as_process[proc_idx].ps_cur_pic,
+ ps_codec->as_process[proc_idx].i4_cur_pic_buf_id);
+
+ /* If a frame was displayed (in non-shared mode), then release it from display manager */
+ if((0 == ps_codec->i4_share_disp_buf) && (ps_codec->ps_disp_buf))
+ ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_pic_buf_mgr,
+ ps_codec->i4_disp_buf_id, BUF_MGR_DISP);
+
+ /* Wait for threads */
+ for(i = 0; i < (ps_codec->i4_num_cores - 1); i++)
+ {
+ if(ps_codec->ai4_process_thread_created[i])
+ {
+ ithread_join(ps_codec->apv_process_thread_handle[i], NULL);
+ ps_codec->ai4_process_thread_created[i] = 0;
+ }
+ }
+
+ DEBUG_VALIDATE_PADDED_REGION(&ps_codec->as_process[proc_idx]);
+ if(ps_codec->u4_pic_cnt > 0)
+ {
+ DEBUG_DUMP_PIC_PU(ps_codec);
+ }
+ DEBUG_DUMP_PIC_BUFFERS(ps_codec);
+
+ /* Increment the number of pictures decoded */
+ ps_codec->u4_pic_cnt++;
+ }
+ ihevcd_fill_outargs(ps_codec, ps_dec_ip, ps_dec_op);
+
+ if(1 == ps_dec_op->u4_output_present)
+ {
+ WORD32 xpos = ps_codec->i4_disp_wd - 32 - LOGO_WD;
+ WORD32 ypos = ps_codec->i4_disp_ht - 32 - LOGO_HT;
+
+ if(ypos < 0)
+ ypos = 0;
+
+ if(xpos < 0)
+ xpos = 0;
+
+ INSERT_LOGO(ps_dec_ip->s_out_buffer.pu1_bufs[0],
+ ps_dec_ip->s_out_buffer.pu1_bufs[1],
+ ps_dec_ip->s_out_buffer.pu1_bufs[2], ps_codec->i4_disp_strd,
+ xpos,
+ ypos,
+ ps_codec->e_chroma_fmt,
+ ps_codec->i4_disp_wd,
+ ps_codec->i4_disp_ht);
+ }
+
+
+ return ret;
+}
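For context, this entry point is driven by a feed-and-drain loop on the application side. The sketch below is illustrative only (the helper name is hypothetical); it uses just the fields ihevcd_decode() reads and writes above, with buffer setup and detailed error handling elided:

/* Sketch: driving ihevcd_decode() over a bitstream held in memory. */
static void decode_stream(iv_obj_t *ps_codec_obj,
                          UWORD8 *pu1_bits,
                          UWORD32 u4_len,
                          ivd_video_decode_ip_t *ps_ip,
                          ivd_video_decode_op_t *ps_op)
{
    while(u4_len > MIN_START_CODE_LEN)
    {
        WORD32 ret;

        ps_ip->pv_stream_buffer = pu1_bits;
        ps_ip->u4_num_Bytes = u4_len;

        ret = ihevcd_decode(ps_codec_obj, ps_ip, ps_op);

        /* Stop if the decoder made no forward progress */
        if((IV_SUCCESS != ret) && (0 == ps_op->u4_num_bytes_consumed))
            break;

        pu1_bits += ps_op->u4_num_bytes_consumed;
        u4_len -= ps_op->u4_num_bytes_consumed;

        if(ps_op->u4_output_present)
        {
            /* A displayable frame is available in s_out_buffer */
        }
    }
}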
+
diff --git a/decoder/ihevcd_decode.h b/decoder/ihevcd_decode.h
new file mode 100644
index 0000000..dfe6d5f
--- /dev/null
+++ b/decoder/ihevcd_decode.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_decode.h
+*
+* @brief
+* Header for main decode function
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_DECODE_H_
+#define _IHEVCD_DECODE_H_
+
+WORD32 ihevcd_decode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op);
+
+#endif /* _IHEVCD_DECODE_H_ */
diff --git a/decoder/ihevcd_defs.h b/decoder/ihevcd_defs.h
new file mode 100644
index 0000000..dec341c
--- /dev/null
+++ b/decoder/ihevcd_defs.h
@@ -0,0 +1,481 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_defs.h
+*
+* @brief
+* Definitions used in the decoder
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_DEFS_H_
+#define _IHEVCD_DEFS_H_
+
+
+/*****************************************************************************/
+/* Width and height restrictions */
+/*****************************************************************************/
+/**
+ * Minimum width supported by codec
+ */
+#define MIN_WD 64
+
+/**
+ * Maximum width supported by codec
+ */
+
+#define MAX_WD 8448
+
+/**
+ * Minimum height supported by codec
+ */
+#define MIN_HT 64
+
+/**
+ * Maximum height supported by codec
+ */
+
+#define MAX_HT 4320
+
+/*****************************************************************************/
+/* Padding sizes */
+/*****************************************************************************/
+/**
+ * Padding used for top of the frame
+ */
+#define PAD_TOP 80
+
+/**
+ * Padding used for bottom of the frame
+ */
+#define PAD_BOT 80
+
+/**
+ * Padding used at left of the frame
+ */
+#define PAD_LEFT 80
+
+/**
+ * Padding used at right of the frame
+ */
+#define PAD_RIGHT 80
+/**
+ * Padding for width
+ */
+#define PAD_WD (PAD_LEFT + PAD_RIGHT)
+/**
+ * Padding for height
+ */
+#define PAD_HT (PAD_TOP + PAD_BOT)
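Taken together, the pad totals describe the border a reference plane carries around the active picture area. A minimal sketch of that arithmetic, assuming only the PAD_* defines above (the decoder's actual allocator may align and size differently):

/* Sketch: padded luma plane size for a wd x ht picture. */
static WORD32 ihevcd_padded_luma_size(WORD32 wd, WORD32 ht)
{
    return (wd + PAD_WD) * (ht + PAD_HT);
}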
+
+/*****************************************************************************/
+/* Number of frame restrictions */
+/*****************************************************************************/
+/**
+ * Maximum number of reference buffers in DPB manager
+ */
+#define MAX_REF_CNT 32
+
+/*****************************************************************************/
+/* Num cores related defs */
+/*****************************************************************************/
+/**
+ * Maximum number of cores
+ */
+#define MAX_NUM_CORES 8
+
+/**
+ * Maximum number of threads for pixel processing
+ */
+#define MAX_PROCESS_THREADS MAX_NUM_CORES
+
+/*****************************************************************************/
+/* Profile and level restrictions */
+/*****************************************************************************/
+/**
+ * Max level supported by the codec
+ */
+#define MAX_LEVEL IHEVC_LEVEL_62
+/**
+ * Min level supported by the codec
+ */
+
+#define MIN_LEVEL IHEVC_LEVEL_10
+
+
+/**
+ * Maximum number of slice headers that are held in memory simultaneously
+ * For a single-core implementation, one slice header is enough.
+ * For multi-core, the parsing thread needs to ensure that slice headers are
+ * stored till the last CB in a slice is decoded: it has to wait till the last
+ * CB of a slice is consumed before reusing/overwriting the slice header.
+ * MAX_SLICE_HDR_CNT is assumed to be a power of 2
+ */
+
+#define LOG2_MAX_SLICE_HDR_CNT 8
+#define MAX_SLICE_HDR_CNT (1 << LOG2_MAX_SLICE_HDR_CNT)
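Because MAX_SLICE_HDR_CNT is a power of two, a running slice index wraps into the header array with a mask instead of a modulo; this is the idiom ihevcd_decode() uses when indexing ps_slice_hdr_base. A minimal sketch (the helper name is hypothetical):

/* Sketch: power-of-two ring indexing into the slice header array.
 * Equivalent to (i4_slice_idx % MAX_SLICE_HDR_CNT) for non-negative
 * indices, without a division. */
static WORD32 ihevcd_wrap_slice_idx(WORD32 i4_slice_idx)
{
    return i4_slice_idx & (MAX_SLICE_HDR_CNT - 1);
}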
+
+/* Number of NOP instructions to wait before yielding in process thread */
+#define PROC_NOP_CNT (8 * 128)
+
+
+/** Max QP delta that can be signalled */
+#define TU_MAX_QP_DELTA_ABS 5
+
+/** Max QP delta context increment that can be used for CABAC context */
+#define CTXT_MAX_QP_DELTA_ABS 1
+
+/*
+ * Flag indicating whether to perform in-loop filtering (ILF) at frame level or CTB level
+ */
+#define FRAME_ILF_PAD 0
+
+#define MAX_NUM_CTBS_IN_FRAME (MAX_WD * MAX_HT / MIN_CTB_SIZE / MIN_CTB_SIZE)
+
+/* Maximum slice segments allowed per frame in Level 6.2 */
+#define MAX_SLICE_SEGMENTS_IN_FRAME 600
+
+#ifdef GPU_BUILD
+/**
+ * Buffer allocated for ps_tu is re-used after RESET_TU_BUF_NCTB
+ * Set this to MAX_NUM_CTBS_IN_FRAME to disable reuse
+ * If built for GPU, always set to maximum.
+ */
+#define RESET_TU_BUF_NCTB MAX_NUM_CTBS_IN_FRAME
+#else
+/**
+ * Buffer allocated for ps_tu is re-used after RESET_TU_BUF_NCTB
+ * Set this to MAX_NUM_CTBS_IN_FRAME to disable reuse
+ */
+#define RESET_TU_BUF_NCTB MAX_NUM_CTBS_IN_FRAME
+#endif
+/**
+ * Flag whether to shift the CTB for SAO
+ */
+#define SAO_PROCESS_SHIFT_CTB 1
+
+/**
+ * Minimum bitstream buffer size
+ */
+#define MIN_BITSBUF_SIZE (1024 * 1024)
+/**
+ *****************************************************************************
+ * Macro to compute the total size required to hold one set of scaling matrices
+ *****************************************************************************
+ */
+#define SCALING_MAT_SIZE(m_scaling_mat_size) \
+{ \
+ m_scaling_mat_size = 6 * TRANS_SIZE_4 * TRANS_SIZE_4; \
+ m_scaling_mat_size += 6 * TRANS_SIZE_8 * TRANS_SIZE_8; \
+ m_scaling_mat_size += 6 * TRANS_SIZE_16 * TRANS_SIZE_16; \
+ m_scaling_mat_size += 2 * TRANS_SIZE_32 * TRANS_SIZE_32; \
+}
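Expanding the macro with the usual HEVC transform sizes gives 6 x 16 + 6 x 64 + 6 x 256 + 2 x 1024 = 4064 entries per set; a sketch of that arithmetic, assuming TRANS_SIZE_4/8/16/32 are 4, 8, 16 and 32:

/* Sketch: number of entries in one set of scaling matrices. */
static WORD32 ihevcd_scaling_mat_entries(void)
{
    WORD32 size = 6 * 4 * 4;    /*   96 entries for the six 4x4 lists   */
    size += 6 * 8 * 8;          /*  384 entries for the six 8x8 lists   */
    size += 6 * 16 * 16;        /* 1536 entries for the six 16x16 lists */
    size += 2 * 32 * 32;        /* 2048 entries for the two 32x32 lists */
    return size;                /* 4064 entries in total                */
}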
+
+/** If num_cores is greater than MV_PRED_NUM_CORES_THRESHOLD, then mv pred and
+    boundary strength computation is done on the process side instead of the parse side */
+#define MV_PRED_NUM_CORES_THRESHOLD 4
+
+/**
+ ***************************************************************************
+ * Enum to hold the various mem records being requested
+ ****************************************************************************
+ */
+enum
+{
+ /**
+ * Codec Object at API level
+ */
+ MEM_REC_IV_OBJ,
+
+ /**
+ * Codec context
+ */
+ MEM_REC_CODEC,
+
+ /**
+ * Bitstream buffer which holds emulation prevention removed bytes
+ */
+ MEM_REC_BITSBUF,
+
+ /**
+ * Buffer to hold TU structures and coeff data
+ */
+ MEM_REC_TU_DATA,
+
+ /**
+ * Motion vector bank
+ */
+ MEM_REC_MVBANK,
+
+ /**
+ * Holds mem records passed to the codec.
+ */
+ MEM_REC_BACKUP,
+
+ /**
+ * Holds VPS
+ */
+ MEM_REC_VPS,
+
+ /**
+ * Holds SPS
+ */
+ MEM_REC_SPS,
+
+ /**
+ * Holds PPS
+ */
+ MEM_REC_PPS,
+
+ /**
+ * Holds Slice Headers
+ */
+ MEM_REC_SLICE_HDR,
+
+ /**
+ * Holds tile information such as start position, widths and heights
+ */
+ MEM_REC_TILE,
+
+ /**
+ * Holds entry point offsets for tiles and entropy sync points
+ */
+ MEM_REC_ENTRY_OFST,
+
+ /**
+ * Holds scaling matrices
+ */
+ MEM_REC_SCALING_MAT,
+
+ /**
+     * Holds one row of skip_flag at 8x8 level used during parsing
+ */
+ MEM_REC_PARSE_SKIP_FLAG,
+
+ /**
+     * Holds one row of ctb_tree_depth at 8x8 level used during parsing
+ */
+ MEM_REC_PARSE_CT_DEPTH,
+
+ /**
+     * Holds one row of luma intra pred modes at 8x8 level used during parsing
+ */
+ MEM_REC_PARSE_INTRA_PRED_MODE,
+
+ /**
+ * Holds intra flag at 8x8 level for entire frame
+     * This is kept at frame level so that the processing thread can also use
+     * this data during intra prediction and BS computation
+ */
+ MEM_REC_INTRA_FLAG,
+
+ /**
+ * Holds transquant bypass flag at 8x8 level for entire frame
+     * This is kept at frame level so that the processing thread can also use it
+     */
+ MEM_REC_TRANSQUANT_BYPASS_FLAG,
+
+ /**
+ * Holds thread handles
+ */
+ MEM_REC_THREAD_HANDLE,
+
+ /**
+ * Holds memory for Process JOB Queue
+ */
+ MEM_REC_PROC_JOBQ,
+
+ /**
+     * Contains a status map indicating parse status on a per-CTB basis
+ */
+ MEM_REC_PARSE_MAP,
+
+ /**
+     * Contains a status map indicating processing status on a per-CTB basis
+ */
+ MEM_REC_PROC_MAP,
+
+ /**
+ * Holds display buffer manager context
+ */
+ MEM_REC_DISP_MGR,
+
+ /**
+ * Holds dpb manager context
+ */
+ MEM_REC_DPB_MGR,
+
+ /**
+ * Holds top and left neighbors' pu_idx array w.r.t picture level pu array
+ */
+ MEM_REC_PIC_PU_IDX_NEIGHBOR,
+
+ /**
+ * Holds intermediate buffers needed during processing stage
+ * Memory for process contexts is allocated in this memtab
+ */
+ MEM_REC_PROC_SCRATCH,
+
+ /**
+ * Holds intermediate buffers needed during SAO processing
+ */
+ MEM_REC_SAO_SCRATCH,
+
+ /**
+ * Holds buffers for vert_bs, horz_bs and QP (all frame level)
+ */
+ MEM_REC_BS_QP,
+
+ /**
+     * Contains a slice map indicating the slice index for each CTB
+ */
+ MEM_REC_TILE_IDX,
+
+ /**
+ * Holds buffers for array of SAO structures
+ */
+ MEM_REC_SAO,
+
+#ifdef GPU_BUILD
+ /**
+ * Holds buffer GPU context
+ */
+ MEM_REC_GPU,
+#endif
+ /**
+ * Holds picture buffer manager context and array of pic_buf_ts
+ * Also holds reference picture buffers in non-shared mode
+ */
+ MEM_REC_REF_PIC,
+
+
+
+ /**
+ * Place holder to compute number of memory records.
+ */
+ MEM_REC_CNT
+ /* Do not add anything below */
+};
+
+
+
+#define DISABLE_DEBLOCK_INTERVAL 8
+#define DISABLE_SAO_INTERVAL 8
+
+/**
+ ****************************************************************************
+ * Disable deblock levels
+ * Level 0 enables deblocking completely and level 4 disables completely
+ * Other levels are intermediate values to control deblocking level
+ ****************************************************************************
+ */
+enum
+{
+ /**
+ * Enable deblocking completely
+ */
+ DISABLE_DEBLK_LEVEL_0,
+ /**
+ * Disable only within CTB edges - Not supported currently
+ */
+ DISABLE_DEBLK_LEVEL_1,
+
+ /**
+     * Enable deblocking once every DISABLE_DEBLOCK_INTERVAL pictures
+ * and for I slices
+ */
+ DISABLE_DEBLK_LEVEL_2,
+
+ /**
+ * Enable deblocking only for I slices
+ */
+ DISABLE_DEBLK_LEVEL_3,
+
+ /**
+ * Disable deblocking completely
+ */
+ DISABLE_DEBLK_LEVEL_4
+};
+
+enum
+{
+ /**
+     * Enable SAO completely
+ */
+ DISABLE_SAO_LEVEL_0,
+ /**
+ * Disable only within CTB edges - Not supported currently
+ */
+ DISABLE_SAO_LEVEL_1,
+
+ /**
+     * Enable SAO once every DISABLE_SAO_INTERVAL pictures
+ * and for I slices
+ */
+ DISABLE_SAO_LEVEL_2,
+
+ /**
+     * Enable SAO only for I slices
+ */
+ DISABLE_SAO_LEVEL_3,
+
+ /**
+     * Disable SAO completely
+ */
+ DISABLE_SAO_LEVEL_4
+};
+
+/**
+ ****************************************************************************
+ * Number of buffers for I/O based on format
+ ****************************************************************************
+ */
+#define MIN_IN_BUFS 1
+#define MIN_OUT_BUFS_420 3
+#define MIN_OUT_BUFS_422ILE 1
+#define MIN_OUT_BUFS_RGB565 1
+#define MIN_OUT_BUFS_RGBA8888 1
+#define MIN_OUT_BUFS_420SP 2
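MIN_OUT_BUFS_420, for instance, splits a 4:2:0 frame across Y, U and V planes; a sketch of the corresponding minimum sizes for a wd x ht frame, assuming standard 4:2:0 subsampling (the helper is illustrative, not part of the patch):

/* Sketch: minimum output buffer sizes for 420P output. */
static void ihevcd_min_out_buf_sizes_420p(WORD32 wd, WORD32 ht,
                                          WORD32 ai4_size[3])
{
    ai4_size[0] = wd * ht;          /* Y plane               */
    ai4_size[1] = (wd * ht) >> 2;   /* U plane (wd/2 x ht/2) */
    ai4_size[2] = (wd * ht) >> 2;   /* V plane (wd/2 x ht/2) */
}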
+
+/**
+ ****************************************************************************
+ * Definitions related to MV pred mv merge
+ ****************************************************************************
+ */
+#define MAX_NUM_MERGE_CAND 5
+
+#define MAX_NUM_MV_NBR 5
+
+#define MAX_MVP_LIST_CAND 2
+#define MAX_MVP_LIST_CAND_MEM (MAX_MVP_LIST_CAND + 1)
+
+
+
+#endif /*_IHEVCD_DEFS_H_*/
diff --git a/decoder/ihevcd_error.h b/decoder/ihevcd_error.h
new file mode 100644
index 0000000..7d2b255
--- /dev/null
+++ b/decoder/ihevcd_error.h
@@ -0,0 +1,127 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_error.h
+*
+* @brief
+* Definitions related to error handling
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_ERROR_H_
+#define _IHEVCD_ERROR_H_
+
+/**
+ * Enumerations for error codes used in the codec.
+ * Not all these are expected to be returned to the application.
+ * Only select few will be exported
+ */
+typedef enum
+{
+ /**
+ * VPS id more than MAX_VPS_CNT
+ */
+ IHEVCD_UNSUPPORTED_VPS_ID = IVD_DUMMY_ELEMENT_FOR_CODEC_EXTENSIONS + 0x300,
+ /**
+ * SPS id more than MAX_SPS_CNT
+ */
+
+ IHEVCD_UNSUPPORTED_SPS_ID,
+ /**
+ * PPS id more than MAX_PPS_CNT
+ */
+
+ IHEVCD_UNSUPPORTED_PPS_ID,
+
+ /**
+     * Invalid parameter while decoding
+ */
+ IHEVCD_INVALID_PARAMETER,
+
+ /**
+ * Invalid header
+ */
+ IHEVCD_INVALID_HEADER,
+
+ /**
+     * Insufficient memory allocated for MV bank
+ */
+ IHEVCD_INSUFFICIENT_MEM_MVBANK,
+
+ /**
+     * Insufficient memory allocated for picture buffers
+ */
+ IHEVCD_INSUFFICIENT_MEM_PICBUF,
+
+ /**
+ * Buffer manager error
+ */
+ IHEVCD_BUF_MGR_ERROR,
+
+ /**
+ * No free MV Bank buffer available to store current pic
+ */
+ IHEVCD_NO_FREE_MVBANK,
+
+ /**
+ * No free picture buffer available to store current pic
+ */
+ IHEVCD_NO_FREE_PICBUF,
+ /**
+ * Reached slice header in header mode
+ */
+ IHEVCD_SLICE_IN_HEADER_MODE,
+
+ /**
+ * Ignore current slice and continue
+ */
+ IHEVCD_IGNORE_SLICE,
+
+ /**
+ * Reference Picture not found
+ */
+ IHEVCD_REF_PIC_NOT_FOUND,
+
+ /**
+ * Bit depth is greater than 8
+ */
+ IHEVCD_UNSUPPORTED_BIT_DEPTH,
+
+ /**
+ * Limit on the number of frames decoded
+ */
+ IHEVCD_NUM_FRAMES_LIMIT_REACHED,
+
+ /**
+ * VUI parameters not found
+ */
+ IHEVCD_VUI_PARAMS_NOT_FOUND,
+
+}IHEVCD_ERROR_T;
+#endif /* _IHEVCD_ERROR_H_ */
diff --git a/decoder/ihevcd_fmt_conv.c b/decoder/ihevcd_fmt_conv.c
new file mode 100644
index 0000000..df62355
--- /dev/null
+++ b/decoder/ihevcd_fmt_conv.c
@@ -0,0 +1,909 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_fmt_conv.c
+*
+* @brief
+* Contains functions for format conversion or frame copy of output buffer
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_profile.h"
+
+/**
+*******************************************************************************
+*
+* @brief Function used for converting a 420SP buffer to RGB565
+*
+* @par Description
+* Function used for converting a 420SP buffer to RGB565
+*
+* @param[in] pu1_y_src
+* Input Y pointer
+*
+* @param[in] pu1_uv_src
+* Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[in] pu2_rgb_dst
+* Output RGB565 pointer
+*
+* @param[in] wd
+* Width
+*
+* @param[in] ht
+* Height
+*
+* @param[in] src_y_strd
+* Input Y Stride
+*
+* @param[in] src_uv_strd
+* Input UV stride
+*
+* @param[in] dst_strd
+* Output stride
+*
+* @param[in] is_u_first
+* Flag to indicate if U is the first byte in the input chroma part
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame conversion, it
+* can be done by passing appropriate source and destination pointers and
+* appropriate values for wd and ht
+*
+*******************************************************************************
+*/
+void ihevcd_fmt_conv_420sp_to_rgb565(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD16 *pu2_rgb_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_strd,
+ WORD32 is_u_first)
+{
+
+
+ WORD16 i2_r, i2_g, i2_b;
+ UWORD32 u4_r, u4_g, u4_b;
+ WORD16 i2_i, i2_j;
+ UWORD8 *pu1_y_src_nxt;
+ UWORD16 *pu2_rgb_dst_NextRow;
+
+ UWORD8 *pu1_u_src, *pu1_v_src;
+
+ if(is_u_first)
+ {
+ pu1_u_src = (UWORD8 *)pu1_uv_src;
+ pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+ }
+ else
+ {
+ pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+ pu1_v_src = (UWORD8 *)pu1_uv_src;
+ }
+
+ pu1_y_src_nxt = pu1_y_src + src_y_strd;
+ pu2_rgb_dst_NextRow = pu2_rgb_dst + dst_strd;
+
+ for(i2_i = 0; i2_i < (ht >> 1); i2_i++)
+ {
+ for(i2_j = (wd >> 1); i2_j > 0; i2_j--)
+ {
+ i2_b = ((*pu1_u_src - 128) * COEFF4 >> 13);
+ i2_g = ((*pu1_u_src - 128) * COEFF2 + (*pu1_v_src - 128) * COEFF3) >> 13;
+ i2_r = ((*pu1_v_src - 128) * COEFF1) >> 13;
+
+ pu1_u_src += 2;
+ pu1_v_src += 2;
+ /* pixel 0 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src + i2_b);
+ u4_b >>= 3;
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src + i2_g);
+ u4_g >>= 2;
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src + i2_r);
+ u4_r >>= 3;
+
+ pu1_y_src++;
+ *pu2_rgb_dst++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+ /* pixel 1 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src + i2_b);
+ u4_b >>= 3;
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src + i2_g);
+ u4_g >>= 2;
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src + i2_r);
+ u4_r >>= 3;
+
+ pu1_y_src++;
+ *pu2_rgb_dst++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+ /* pixel 2 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+ u4_b >>= 3;
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+ u4_g >>= 2;
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+ u4_r >>= 3;
+
+ pu1_y_src_nxt++;
+ *pu2_rgb_dst_NextRow++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+ /* pixel 3 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+ u4_b >>= 3;
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+ u4_g >>= 2;
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+ u4_r >>= 3;
+
+ pu1_y_src_nxt++;
+ *pu2_rgb_dst_NextRow++ = ((u4_r << 11) | (u4_g << 5) | u4_b);
+
+ }
+
+ pu1_u_src = pu1_u_src + src_uv_strd - wd;
+ pu1_v_src = pu1_v_src + src_uv_strd - wd;
+
+ pu1_y_src = pu1_y_src + (src_y_strd << 1) - wd;
+ pu1_y_src_nxt = pu1_y_src_nxt + (src_y_strd << 1) - wd;
+
+ pu2_rgb_dst = pu2_rgb_dst_NextRow - wd + dst_strd;
+ pu2_rgb_dst_NextRow = pu2_rgb_dst_NextRow + (dst_strd << 1) - wd;
+ }
+
+
+}
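COEFF1..COEFF4 above are Q13 fixed-point factors of the YUV-to-RGB transform, defined in a header outside this hunk. A scalar sketch of the per-pixel math with illustrative BT.601 values (the library's actual constants, including their sign convention, may differ):

/* Sketch: one RGB565 pixel from Y, U, V in Q13 fixed point. The
 * coefficients are BT.601 factors scaled by 8192 (1 << 13). */
static UWORD16 ihevcd_yuv_to_rgb565_pixel(WORD32 y, WORD32 u, WORD32 v)
{
    WORD32 r = y + (((v - 128) * 11485) >> 13);                   /* 1.402 */
    WORD32 g = y - (((u - 128) * 2819 + (v - 128) * 5850) >> 13); /* 0.344, 0.714 */
    WORD32 b = y + (((u - 128) * 14516) >> 13);                   /* 1.772 */

    r = CLIP_U8(r);
    g = CLIP_U8(g);
    b = CLIP_U8(b);

    /* Pack as 5-6-5: drop 3 bits of R and B, 2 bits of G */
    return (UWORD16)((r >> 3) << 11 | (g >> 2) << 5 | (b >> 3));
}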
+
+void ihevcd_fmt_conv_420sp_to_rgba8888(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD32 *pu4_rgba_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_strd,
+ WORD32 is_u_first)
+{
+
+
+ WORD16 i2_r, i2_g, i2_b;
+ UWORD32 u4_r, u4_g, u4_b;
+ WORD16 i2_i, i2_j;
+ UWORD8 *pu1_y_src_nxt;
+ UWORD32 *pu4_rgba_dst_NextRow;
+
+ UWORD8 *pu1_u_src, *pu1_v_src;
+
+ if(is_u_first)
+ {
+ pu1_u_src = (UWORD8 *)pu1_uv_src;
+ pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+ }
+ else
+ {
+ pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+ pu1_v_src = (UWORD8 *)pu1_uv_src;
+ }
+
+ pu1_y_src_nxt = pu1_y_src + src_y_strd;
+ pu4_rgba_dst_NextRow = pu4_rgba_dst + dst_strd;
+
+ for(i2_i = 0; i2_i < (ht >> 1); i2_i++)
+ {
+ for(i2_j = (wd >> 1); i2_j > 0; i2_j--)
+ {
+ i2_b = ((*pu1_u_src - 128) * COEFF4 >> 13);
+ i2_g = ((*pu1_u_src - 128) * COEFF2 + (*pu1_v_src - 128) * COEFF3) >> 13;
+ i2_r = ((*pu1_v_src - 128) * COEFF1) >> 13;
+
+ pu1_u_src += 2;
+ pu1_v_src += 2;
+ /* pixel 0 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src + i2_b);
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src + i2_g);
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src + i2_r);
+
+ pu1_y_src++;
+ *pu4_rgba_dst++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+ /* pixel 1 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src + i2_b);
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src + i2_g);
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src + i2_r);
+
+ pu1_y_src++;
+ *pu4_rgba_dst++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+ /* pixel 2 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+
+ pu1_y_src_nxt++;
+ *pu4_rgba_dst_NextRow++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+ /* pixel 3 */
+ /* B */
+ u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b);
+ /* G */
+ u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g);
+ /* R */
+ u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r);
+
+ pu1_y_src_nxt++;
+ *pu4_rgba_dst_NextRow++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0));
+
+ }
+
+ pu1_u_src = pu1_u_src + src_uv_strd - wd;
+ pu1_v_src = pu1_v_src + src_uv_strd - wd;
+
+ pu1_y_src = pu1_y_src + (src_y_strd << 1) - wd;
+ pu1_y_src_nxt = pu1_y_src_nxt + (src_y_strd << 1) - wd;
+
+ pu4_rgba_dst = pu4_rgba_dst_NextRow - wd + dst_strd;
+ pu4_rgba_dst_NextRow = pu4_rgba_dst_NextRow + (dst_strd << 1) - wd;
+ }
+
+
+}
+
+/**
+*******************************************************************************
+*
+* @brief Function used for copying a 420SP buffer
+*
+* @par Description
+* Function used for copying a 420SP buffer
+*
+* @param[in] pu1_y_src
+* Input Y pointer
+*
+* @param[in] pu1_uv_src
+* Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[in] pu1_y_dst
+* Output Y pointer
+*
+* @param[in] pu1_uv_dst
+* Output UV pointer (UV is interleaved in the same format as that of input)
+*
+* @param[in] wd
+* Width
+*
+* @param[in] ht
+* Height
+*
+* @param[in] src_y_strd
+* Input Y Stride
+*
+* @param[in] src_uv_strd
+* Input UV stride
+*
+* @param[in] dst_y_strd
+* Output Y stride
+*
+* @param[in] dst_uv_strd
+* Output UV stride
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame copy, it can
+* be done by passing appropriate source and destination pointers and
+* appropriate values for wd and ht
+*
+*******************************************************************************
+*/
+
+void ihevcd_fmt_conv_420sp_to_420sp(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_uv_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_y_strd,
+ WORD32 dst_uv_strd)
+{
+ UWORD8 *pu1_src, *pu1_dst;
+ WORD32 num_rows, num_cols, src_strd, dst_strd;
+ WORD32 i;
+
+ /* copy luma */
+ pu1_src = (UWORD8 *)pu1_y_src;
+ pu1_dst = (UWORD8 *)pu1_y_dst;
+
+ num_rows = ht;
+ num_cols = wd;
+
+ src_strd = src_y_strd;
+ dst_strd = dst_y_strd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ memcpy(pu1_dst, pu1_src, num_cols);
+ pu1_dst += dst_strd;
+ pu1_src += src_strd;
+ }
+
+ /* copy U and V */
+ pu1_src = (UWORD8 *)pu1_uv_src;
+ pu1_dst = (UWORD8 *)pu1_uv_dst;
+
+ num_rows = ht >> 1;
+ num_cols = wd;
+
+ src_strd = src_uv_strd;
+ dst_strd = dst_uv_strd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ memcpy(pu1_dst, pu1_src, num_cols);
+ pu1_dst += dst_strd;
+ pu1_src += src_strd;
+ }
+ return;
+}
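As the remarks note, a partial copy needs only offset pointers and a reduced height; a sketch copying rows [start_row, start_row + num_rows) with the routine above (the helper name is hypothetical, and start_row is assumed even so the chroma offset stays aligned):

/* Sketch: partial 420SP frame copy via ihevcd_fmt_conv_420sp_to_420sp().
 * Chroma has half as many rows as luma, hence the >> 1 on the offset. */
static void ihevcd_copy_rows_420sp(UWORD8 *pu1_y_src, UWORD8 *pu1_uv_src,
                                   UWORD8 *pu1_y_dst, UWORD8 *pu1_uv_dst,
                                   WORD32 wd, WORD32 start_row,
                                   WORD32 num_rows,
                                   WORD32 src_strd, WORD32 dst_strd)
{
    ihevcd_fmt_conv_420sp_to_420sp(pu1_y_src + start_row * src_strd,
                                   pu1_uv_src + (start_row >> 1) * src_strd,
                                   pu1_y_dst + start_row * dst_strd,
                                   pu1_uv_dst + (start_row >> 1) * dst_strd,
                                   wd, num_rows,
                                   src_strd, src_strd,
                                   dst_strd, dst_strd);
}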
+
+
+
+/**
+*******************************************************************************
+*
+* @brief Function used for copying a 420SP buffer with U and V swapped
+*
+* @par Description
+* Function used for copying a 420SP buffer while swapping the U and V bytes
+*
+* @param[in] pu1_y_src
+* Input Y pointer
+*
+* @param[in] pu1_uv_src
+* Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[in] pu1_y_dst
+* Output Y pointer
+*
+* @param[in] pu1_uv_dst
+* Output UV pointer (UV is interleaved in the same format as that of input)
+*
+* @param[in] wd
+* Width
+*
+* @param[in] ht
+* Height
+*
+* @param[in] src_y_strd
+* Input Y Stride
+*
+* @param[in] src_uv_strd
+* Input UV stride
+*
+* @param[in] dst_y_strd
+* Output Y stride
+*
+* @param[in] dst_uv_strd
+* Output UV stride
+*
+* @returns None
+*
+* @remarks In case there is a need to perform a partial frame copy, it can
+* be done by passing appropriate source and destination pointers and
+* appropriate values for wd and ht
+*
+*******************************************************************************
+*/
+void ihevcd_fmt_conv_420sp_to_420sp_swap_uv(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_uv_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_y_strd,
+ WORD32 dst_uv_strd)
+{
+ UWORD8 *pu1_src, *pu1_dst;
+ WORD32 num_rows, num_cols, src_strd, dst_strd;
+ WORD32 i;
+
+ /* copy luma */
+ pu1_src = (UWORD8 *)pu1_y_src;
+ pu1_dst = (UWORD8 *)pu1_y_dst;
+
+ num_rows = ht;
+ num_cols = wd;
+
+ src_strd = src_y_strd;
+ dst_strd = dst_y_strd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ memcpy(pu1_dst, pu1_src, num_cols);
+ pu1_dst += dst_strd;
+ pu1_src += src_strd;
+ }
+
+ /* copy U and V */
+ pu1_src = (UWORD8 *)pu1_uv_src;
+ pu1_dst = (UWORD8 *)pu1_uv_dst;
+
+ num_rows = ht >> 1;
+ num_cols = wd;
+
+ src_strd = src_uv_strd;
+ dst_strd = dst_uv_strd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ WORD32 j;
+ for(j = 0; j < num_cols; j += 2)
+ {
+ pu1_dst[j + 0] = pu1_src[j + 1];
+ pu1_dst[j + 1] = pu1_src[j + 0];
+ }
+ pu1_dst += dst_strd;
+ pu1_src += src_strd;
+ }
+ return;
+}
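+
+/* Worked illustration of the swap loop above: an input chroma row laid out
+ * as U0 V0 U1 V1 ... (e.g. NV12) is written out as V0 U0 V1 U1 ... (NV21),
+ * i.e. each interleaved byte pair is exchanged while luma is copied
+ * unchanged. wd is assumed to be even, as it always is for 420SP chroma. */
+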
+/**
+*******************************************************************************
+*
+* @brief Function used for converting a 420SP buffer to 420P
+*
+* @par Description
+* Function used for converting a 420SP buffer to 420P
+*
+* @param[in] pu1_y_src
+* Input Y pointer
+*
+* @param[in] pu1_uv_src
+* Input UV pointer (UV is interleaved either in UV or VU format)
+*
+* @param[in] pu1_y_dst
+* Output Y pointer
+*
+* @param[in] pu1_u_dst
+* Output U pointer
+*
+* @param[in] pu1_v_dst
+* Output V pointer
+*
+* @param[in] wd
+* Width
+*
+* @param[in] ht
+* Height
+*
+* @param[in] src_y_strd
+* Input Y Stride
+*
+* @param[in] src_uv_strd
+* Input UV stride
+*
+* @param[in] dst_y_strd
+* Output Y stride
+*
+* @param[in] dst_uv_strd
+* Output U/V plane stride
+*
+* @param[in] is_u_first
+* Flag to indicate if U is the first byte in the input chroma part
+*
+* @param[in] disable_luma_copy
+* Flag to indicate if the luma copy has to be skipped
+*
+* @returns none
+*
+* @remarks In case there is a need to perform a partial frame copy, it can
+* be done by passing appropriate source and destination pointers and
+* appropriate values for wd and ht
+*
+*******************************************************************************
+*/
+
+
+void ihevcd_fmt_conv_420sp_to_420p(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_u_dst,
+ UWORD8 *pu1_v_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_y_strd,
+ WORD32 dst_uv_strd,
+ WORD32 is_u_first,
+ WORD32 disable_luma_copy)
+{
+ UWORD8 *pu1_src, *pu1_dst;
+ UWORD8 *pu1_u_src, *pu1_v_src;
+ WORD32 num_rows, num_cols, src_strd, dst_strd;
+ WORD32 i, j;
+
+ if(0 == disable_luma_copy)
+ {
+ /* copy luma */
+ pu1_src = (UWORD8 *)pu1_y_src;
+ pu1_dst = (UWORD8 *)pu1_y_dst;
+
+ num_rows = ht;
+ num_cols = wd;
+
+ src_strd = src_y_strd;
+ dst_strd = dst_y_strd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ memcpy(pu1_dst, pu1_src, num_cols);
+ pu1_dst += dst_strd;
+ pu1_src += src_strd;
+ }
+ }
+ /* de-interleave U and V and copy to destination */
+ if(is_u_first)
+ {
+ pu1_u_src = (UWORD8 *)pu1_uv_src;
+ pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+ }
+ else
+ {
+ pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+ pu1_v_src = (UWORD8 *)pu1_uv_src;
+ }
+
+
+ num_rows = ht >> 1;
+ num_cols = wd >> 1;
+
+ src_strd = src_uv_strd;
+ dst_strd = dst_uv_strd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ for(j = 0; j < num_cols; j++)
+ {
+ pu1_u_dst[j] = pu1_u_src[j * 2];
+ pu1_v_dst[j] = pu1_v_src[j * 2];
+ }
+
+ pu1_u_dst += dst_strd;
+ pu1_v_dst += dst_strd;
+ pu1_u_src += src_strd;
+ pu1_v_src += src_strd;
+ }
+ return;
+}
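+
+/* Worked illustration of the de-interleave above: with is_u_first = 1, an
+ * input chroma row U0 V0 U1 V1 ... yields pu1_u_dst = U0 U1 ... and
+ * pu1_v_dst = V0 V1 ...; with is_u_first = 0 only the starting offsets of
+ * pu1_u_src and pu1_v_src are exchanged and the copy loop is identical. */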
+
+
+
+/**
+*******************************************************************************
+*
+* @brief Function used for format conversion or frame copy
+*
+* @par Description
+* Function used for copying or converting a reference frame to the display buffer
+* in non shared mode
+*
+* @param[in] ps_codec
+* Pointer to the codec context
+*
+* @param[in] ps_proc
+* Pointer to the process context
+*
+* @param[in] pu1_y_dst
+* Output Y pointer
+*
+* @param[in] pu1_u_dst
+* Output U/UV pointer (UV is interleaved in the same format as that of the input)
+*
+* @param[in] pu1_v_dst
+* Output V pointer (used in the 420P output case)
+*
+* @param[in] cur_row
+* Row from which format conversion/copy starts
+*
+* @param[in] num_rows
+* Number of rows to be format converted/copied
+*
+* @returns Error from IHEVCD_ERROR_T
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
+ process_ctxt_t *ps_proc,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_u_dst,
+ UWORD8 *pu1_v_dst,
+ WORD32 cur_row,
+ WORD32 num_rows)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ pic_buf_t *ps_disp_pic;
+ UWORD8 *pu1_y_src, *pu1_uv_src;
+ UWORD8 *pu1_y_dst_tmp, *pu1_uv_dst_tmp;
+ UWORD8 *pu1_u_dst_tmp, *pu1_v_dst_tmp;
+ UWORD16 *pu2_rgb_dst_tmp;
+ UWORD32 *pu4_rgb_dst_tmp;
+ WORD32 is_u_first;
+ UWORD8 *pu1_luma;
+ UWORD8 *pu1_chroma;
+ sps_t *ps_sps;
+ WORD32 disable_luma_copy;
+ WORD32 crop_unit_x, crop_unit_y;
+
+ if(0 == num_rows)
+ return ret;
+
+ /* In case processing is disabled, then no need to format convert/copy */
+ PROFILE_DISABLE_FMT_CONV();
+ ps_sps = ps_proc->ps_sps;
+
+ crop_unit_x = 1;
+ crop_unit_y = 1;
+
+ if(CHROMA_FMT_IDC_YUV420 == ps_sps->i1_chroma_format_idc)
+ {
+ crop_unit_x = 2;
+ crop_unit_y = 2;
+ }
+
+ ps_disp_pic = ps_codec->ps_disp_buf;
+ pu1_luma = ps_disp_pic->pu1_luma;
+ pu1_chroma = ps_disp_pic->pu1_chroma;
+
+
+ /* Take care of cropping */
+ pu1_luma += ps_codec->i4_strd * ps_sps->i2_pic_crop_top_offset * crop_unit_y + ps_sps->i2_pic_crop_left_offset * crop_unit_x;
+
+ /* Left offset is multiplied by 2 because buffer is UV interleaved */
+ pu1_chroma += ps_codec->i4_strd * ps_sps->i2_pic_crop_top_offset + ps_sps->i2_pic_crop_left_offset * 2;
+
+
+ is_u_first = (IV_YUV_420SP_UV == ps_codec->e_ref_chroma_fmt) ? 1 : 0;
+
+ /* In case of 420P output luma copy is disabled for shared mode */
+ disable_luma_copy = 0;
+ if(1 == ps_codec->i4_share_disp_buf)
+ {
+ disable_luma_copy = 1;
+ }
+
+
+
+ {
+ pu1_y_src = pu1_luma + cur_row * ps_codec->i4_strd;
+ pu1_uv_src = pu1_chroma + (cur_row / 2) * ps_codec->i4_strd;
+
+ pu2_rgb_dst_tmp = (UWORD16 *)pu1_y_dst;
+ pu2_rgb_dst_tmp += cur_row * ps_codec->i4_disp_strd;
+ pu4_rgb_dst_tmp = (UWORD32 *)pu1_y_dst;
+ pu4_rgb_dst_tmp += cur_row * ps_codec->i4_disp_strd;
+ pu1_y_dst_tmp = pu1_y_dst + cur_row * ps_codec->i4_disp_strd;
+ pu1_uv_dst_tmp = pu1_u_dst + (cur_row / 2) * ps_codec->i4_disp_strd;
+ pu1_u_dst_tmp = pu1_u_dst + (cur_row / 2) * ps_codec->i4_disp_strd / 2;
+ pu1_v_dst_tmp = pu1_v_dst + (cur_row / 2) * ps_codec->i4_disp_strd / 2;
+
+        /* In a multi-threaded implementation, format conversion might be
+         * called before reconstruction is completed. If the frame being
+         * converted/copied is the same as the frame being reconstructed,
+         * check how many rows can be format converted, convert those rows,
+         * and then check for the remaining rows, and so on.
+         */
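+        /* Worked example (assuming 64x64 CTBs and no top crop): with
+         * cur_row = 0 and num_rows = 64, last_row = 64, so last_ctb_y starts
+         * as (64 >> 6) - 1 = 0; the two increments below then make the loop
+         * wait until CTB row 2 of the reconstructing frame is complete. */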
+
+ if((0 == ps_codec->i4_flush_mode) && (ps_codec->i4_disp_buf_id == ps_proc->i4_cur_pic_buf_id) && (1 < ps_codec->i4_num_cores))
+ {
+ WORD32 idx;
+ UWORD8 *pu1_buf;
+ WORD32 status;
+ WORD32 last_row = cur_row + num_rows;
+ WORD32 last_ctb_y;
+ UWORD32 ctb_in_row;
+
+ while(1)
+ {
+ last_row = cur_row + MAX(num_rows, (1 << ps_sps->i1_log2_ctb_size)) +
+ ps_sps->i2_pic_crop_top_offset * crop_unit_y;
+ last_ctb_y = (last_row >> ps_sps->i1_log2_ctb_size) - 1;
+                /* Since deblocking works with a shift of (-4, -4), wait till the next CTB row is processed */
+ last_ctb_y++;
+ /* In case of a conformance window, an extra wait of one row might be needed */
+ last_ctb_y++;
+ last_ctb_y = MIN(last_ctb_y, (ps_sps->i2_pic_ht_in_ctb - 1));
+
+ idx = (last_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+
+                /* Check if the row below is completely processed before proceeding with the format conversion */
+ status = 1;
+ for(ctb_in_row = 0; (WORD32)ctb_in_row < ps_sps->i2_pic_wd_in_ctb; ctb_in_row++)
+ {
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+ pu1_buf = (ps_codec->pu1_proc_map + idx + ctb_in_row);
+#endif
+ status &= *pu1_buf;
+ }
+
+ if(status)
+ {
+ break;
+ }
+ else
+ {
+ ithread_yield();
+ }
+ }
+ }
+
+
+ if((IV_YUV_420SP_UV == ps_codec->e_chroma_fmt) || (IV_YUV_420SP_VU == ps_codec->e_chroma_fmt))
+ {
+
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr(pu1_y_src, pu1_uv_src,
+ pu1_y_dst_tmp, pu1_uv_dst_tmp,
+ ps_codec->i4_disp_wd,
+ num_rows,
+ ps_codec->i4_strd,
+ ps_codec->i4_strd,
+ ps_codec->i4_disp_strd,
+ ps_codec->i4_disp_strd);
+ }
+ else if(IV_YUV_420P == ps_codec->e_chroma_fmt)
+ {
+
+ if(0 == disable_luma_copy)
+ {
+ // copy luma
+ WORD32 i;
+ WORD32 num_cols = ps_codec->i4_disp_wd;
+
+ for(i = 0; i < num_rows; i++)
+ {
+ memcpy(pu1_y_dst_tmp, pu1_y_src, num_cols);
+ pu1_y_dst_tmp += ps_codec->i4_disp_strd;
+ pu1_y_src += ps_codec->i4_strd;
+ }
+
+ disable_luma_copy = 1;
+ }
+
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr(pu1_y_src, pu1_uv_src,
+ pu1_y_dst_tmp, pu1_u_dst_tmp, pu1_v_dst_tmp,
+ ps_codec->i4_disp_wd,
+ num_rows,
+ ps_codec->i4_strd,
+ ps_codec->i4_strd,
+ ps_codec->i4_disp_strd,
+ (ps_codec->i4_disp_strd / 2),
+ is_u_first,
+ disable_luma_copy);
+
+ }
+ else if(IV_RGB_565 == ps_codec->e_chroma_fmt)
+ {
+
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr(pu1_y_src, pu1_uv_src,
+ pu2_rgb_dst_tmp,
+ ps_codec->i4_disp_wd,
+ num_rows,
+ ps_codec->i4_strd,
+ ps_codec->i4_strd,
+ ps_codec->i4_disp_strd,
+ is_u_first);
+
+ }
+ else if(IV_RGBA_8888 == ps_codec->e_chroma_fmt)
+ {
+ ASSERT(is_u_first == 1);
+
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr(pu1_y_src,
+ pu1_uv_src,
+ pu4_rgb_dst_tmp,
+ ps_codec->i4_disp_wd,
+ num_rows,
+ ps_codec->i4_strd,
+ ps_codec->i4_strd,
+ ps_codec->i4_disp_strd,
+ is_u_first);
+
+ }
+
+
+
+ }
+ return (ret);
+}
+
diff --git a/decoder/ihevcd_fmt_conv.h b/decoder/ihevcd_fmt_conv.h
new file mode 100644
index 0000000..e099218
--- /dev/null
+++ b/decoder/ihevcd_fmt_conv.h
@@ -0,0 +1,118 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_fmt_conv.h
+ *
+ * @brief
+ * Function declarations and typedefs used for format conversion
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_FMT_CONV_H_
+#define _IHEVCD_FMT_CONV_H_
+
+#define COEFF1 13073
+#define COEFF2 -3207
+#define COEFF3 -6664
+#define COEFF4 16530
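+
+/* These look like Q13 fixed-point YCbCr-to-RGB coefficients (an inference
+ * from the values, not from a spec in this file): 13073/8192 ~= 1.596,
+ * -3207/8192 ~= -0.391, -6664/8192 ~= -0.813 and 16530/8192 ~= 2.017, which
+ * match the familiar BT.601-style conversion
+ *     R = Y + (COEFF1 * (Cr - 128)) >> 13
+ *     G = Y + (COEFF2 * (Cb - 128) + COEFF3 * (Cr - 128)) >> 13
+ *     B = Y + (COEFF4 * (Cb - 128)) >> 13
+ * as used by the 420SP-to-RGB routines declared below. */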
+
+typedef void ihevcd_fmt_conv_420sp_to_rgba8888_ft(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD32 *pu4_rgba_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_strd,
+ WORD32 is_u_first);
+
+typedef void ihevcd_fmt_conv_420sp_to_rgb565_ft(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD16 *pu2_rgb_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_strd,
+ WORD32 is_u_first);
+
+
+typedef void ihevcd_fmt_conv_420sp_to_420sp_ft(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_uv_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_y_strd,
+ WORD32 dst_uv_strd);
+typedef void ihevcd_fmt_conv_420sp_to_420p_ft(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_u_dst,
+ UWORD8 *pu1_v_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_y_strd,
+ WORD32 dst_uv_strd,
+ WORD32 is_u_first,
+ WORD32 disable_luma_copy);
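+
+/* These typedefs give each routine and all of its architecture-specific
+ * variants a single signature, so one function pointer can hold any of them.
+ * A minimal sketch (hypothetical local variables):
+ *
+ *     ihevcd_fmt_conv_420sp_to_420sp_ft *pf_copy =
+ *                     &ihevcd_fmt_conv_420sp_to_420sp;   // C variant
+ *     pf_copy(pu1_y_src, pu1_uv_src, pu1_y_dst, pu1_uv_dst,
+ *             wd, ht, strd, strd, strd, strd);
+ */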
+
+/* C function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888;
+ihevcd_fmt_conv_420sp_to_rgb565_ft ihevcd_fmt_conv_420sp_to_rgb565;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p;
+
+/* A9Q function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888_a9q;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp_a9q;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_a9q;
+
+/* A9A function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888_a9a;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp_a9a;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_a9a;
+
+/* SSSE3 function declarations */
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_ssse3;
+
+/* SSE4.2 function declarations */
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_sse42;
+
+/* armv8 function declarations */
+ihevcd_fmt_conv_420sp_to_rgba8888_ft ihevcd_fmt_conv_420sp_to_rgba8888_av8;
+ihevcd_fmt_conv_420sp_to_420sp_ft ihevcd_fmt_conv_420sp_to_420sp_av8;
+ihevcd_fmt_conv_420sp_to_420p_ft ihevcd_fmt_conv_420sp_to_420p_av8;
+
+#endif /* _IHEVCD_FMT_CONV_H_ */
diff --git a/decoder/ihevcd_func_types.h b/decoder/ihevcd_func_types.h
new file mode 100644
index 0000000..232b979
--- /dev/null
+++ b/decoder/ihevcd_func_types.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_func_types.h
+*
+* @brief
+*  Defines different types of function implementations, e.g. C, Cortex-A8
+*  intrinsics, Neon assembly, etc.
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_FUNC_TYPES_H_
+#define _IHEVCD_FUNC_TYPES_H_
+
+
+/* C Model : No platform specific intrinsics or inline assemblies */
+#define C 0
+
+/* Cortex Ax intrinsics */
+#define CXAINTR 10
+
+/* Neon intrinsics */
+#define NEONINTR 11
+
+/* X86 intrinsics */
+#define X86INTR 12
+
+/* X64 intrinsics */
+#define X64INTR 13
+
+/* Atom intrinsics */
+#define ATOMINTR 14
+
+/* Cortex Ax assembly */
+#define CXAASM 20
+
+/* Neon assembly */
+#define NEONASM 21
+
+/* X86 assembly */
+#define X86ASM 22
+
+
+#endif /* _IHEVCD_FUNC_TYPES_H_ */
diff --git a/decoder/ihevcd_function_selector.h b/decoder/ihevcd_function_selector.h
new file mode 100644
index 0000000..e7d7eee
--- /dev/null
+++ b/decoder/ihevcd_function_selector.h
@@ -0,0 +1,189 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_function_selector.h
+ *
+ * @brief
+ * Function selector and architecture definitions used in the decoder
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_FUNCTION_SELECTOR_H_
+#define _IHEVCD_FUNCTION_SELECTOR_H_
+
+#include "ihevc_deblk.h"
+#include "ihevc_itrans.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_chroma_intra_pred.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_recon.h"
+#include "ihevc_intra_pred.h"
+#include "ihevc_inter_pred.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_padding.h"
+#include "ihevc_weighted_pred.h"
+#include "ihevc_sao.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_itrans_recon_dc.h"
+
+#define D_ARCH_NA 1
+#define D_ARCH_ARM_NONEON 2
+#define D_ARCH_ARM_A9Q 3
+#define D_ARCH_ARM_A9A 4
+#define D_ARCH_ARM_A9 5
+#define D_ARCH_ARM_A7 6
+#define D_ARCH_ARM_A5 7
+#define D_ARCH_ARM_A15 8
+#define D_ARCH_ARM_NEONINTR 9
+#define D_ARCH_ARMV8_GENERIC 10
+#define D_ARCH_X86_GENERIC 11
+#define D_ARCH_X86_SSSE3 12
+#define D_ARCH_X86_SSE42 13
+#define D_ARCH_X86_AVX2 14
+#define D_ARCH_MIPS_GENERIC 15
+#define D_ARCH_MIPS_32 16
+
+void ihevcd_init_arch(void *pv_codec);
+
+void ihevcd_init_function_ptr(void *pv_codec);
+
+void ihevcd_init_function_ptr_generic(void *pv_codec);
+void ihevcd_init_function_ptr_ssse3(void *pv_codec);
+void ihevcd_init_function_ptr_sse42(void *pv_codec);
+
+#ifndef DISABLE_AVX2
+void ihevcd_init_function_ptr_avx2(void *pv_codec);
+#endif
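+
+/* A sketch of the intended init flow (inferred from the declarations above;
+ * simplified, with error handling omitted):
+ *
+ *     void example_setup(void *pv_codec)
+ *     {
+ *         ihevcd_init_arch(pv_codec);          // detect the processor arch
+ *         ihevcd_init_function_ptr(pv_codec);  // fill func_selector_t below
+ *     }
+ */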
+
+typedef struct
+{
+ ihevc_deblk_chroma_horz_ft *ihevc_deblk_chroma_horz_fptr;
+ ihevc_deblk_chroma_vert_ft *ihevc_deblk_chroma_vert_fptr;
+ ihevc_deblk_luma_vert_ft *ihevc_deblk_luma_vert_fptr;
+ ihevc_deblk_luma_horz_ft *ihevc_deblk_luma_horz_fptr;
+
+ ihevc_inter_pred_ft *ihevc_inter_pred_chroma_copy_fptr;
+ ihevc_inter_pred_w16out_ft *ihevc_inter_pred_chroma_copy_w16out_fptr;
+ ihevc_inter_pred_ft *ihevc_inter_pred_chroma_horz_fptr;
+ ihevc_inter_pred_w16out_ft *ihevc_inter_pred_chroma_horz_w16out_fptr;
+ ihevc_inter_pred_ft *ihevc_inter_pred_chroma_vert_fptr;
+ ihevc_inter_pred_w16inp_ft *ihevc_inter_pred_chroma_vert_w16inp_fptr;
+ ihevc_inter_pred_w16inp_w16out_ft *ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr;
+ ihevc_inter_pred_w16out_ft *ihevc_inter_pred_chroma_vert_w16out_fptr;
+ ihevc_inter_pred_ft *ihevc_inter_pred_luma_horz_fptr;
+ ihevc_inter_pred_ft *ihevc_inter_pred_luma_vert_fptr;
+ ihevc_inter_pred_w16out_ft *ihevc_inter_pred_luma_vert_w16out_fptr;
+ ihevc_inter_pred_w16inp_ft *ihevc_inter_pred_luma_vert_w16inp_fptr;
+ ihevc_inter_pred_ft *ihevc_inter_pred_luma_copy_fptr;
+ ihevc_inter_pred_w16out_ft *ihevc_inter_pred_luma_copy_w16out_fptr;
+ ihevc_inter_pred_w16out_ft *ihevc_inter_pred_luma_horz_w16out_fptr;
+ ihevc_inter_pred_w16inp_w16out_ft *ihevc_inter_pred_luma_vert_w16inp_w16out_fptr;
+
+ ihevc_intra_pred_chroma_ref_substitution_ft *ihevc_intra_pred_chroma_ref_substitution_fptr;
+ ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
+ ihevc_intra_pred_luma_ref_subst_all_avlble_ft *ihevc_intra_pred_luma_ref_subst_all_avlble_fptr;
+ ihevc_intra_pred_ref_filtering_ft *ihevc_intra_pred_ref_filtering_fptr;
+ ihevc_intra_pred_chroma_dc_ft *ihevc_intra_pred_chroma_dc_fptr;
+ ihevc_intra_pred_chroma_horz_ft *ihevc_intra_pred_chroma_horz_fptr;
+ ihevc_intra_pred_chroma_mode2_ft *ihevc_intra_pred_chroma_mode2_fptr;
+ ihevc_intra_pred_chroma_mode_18_34_ft *ihevc_intra_pred_chroma_mode_18_34_fptr;
+ ihevc_intra_pred_chroma_mode_27_to_33_ft *ihevc_intra_pred_chroma_mode_27_to_33_fptr;
+ ihevc_intra_pred_chroma_mode_3_to_9_ft *ihevc_intra_pred_chroma_mode_3_to_9_fptr;
+ ihevc_intra_pred_chroma_planar_ft *ihevc_intra_pred_chroma_planar_fptr;
+ ihevc_intra_pred_chroma_ver_ft *ihevc_intra_pred_chroma_ver_fptr;
+ ihevc_intra_pred_chroma_mode_11_to_17_ft *ihevc_intra_pred_chroma_mode_11_to_17_fptr;
+ ihevc_intra_pred_chroma_mode_19_to_25_ft *ihevc_intra_pred_chroma_mode_19_to_25_fptr;
+ ihevc_intra_pred_luma_mode_11_to_17_ft *ihevc_intra_pred_luma_mode_11_to_17_fptr;
+ ihevc_intra_pred_luma_mode_19_to_25_ft *ihevc_intra_pred_luma_mode_19_to_25_fptr;
+ ihevc_intra_pred_luma_dc_ft *ihevc_intra_pred_luma_dc_fptr;
+ ihevc_intra_pred_luma_horz_ft *ihevc_intra_pred_luma_horz_fptr;
+ ihevc_intra_pred_luma_mode2_ft *ihevc_intra_pred_luma_mode2_fptr;
+ ihevc_intra_pred_luma_mode_18_34_ft *ihevc_intra_pred_luma_mode_18_34_fptr;
+ ihevc_intra_pred_luma_mode_27_to_33_ft *ihevc_intra_pred_luma_mode_27_to_33_fptr;
+ ihevc_intra_pred_luma_mode_3_to_9_ft *ihevc_intra_pred_luma_mode_3_to_9_fptr;
+ ihevc_intra_pred_luma_planar_ft *ihevc_intra_pred_luma_planar_fptr;
+ ihevc_intra_pred_luma_ver_ft *ihevc_intra_pred_luma_ver_fptr;
+ ihevc_itrans_4x4_ttype1_ft *ihevc_itrans_4x4_ttype1_fptr;
+ ihevc_itrans_4x4_ft *ihevc_itrans_4x4_fptr;
+ ihevc_itrans_8x8_ft *ihevc_itrans_8x8_fptr;
+ ihevc_itrans_16x16_ft *ihevc_itrans_16x16_fptr;
+ ihevc_itrans_32x32_ft *ihevc_itrans_32x32_fptr;
+ ihevc_itrans_recon_4x4_ttype1_ft *ihevc_itrans_recon_4x4_ttype1_fptr;
+ ihevc_itrans_recon_4x4_ft *ihevc_itrans_recon_4x4_fptr;
+ ihevc_itrans_recon_8x8_ft *ihevc_itrans_recon_8x8_fptr;
+ ihevc_itrans_recon_16x16_ft *ihevc_itrans_recon_16x16_fptr;
+ ihevc_itrans_recon_32x32_ft *ihevc_itrans_recon_32x32_fptr;
+ ihevc_chroma_itrans_recon_4x4_ft *ihevc_chroma_itrans_recon_4x4_fptr;
+ ihevc_chroma_itrans_recon_8x8_ft *ihevc_chroma_itrans_recon_8x8_fptr;
+ ihevc_chroma_itrans_recon_16x16_ft *ihevc_chroma_itrans_recon_16x16_fptr;
+ ihevc_recon_4x4_ttype1_ft *ihevc_recon_4x4_ttype1_fptr;
+ ihevc_recon_4x4_ft *ihevc_recon_4x4_fptr;
+ ihevc_recon_8x8_ft *ihevc_recon_8x8_fptr;
+ ihevc_recon_16x16_ft *ihevc_recon_16x16_fptr;
+ ihevc_recon_32x32_ft *ihevc_recon_32x32_fptr;
+ ihevc_chroma_recon_4x4_ft *ihevc_chroma_recon_4x4_fptr;
+ ihevc_chroma_recon_8x8_ft *ihevc_chroma_recon_8x8_fptr;
+ ihevc_chroma_recon_16x16_ft *ihevc_chroma_recon_16x16_fptr;
+ ihevc_memcpy_mul_8_ft *ihevc_memcpy_mul_8_fptr;
+ ihevc_memcpy_ft *ihevc_memcpy_fptr;
+ ihevc_memset_mul_8_ft *ihevc_memset_mul_8_fptr;
+ ihevc_memset_ft *ihevc_memset_fptr;
+ ihevc_memset_16bit_mul_8_ft *ihevc_memset_16bit_mul_8_fptr;
+ ihevc_memset_16bit_ft *ihevc_memset_16bit_fptr;
+ ihevc_pad_left_luma_ft *ihevc_pad_left_luma_fptr;
+ ihevc_pad_left_chroma_ft *ihevc_pad_left_chroma_fptr;
+ ihevc_pad_right_luma_ft *ihevc_pad_right_luma_fptr;
+ ihevc_pad_right_chroma_ft *ihevc_pad_right_chroma_fptr;
+ ihevc_weighted_pred_bi_ft *ihevc_weighted_pred_bi_fptr;
+ ihevc_weighted_pred_bi_default_ft *ihevc_weighted_pred_bi_default_fptr;
+ ihevc_weighted_pred_uni_ft *ihevc_weighted_pred_uni_fptr;
+ ihevc_weighted_pred_chroma_bi_ft *ihevc_weighted_pred_chroma_bi_fptr;
+ ihevc_weighted_pred_chroma_bi_default_ft *ihevc_weighted_pred_chroma_bi_default_fptr;
+ ihevc_weighted_pred_chroma_uni_ft *ihevc_weighted_pred_chroma_uni_fptr;
+ ihevc_sao_band_offset_luma_ft *ihevc_sao_band_offset_luma_fptr;
+ ihevc_sao_band_offset_chroma_ft *ihevc_sao_band_offset_chroma_fptr;
+ ihevc_sao_edge_offset_class0_ft *ihevc_sao_edge_offset_class0_fptr;
+ ihevc_sao_edge_offset_class0_chroma_ft *ihevc_sao_edge_offset_class0_chroma_fptr;
+ ihevc_sao_edge_offset_class1_ft *ihevc_sao_edge_offset_class1_fptr;
+ ihevc_sao_edge_offset_class1_chroma_ft *ihevc_sao_edge_offset_class1_chroma_fptr;
+ ihevc_sao_edge_offset_class2_ft *ihevc_sao_edge_offset_class2_fptr;
+ ihevc_sao_edge_offset_class2_chroma_ft *ihevc_sao_edge_offset_class2_chroma_fptr;
+ ihevc_sao_edge_offset_class3_ft *ihevc_sao_edge_offset_class3_fptr;
+ ihevc_sao_edge_offset_class3_chroma_ft *ihevc_sao_edge_offset_class3_chroma_fptr;
+ ihevcd_fmt_conv_420sp_to_rgba8888_ft *ihevcd_fmt_conv_420sp_to_rgba8888_fptr;
+ ihevcd_fmt_conv_420sp_to_rgb565_ft *ihevcd_fmt_conv_420sp_to_rgb565_fptr;
+ ihevcd_fmt_conv_420sp_to_420sp_ft *ihevcd_fmt_conv_420sp_to_420sp_fptr;
+ ihevcd_fmt_conv_420sp_to_420p_ft *ihevcd_fmt_conv_420sp_to_420p_fptr;
+ ihevcd_itrans_recon_dc_luma_ft *ihevcd_itrans_recon_dc_luma_fptr;
+ ihevcd_itrans_recon_dc_chroma_ft *ihevcd_itrans_recon_dc_chroma_fptr;
+}func_selector_t;
+
+#endif /* _IHEVCD_FUNCTION_SELECTOR_H_ */
diff --git a/decoder/ihevcd_get_mv.c b/decoder/ihevcd_get_mv.c
new file mode 100644
index 0000000..e0d89c7
--- /dev/null
+++ b/decoder/ihevcd_get_mv.c
@@ -0,0 +1,593 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_get_mv.c
+ *
+ * @brief
+ * Contains functions to compute motion vectors
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_get_mv_ctb()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_mv_merge.h"
+#include "ihevcd_mv_pred.h"
+#include "ihevcd_profile.h"
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function computes and stores the MVs of all the PUs in a CTB
+ *
+ * @par Description:
+ * The MVs of a PU are stored in the PU structure. MV computation is done
+ * either through merge or through MV prediction
+ *
+ * @param[in] ps_mv_ctxt
+ * Pointer to the motion vector context
+ *
+ * @param[in] pu4_ctb_top_pu_idx
+ * Pointer to ctb top PU indices
+ *
+ * @param[in] pu4_ctb_left_pu_idx
+ * Pointer to ctb left PU indices
+ *
+ * @param[in] pu4_ctb_top_left_pu_idx
+ * Pointer to ctb top left PU indices
+ *
+ * @returns
+ * number of PU's per ctb
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+WORD32 ihevcd_get_mv_ctb(mv_ctxt_t *ps_mv_ctxt,
+ UWORD32 *pu4_ctb_top_pu_idx,
+ UWORD32 *pu4_ctb_left_pu_idx,
+ UWORD32 *pu4_ctb_top_left_pu_idx)
+{
+
+ WORD32 i;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ pu_t *ps_pu;
+ tile_t *ps_tile;
+ UWORD8 *pu1_pic_pu_map_ctb;
+ WORD32 num_minpu_in_ctb;
+ WORD32 ctb_start_pu_idx;
+ UWORD32 *pu4_top_pu_idx, *pu4_left_pu_idx, *pu4_top_left_pu_idx;
+ WORD32 pu_x_in_4x4, pu_y_in_4x4;
+ WORD32 pu_x_in_4x4_single_mcl, pu_y_in_4x4_single_mcl;
+ pu_mv_t s_pred_mv;
+ WORD32 ctb_size, ctb_size_in_min_pu;
+ WORD32 num_pu_per_ctb, pu_wd, pu_ht, pu_cnt;
+ WORD32 pu_wd_single_mcl, pu_ht_single_mcl;
+ UWORD32 au4_nbr_avail[MAX_CTB_SIZE / MIN_PU_SIZE
+ + 2 /* Top nbr + bot nbr */];
+ UWORD32 *pu4_nbr_pu_idx/* (Left + ctb_size + right ) * (top + ctb_size + bottom) */;
+ WORD32 top_avail_bits;
+ UWORD8 u1_lb_avail, u1_l_avail, u1_t_avail, u1_tr_avail, u1_tl_avail;
+ WORD32 nbr_pu_idx_strd;
+ WORD32 cb_size;
+ WORD32 single_mcl_flag;
+
+ PROFILE_DISABLE_MV_PREDICTION();
+ ps_sps = ps_mv_ctxt->ps_sps;
+ ps_pps = ps_mv_ctxt->ps_pps;
+ ps_pu = ps_mv_ctxt->ps_pu;
+ ps_tile = ps_mv_ctxt->ps_tile;
+
+ pu4_nbr_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx_map;
+
+ ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+ ctb_size_in_min_pu = (ctb_size / MIN_PU_SIZE);
+
+ num_minpu_in_ctb = ctb_size_in_min_pu * ctb_size_in_min_pu;
+ pu1_pic_pu_map_ctb = ps_mv_ctxt->pu1_pic_pu_map + (ps_mv_ctxt->i4_ctb_x + ps_mv_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
+
+ num_pu_per_ctb = ps_mv_ctxt->i4_ctb_pu_cnt;
+ ctb_start_pu_idx = ps_mv_ctxt->i4_ctb_start_pu_idx;
+ nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+
+ {
+ /* Updating the initial availability map */
+ WORD32 i;
+ UWORD32 u4_left_ctb_avail, u4_top_lt_ctb_avail, u4_top_rt_ctb_avail,
+ u4_top_ctb_avail;
+
+ u4_left_ctb_avail = ps_mv_ctxt->u1_left_ctb_avail;
+ u4_top_lt_ctb_avail = ps_mv_ctxt->u1_top_lt_ctb_avail;
+ u4_top_ctb_avail = ps_mv_ctxt->u1_top_ctb_avail;
+ u4_top_rt_ctb_avail = ps_mv_ctxt->u1_top_rt_ctb_avail;
+
+ /* Initializing the availability array */
+ memset(au4_nbr_avail, 0,
+ (MAX_CTB_SIZE / MIN_PU_SIZE + 2) * sizeof(UWORD32));
+ /* Initializing the availability array with CTB level availability flags */
+ {
+ WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+ - (ps_mv_ctxt->i4_ctb_y << ps_sps->i1_log2_ctb_size);
+ WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+ for(i = 0; i < ctb_size_left / MIN_PU_SIZE; i++)
+ {
+ au4_nbr_avail[i + 1] = (u4_left_ctb_avail << 31);
+ }
+ }
+ au4_nbr_avail[0] |= ((u4_top_rt_ctb_avail << 31)
+                        >> (1 + ctb_size_in_min_pu)); /* bit at position (1 + ctb_size/4) from the MSB */
+
+ au4_nbr_avail[0] |= (u4_top_lt_ctb_avail << 31);
+ {
+ WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples
+ - (ps_mv_ctxt->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+ WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
+ WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));
+
+ /* ctb_size_top gives number of valid pixels remaining in the current row */
+ /* Since we need pattern of 1's starting from the MSB, an additional shift */
+ /* is needed */
+ shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);
+
+ top_avail_bits = ((1 << (ctb_size_top / MIN_PU_SIZE)) - 1) << shift;
+ }
+
+ au4_nbr_avail[0] |= ((u4_top_ctb_avail == 1) ? top_avail_bits : 0x0);
+        /* From the 2nd MSB down to bit (1 + ctb_size/4), set 1 if the top CTB is available, else 0 */
+
+ }
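+    /* Worked example of the availability map (assuming a 64x64 CTB, so
+     * ctb_size_in_min_pu = 16): in au4_nbr_avail[0], bit 31 (MSB) is the
+     * top-left flag, bits 30..15 are the 16 top 4x4-column flags and bit 14
+     * is the top-right flag; au4_nbr_avail[1..16] each hold the left
+     * availability of one 4x4 row in bit 31. */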
+
+ {
+ /* In case of a tile boundary, left and top arrays must change*/
+ /*Left*/
+ /* If start of tile row*/
+ if(((ps_tile->u1_pos_x) == (ps_mv_ctxt->i4_ctb_x)) && (ps_mv_ctxt->i4_ctb_x != 0))
+ {
+ WORD32 index_pic_map;
+ WORD32 ctb_pu_idx;
+ UWORD8 *pu1_pic_pu_map;
+
+ /* Goto the left ctb which belongs to another tile */
+ index_pic_map = ((ps_mv_ctxt->i4_ctb_x - 1) + ps_mv_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ ctb_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx[index_pic_map];
+ index_pic_map *= num_minpu_in_ctb;
+
+ /*Replicate the PUs of the last column of the left ctb*/
+ pu1_pic_pu_map = ps_mv_ctxt->pu1_pic_pu_map + index_pic_map + ctb_size_in_min_pu - 1;
+ for(i = 0; i < ctb_size_in_min_pu; i++)
+ {
+ /* Left neighbors change*/
+ pu4_ctb_left_pu_idx[i] = ctb_pu_idx + (WORD32)*pu1_pic_pu_map;
+ pu1_pic_pu_map = pu1_pic_pu_map + ctb_size_in_min_pu;
+ }
+
+
+ index_pic_map = ((ps_mv_ctxt->i4_ctb_x - 1) + (ps_mv_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+ ctb_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx[index_pic_map];
+ index_pic_map *= num_minpu_in_ctb;
+ index_pic_map += (num_minpu_in_ctb - 1);
+ pu4_ctb_top_left_pu_idx[0] = ctb_pu_idx + pu1_pic_pu_map[index_pic_map];
+ }
+ /*Top*/
+ /* If start of tile column*/
+ if(((ps_tile->u1_pos_y) == (ps_mv_ctxt->i4_ctb_y)) && (ps_mv_ctxt->i4_ctb_y != 0))
+ {
+ WORD32 index_pic_map;
+ WORD32 ctb_pu_idx;
+ UWORD8 *pu1_pic_pu_map;
+
+ /* Goto the top ctb which belongs to another tile */
+ index_pic_map = (ps_mv_ctxt->i4_ctb_x) + ((ps_mv_ctxt->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+ ctb_pu_idx = ps_mv_ctxt->pu4_pic_pu_idx[index_pic_map];
+ index_pic_map *= num_minpu_in_ctb;
+
+ /*Replicate the PUs of the last row of the top ctb*/
+ pu1_pic_pu_map = ps_mv_ctxt->pu1_pic_pu_map + index_pic_map + (ctb_size_in_min_pu * (ctb_size_in_min_pu - 1));
+ for(i = 0; i < ctb_size_in_min_pu; i++)
+ {
+ /* Top neighbors change*/
+ pu4_ctb_top_pu_idx[i] = ctb_pu_idx + (WORD32)*pu1_pic_pu_map;
+ pu1_pic_pu_map++;
+ }
+ }
+
+ /* Updating the initial neighbor pu idx map */
+ /* Initializing the availability array with CTB level availability flags */
+ /* 16x16 array for holding pu info of the ctb, wrt the frame pu count*/
+ for(i = 0; i < ctb_size_in_min_pu; i++)
+ {
+ /* Left */
+ pu4_nbr_pu_idx[(i + 1) * nbr_pu_idx_strd] = pu4_ctb_left_pu_idx[i];
+ /* Top */
+ pu4_nbr_pu_idx[i + 1] = pu4_ctb_top_pu_idx[i];
+ }
+ /* Top right */
+ pu4_nbr_pu_idx[1 + ctb_size_in_min_pu] = pu4_ctb_top_pu_idx[ctb_size_in_min_pu];
+
+ /* Top left */
+ pu4_nbr_pu_idx[0] = pu4_ctb_top_left_pu_idx[0];
+
+ }
+
+ /* CTB level MV pred */
+ for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
+ {
+ pu_ht = (ps_pu->b4_ht + 1) << 2;
+ pu_wd = (ps_pu->b4_wd + 1) << 2;
+
+ pu_ht_single_mcl = pu_ht;
+ pu_wd_single_mcl = pu_wd;
+
+ pu_x_in_4x4 = ps_pu->b4_pos_x;
+ pu_y_in_4x4 = ps_pu->b4_pos_y;
+
+ pu_x_in_4x4_single_mcl = pu_x_in_4x4;
+ pu_y_in_4x4_single_mcl = pu_y_in_4x4;
+
+ /*******************************************/
+ /* Neighbor location: Graphical indication */
+ /* */
+ /* B2 _____________B1 B0 */
+ /* | | */
+ /* | | */
+ /* | | */
+ /* | PU ht| */
+ /* | | */
+ /* | | */
+ /* A1|______wd_______| */
+ /* A0 */
+ /* */
+ /*******************************************/
+ /* Below code is for merge mode, where if single_mcl_flag == 1,
+ * all the prediction units of the current coding unit share a
+ * single merge candidate list, which is identical to the
+ * merge candidate list of the 2Nx2N prediction unit.
+ */
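+        /* Example: with i1_log2_parallel_merge_level = 3 and an 8x8 CU split
+         * as PART_Nx2N, both 4x8 PUs take the merge list of the full 8x8
+         * block; the position and size are snapped to that 8x8 block below
+         * for the candidate derivation only. */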
+ single_mcl_flag = 0;
+ if(1 == ps_pu->b1_merge_flag)
+ {
+ cb_size = MAX(pu_wd_single_mcl, pu_ht_single_mcl);
+ cb_size = MAX(cb_size,
+ (1 << ps_sps->i1_log2_min_coding_block_size));
+ if((ps_pps->i1_log2_parallel_merge_level > 2) && cb_size == 8 && (pu_wd_single_mcl != pu_ht_single_mcl))
+ {
+ single_mcl_flag = 1;
+ if((PART_Nx2N == ps_pu->b3_part_mode) && (1 == ps_pu->b2_part_idx))
+ {
+ pu_x_in_4x4_single_mcl = pu_x_in_4x4_single_mcl - 1;
+ }
+ else if((PART_2NxN == ps_pu->b3_part_mode) && (1 == ps_pu->b2_part_idx))
+ {
+ pu_y_in_4x4_single_mcl = pu_y_in_4x4_single_mcl - 1;
+ }
+ pu_ht_single_mcl = 8;
+ pu_wd_single_mcl = 8;
+ }
+ }
+ pu4_top_pu_idx = &pu4_nbr_pu_idx[(1 + pu_x_in_4x4_single_mcl)
+ + (1 + pu_y_in_4x4_single_mcl - 1) * nbr_pu_idx_strd];
+ pu4_top_left_pu_idx = pu4_top_pu_idx - 1;
+ pu4_left_pu_idx = pu4_top_pu_idx - 1 + nbr_pu_idx_strd;
+
+        /* Get neighbor availability */
+ {
+ u1_lb_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl + pu_ht_single_mcl / MIN_PU_SIZE]
+ >> (31 - (1 + pu_x_in_4x4_single_mcl - 1))) & 1;
+ u1_l_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl]
+ >> (31 - (1 + pu_x_in_4x4_single_mcl - 1))) & 1;
+ u1_t_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl - 1]
+ >> (31 - (1 + pu_x_in_4x4_single_mcl))) & 1;
+ u1_tr_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl - 1]
+ >> (31 - (1 + pu_x_in_4x4_single_mcl + pu_wd_single_mcl / MIN_PU_SIZE)))
+ & 1;
+ u1_tl_avail = (au4_nbr_avail[1 + pu_y_in_4x4_single_mcl - 1]
+ >> (31 - (1 + pu_x_in_4x4_single_mcl - 1))) & 1;
+ }
+ if(ps_pu->b1_intra_flag == 0)
+ {
+ if(ps_pu->b1_merge_flag == 0)
+ {
+ WORD32 pred_flag_l0, pred_flag_l1;
+ WORD32 tmp_x, tmp_y, mvd_x, mvd_y, mvp_x, mvp_y;
+ WORD32 two_pow_16, two_pow_15;
+
+ ihevcd_mv_pred(ps_mv_ctxt, pu4_top_pu_idx, pu4_left_pu_idx,
+ pu4_top_left_pu_idx, nbr_pu_idx_strd,
+ ps_pu, u1_lb_avail, u1_l_avail,
+ u1_tr_avail, u1_t_avail, u1_tl_avail,
+ &s_pred_mv);
+
+ pred_flag_l0 = (ps_pu->b2_pred_mode != PRED_L1);
+ pred_flag_l1 = (ps_pu->b2_pred_mode != PRED_L0);
+
+ two_pow_16 = (1 << 16);
+ two_pow_15 = (1 << 15);
+
+ /* L0 MV */
+ if(pred_flag_l0)
+ {
+ mvp_x = s_pred_mv.s_l0_mv.i2_mvx;
+ mvp_y = s_pred_mv.s_l0_mv.i2_mvy;
+ mvd_x = ps_pu->mv.s_l0_mv.i2_mvx;
+ mvd_y = ps_pu->mv.s_l0_mv.i2_mvy;
+
+ tmp_x = (mvp_x + mvd_x + two_pow_16) & (two_pow_16 - 1);
+ tmp_x = tmp_x >= two_pow_15 ?
+ (tmp_x - two_pow_16) : tmp_x;
+ ps_pu->mv.s_l0_mv.i2_mvx = tmp_x;
+ tmp_y = (mvp_y + mvd_y + two_pow_16) & (two_pow_16 - 1);
+ tmp_y = tmp_y >= two_pow_15 ?
+ (tmp_y - two_pow_16) : tmp_y;
+ ps_pu->mv.s_l0_mv.i2_mvy = tmp_y;
+ }
+ /* L1 MV */
+ if(pred_flag_l1)
+ {
+ mvp_x = s_pred_mv.s_l1_mv.i2_mvx;
+ mvp_y = s_pred_mv.s_l1_mv.i2_mvy;
+ mvd_x = ps_pu->mv.s_l1_mv.i2_mvx;
+ mvd_y = ps_pu->mv.s_l1_mv.i2_mvy;
+
+ tmp_x = (mvp_x + mvd_x + two_pow_16) & (two_pow_16 - 1);
+ tmp_x = tmp_x >= two_pow_15 ?
+ (tmp_x - two_pow_16) : tmp_x;
+ ps_pu->mv.s_l1_mv.i2_mvx = tmp_x;
+ tmp_y = (mvp_y + mvd_y + two_pow_16) & (two_pow_16 - 1);
+ tmp_y = tmp_y >= two_pow_15 ?
+ (tmp_y - two_pow_16) : tmp_y;
+ ps_pu->mv.s_l1_mv.i2_mvy = tmp_y;
+ }
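+                    /* The (mvp + mvd) sums above wrap modulo 2^16 into the
+                     * signed 16-bit range. Worked example: mvp_x = 32760 and
+                     * mvd_x = 16 give (32760 + 16 + 65536) & 65535 = 32776,
+                     * which is >= 32768, so the stored MV is
+                     * 32776 - 65536 = -32760. */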
+ }
+ else
+ {
+ WORD32 part_mode;
+ WORD32 part_idx;
+                part_mode = ps_pu->b3_part_mode;
+ //TODO: Get part_idx
+ part_idx = ps_pu->b2_part_idx;
+
+ ihevcd_mv_merge(ps_mv_ctxt, pu4_top_pu_idx, pu4_left_pu_idx,
+ nbr_pu_idx_strd, ps_pu, part_mode,
+ part_idx, pu_wd_single_mcl, pu_ht_single_mcl,
+ pu_x_in_4x4_single_mcl << 2, pu_y_in_4x4_single_mcl << 2,
+ single_mcl_flag, u1_lb_avail, u1_l_avail, u1_tr_avail,
+ u1_t_avail, u1_tl_avail);
+
+ if(PRED_BI == ps_pu->b2_pred_mode)
+ {
+ if(((ps_pu->b3_part_mode == PART_2NxN) && (pu_wd == 8))
+ || ((ps_pu->b3_part_mode == PART_Nx2N)
+ && (pu_ht == 8)))
+ {
+ ps_pu->b2_pred_mode = PRED_L0;
+ }
+ }
+ }
+#if DEBUG_PRINT_MV
+ printf("\n-----------------------");
+ printf("\n CTB X = %d, Y = %d",
+ ps_mv_ctxt->i4_ctb_x, ps_mv_ctxt->i4_ctb_y);
+ printf("\n pu_x = %d, pu_y = %d",
+ (pu_x_in_4x4 * 4), (pu_y_in_4x4 * 4));
+ printf("\n pu_wd = %d, pu_ht = %d", pu_wd, pu_ht);
+ if(ps_pu->b2_pred_mode == PRED_L0)
+ printf("\n Pred = 0,Ref_idx = %d, MV l0 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.s_l0_mv.i2_mvx,
+ ps_pu->mv.s_l0_mv.i2_mvy);
+ else if(ps_pu->b2_pred_mode == PRED_L1)
+ printf("\n Pred = 1,Ref_idx = %d, MV l1 = %4d %4d", ps_pu->mv.i1_l1_ref_idx, ps_pu->mv.s_l1_mv.i2_mvx,
+ ps_pu->mv.s_l1_mv.i2_mvy);
+ else
+ printf("\n Pred = 2,Ref_idx = %d,Ref_idx = %d, MV l0 = %4d %4d, MV l1 = %4d %4d", ps_pu->mv.i1_l0_ref_idx, ps_pu->mv.i1_l1_ref_idx,
+ ps_pu->mv.s_l0_mv.i2_mvx, ps_pu->mv.s_l0_mv.i2_mvy,
+ ps_pu->mv.s_l1_mv.i2_mvx, ps_pu->mv.s_l1_mv.i2_mvy);
+
+#endif
+ }
+
+ {
+ slice_header_t *ps_slice_hdr;
+ pic_buf_t *ps_pic_buf_l0, *ps_pic_buf_l1;
+ ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+ ps_pic_buf_l0 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list0[ps_pu->mv.i1_l0_ref_idx].pv_pic_buf));
+ ps_pic_buf_l1 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list1[ps_pu->mv.i1_l1_ref_idx].pv_pic_buf));
+ ps_pu->mv.i1_l0_ref_pic_buf_id = ps_pic_buf_l0->u1_buf_id;
+ if(BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ ps_pu->mv.i1_l1_ref_pic_buf_id = ps_pic_buf_l1->u1_buf_id;
+ }
+ }
+
+ /* Neighbor availability inside CTB */
+        /* 1 bit per 4x4. Indicates whether that 4x4 block has been reconstructed (available) */
+ /* Used for neighbor availability in intra pred */
+ {
+ WORD32 trans_in_min_tu;
+ UWORD32 cur_tu_in_bits;
+ UWORD32 cur_tu_avail_flag;
+
+ trans_in_min_tu = pu_wd / MIN_PU_SIZE;
+ cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
+ cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);
+
+ cur_tu_avail_flag = cur_tu_in_bits >> (pu_x_in_4x4 + 1);
+
+ for(i = 0; i < pu_ht / MIN_PU_SIZE; i++)
+ au4_nbr_avail[1 + pu_y_in_4x4 + i] |= cur_tu_avail_flag;
+ }
+
+ /* Neighbor PU idx update inside CTB */
+        /* 1 byte per 4x4. Indicates the PU idx that the 4x4 block belongs to */
+
+ {
+ WORD32 row, col;
+ UWORD32 cur_pu_idx;
+ WORD32 offset;
+ cur_pu_idx = ctb_start_pu_idx + pu_cnt;
+
+ offset = (1 + pu_x_in_4x4 + 0) + (1 + pu_y_in_4x4 + 0) * nbr_pu_idx_strd;
+
+ for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
+ {
+ for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
+ {
+ pu4_nbr_pu_idx[offset + col] = cur_pu_idx;
+ }
+ offset += nbr_pu_idx_strd;
+ }
+ }
+
+ }
+
+ /* Updating Top and Left pointers */
+ {
+ WORD32 offset_top, offset_left;
+
+ offset_left = ctb_size_in_min_pu + (0 + 1) * nbr_pu_idx_strd;
+ offset_top = ctb_size_in_min_pu * nbr_pu_idx_strd + 0 + 1;
+
+ /* Top Left */
+ /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
+ pu4_ctb_top_left_pu_idx[0] = pu4_ctb_top_pu_idx[ctb_size_in_min_pu - 1];
+
+ for(i = 0; i < ctb_size_in_min_pu; i++)
+ {
+ /* Left */
+ /* Last column of au4_nbr_pu_idx */
+ pu4_ctb_left_pu_idx[i] = pu4_nbr_pu_idx[offset_left];
+ /* Top */
+ /* Last row of au4_nbr_pu_idx */
+ pu4_ctb_top_pu_idx[i] = pu4_nbr_pu_idx[offset_top];
+
+ offset_left += nbr_pu_idx_strd;
+ offset_top += 1;
+ }
+ }
+
+ /* Updating the CTB level PU idx (Used for collocated MV pred)*/
+ {
+ WORD32 ctb_row, ctb_col, index_pic_map, index_nbr_map;
+ WORD32 first_pu_of_ctb;
+ first_pu_of_ctb = pu4_nbr_pu_idx[1 + nbr_pu_idx_strd];
+
+ index_pic_map = 0 * ctb_size_in_min_pu + 0;
+ index_nbr_map = (0 + 1) * nbr_pu_idx_strd + (0 + 1);
+
+ for(ctb_row = 0; ctb_row < ctb_size_in_min_pu; ctb_row++)
+ {
+ for(ctb_col = 0; ctb_col < ctb_size_in_min_pu; ctb_col++)
+ {
+ pu1_pic_pu_map_ctb[index_pic_map + ctb_col] = pu4_nbr_pu_idx[index_nbr_map + ctb_col]
+ - first_pu_of_ctb;
+ }
+ index_pic_map += ctb_size_in_min_pu;
+ index_nbr_map += nbr_pu_idx_strd;
+ }
+ }
+ return num_pu_per_ctb;
+}
diff --git a/decoder/ihevcd_get_mv.h b/decoder/ihevcd_get_mv.h
new file mode 100644
index 0000000..fd5e86b
--- /dev/null
+++ b/decoder/ihevcd_get_mv.h
@@ -0,0 +1,46 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_get_mv.h
+*
+* @brief
+* Declarations of functions used to compute motion vectors
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_GET_MV_H_
+#define IHEVCD_GET_MV_H_
+
+WORD32 ihevcd_get_mv_ctb(mv_ctxt_t *ps_mv_ctxt,
+ UWORD32 *pu4_ctb_top_pu_idx,
+ UWORD32 *pu4_ctb_left_pu_idx,
+ UWORD32 *pu4_ctb_top_left_pu_idx);
+
+
+#endif /* IHEVCD_GET_MV_H_ */
diff --git a/decoder/ihevcd_ilf_padding.c b/decoder/ihevcd_ilf_padding.c
new file mode 100644
index 0000000..9db82e5
--- /dev/null
+++ b/decoder/ihevcd_ilf_padding.c
@@ -0,0 +1,214 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_ilf_padding.c
+*
+* @brief
+* Does frame level loop filtering (deblocking and SAO) and padding
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevcd_ilf_pad_frame()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_deblk.h"
+#include "ihevcd_sao.h"
+#include "ihevc_padding.h"
+
+void ihevcd_ilf_pad_frame(deblk_ctxt_t *ps_deblk_ctxt, sao_ctxt_t *ps_sao_ctxt)
+{
+ sps_t *ps_sps;
+ slice_header_t *ps_slice_hdr;
+ codec_t *ps_codec;
+ WORD32 i4_ctb_x, i4_ctb_y;
+ WORD32 ctb_size;
+
+ ps_sps = ps_deblk_ctxt->ps_sps;
+ ps_slice_hdr = ps_deblk_ctxt->ps_slice_hdr;
+ ps_codec = ps_deblk_ctxt->ps_codec;
+ ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+
+ for(i4_ctb_y = 0; i4_ctb_y < ps_sps->i2_pic_ht_in_ctb; i4_ctb_y++)
+ {
+ for(i4_ctb_x = 0; i4_ctb_x < ps_sps->i2_pic_wd_in_ctb; i4_ctb_x++)
+ {
+ WORD32 i4_is_last_ctb_x = 0;
+ WORD32 i4_is_last_ctb_y = 0;
+
+            /* TODO: Slice header also has to be updated */
+ ps_deblk_ctxt->i4_ctb_x = i4_ctb_x;
+ ps_deblk_ctxt->i4_ctb_y = i4_ctb_y;
+
+ ps_sao_ctxt->i4_ctb_x = i4_ctb_x;
+ ps_sao_ctxt->i4_ctb_y = i4_ctb_y;
+
+ if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
+ (0 == ps_codec->i4_disable_deblk_pic))
+ {
+ ihevcd_deblk_ctb(ps_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+
+                /* If the last CTB in the row was a complete CTB, then deblocking has to be called for the remaining pixels,
+                 * since deblocking is applied on a shifted CTB structure
+                 */
+ if(i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1)
+ {
+ WORD32 last_x_pos;
+ i4_is_last_ctb_x = 1;
+ i4_is_last_ctb_y = 0;
+
+
+ last_x_pos = (ps_sps->i2_pic_wd_in_ctb << ps_sps->i1_log2_ctb_size);
+ if(last_x_pos == ps_sps->i2_pic_width_in_luma_samples)
+ {
+ ihevcd_deblk_ctb(ps_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+ }
+ }
+
+
+                /* If the last CTB in the column was a complete CTB, then deblocking has to be called for the remaining pixels,
+                 * since deblocking is applied on a shifted CTB structure
+                 */
+ if(i4_ctb_y == ps_sps->i2_pic_ht_in_ctb - 1)
+ {
+ WORD32 last_y_pos;
+ i4_is_last_ctb_y = 1;
+ i4_is_last_ctb_x = 0;
+
+ last_y_pos = (ps_sps->i2_pic_ht_in_ctb << ps_sps->i1_log2_ctb_size);
+ if(last_y_pos == ps_sps->i2_pic_height_in_luma_samples)
+ {
+ ihevcd_deblk_ctb(ps_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+ }
+ }
+ }
+
+ if(ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag)
+ {
+ ihevcd_sao_ctb(ps_sao_ctxt);
+ }
+
+ /* Call padding if required */
+ {
+ UWORD8 *pu1_cur_ctb_luma = ps_deblk_ctxt->pu1_cur_pic_luma
+ + (i4_ctb_x * ctb_size
+ + i4_ctb_y * ctb_size
+ * ps_codec->i4_strd);
+ UWORD8 *pu1_cur_ctb_chroma = ps_deblk_ctxt->pu1_cur_pic_chroma
+ + i4_ctb_x * ctb_size
+ + (i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+
+ if(0 == i4_ctb_x)
+ {
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+
+ pad_ht_luma = ctb_size;
+ pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+ pad_ht_chroma = ctb_size / 2;
+ pad_ht_chroma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+ /* Pad left after 1st CTB is processed */
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(pu1_cur_ctb_luma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(pu1_cur_ctb_chroma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+ }
+ else if((ps_sps->i2_pic_wd_in_ctb - 1) == i4_ctb_x)
+ {
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+ WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (i4_ctb_x << ps_sps->i1_log2_ctb_size);
+
+ pad_ht_luma = ctb_size;
+ pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+ pad_ht_chroma = ctb_size / 2;
+ pad_ht_chroma += (ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y ? 8 : 0;
+ /* Pad right after last CTB in the current row is processed */
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(pu1_cur_ctb_luma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(pu1_cur_ctb_chroma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
+
+
+ if((ps_sps->i2_pic_ht_in_ctb - 1) == i4_ctb_y)
+ {
+ UWORD8 *pu1_buf;
+                    /* Since SAO is shifted by 8x8, chroma padding cannot be done till the second row is processed */
+                    /* Hence top padding is moved to the end of the frame; moving it to the second row also causes problems when there is only one row */
+                    /* Pad top after padding left and right for the current rows, after processing the first CTB row */
+ ihevc_pad_top(ps_deblk_ctxt->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
+ ihevc_pad_top(ps_deblk_ctxt->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
+
+ pu1_buf = ps_deblk_ctxt->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
+                    /* Pad bottom after the last CTB row is processed */
+ ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
+
+ pu1_buf = ps_deblk_ctxt->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
+ ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
+ }
+ }
+ }
+
+
+ }
+ }
+
+}
diff --git a/decoder/ihevcd_ilf_padding.h b/decoder/ihevcd_ilf_padding.h
new file mode 100644
index 0000000..88c9732
--- /dev/null
+++ b/decoder/ihevcd_ilf_padding.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_ilf_padding.h
+*
+* @brief
+* Does frame level loop filtering (deblocking and SAO) and padding
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevcd_ilf_pad_frame()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_ILF_PADDING_H_
+#define IHEVCD_ILF_PADDING_H_
+
+void ihevcd_ilf_pad_frame(deblk_ctxt_t *ps_deblk_ctxt, sao_ctxt_t *ps_sao_ctxt);
+
+
+#endif /* IHEVCD_ILF_PADDING_H_ */
+
diff --git a/decoder/ihevcd_inter_pred.c b/decoder/ihevcd_inter_pred.c
new file mode 100644
index 0000000..cef3bee
--- /dev/null
+++ b/decoder/ihevcd_inter_pred.c
@@ -0,0 +1,676 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_inter_pred.c
+ *
+ * @brief
+ * Calculates the prediction samples for a given CTB
+ *
+ * @author
+ * Srinivas T
+ *
+ * @par List of Functions:
+ * - ihevcd_inter_pred_ctb()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_weighted_pred.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+
+#include "ihevc_inter_pred.h"
+#include "ihevcd_profile.h"
+
+WORD8 luma_filter[4][NTAPS_LUMA] =
+{
+ { 0, 0, 0, 64, 0, 0, 0, 0 },
+ { -1, 4, -10, 58, 17, -5, 1, 0 },
+ { -1, 4, -11, 40, 40, -11, 4, -1 },
+ { 0, 1, -5, 17, 58, -10, 4, -1 } };
+
+/* The filter uses only the first four elements in each array */
+WORD8 chroma_filter[8][NTAPS_LUMA] =
+{
+ { 0, 64, 0, 0, 0, 0, 0, 0 },
+ { -2, 58, 10, -2, 0, 0, 0, 0 },
+ { -4, 54, 16, -2, 0, 0, 0, 0 },
+ { -6, 46, 28, -4, 0, 0, 0, 0 },
+ { -4, 36, 36, -4, 0, 0, 0, 0 },
+ { -4, 28, 46, -6, 0, 0, 0, 0 },
+ { -2, 16, 54, -4, 0, 0, 0, 0 },
+ { -2, 10, 58, -2, 0, 0, 0, 0 } };
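+
+/* Both filter sets are normalized to a gain of 64 (6 fractional bits): e.g.
+ * the luma half-pel taps -1+4-11+40+40-11+4-1 = 64 and the chroma half-pel
+ * taps -4+36+36-4 = 64, so an interpolated sample is the weighted sum of the
+ * taps followed by a normalization right shift of 6 (rounding and clipping
+ * are handled in the leaf inter-prediction routines). */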
+
+/**
+*******************************************************************************
+*
+* @brief
+* Inter prediction CTB level function
+*
+* @par Description:
+* For a given CTB, Inter prediction followed by weighted prediction is
+* done for all the PUs present in the CTB
+*
+* @param[in] ps_proc
+* Pointer to the process context
+*
+* @returns
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+void ihevcd_inter_pred_ctb(process_ctxt_t *ps_proc)
+{
+ UWORD8 *ref_pic_luma_l0, *ref_pic_chroma_l0;
+ UWORD8 *ref_pic_luma_l1, *ref_pic_chroma_l1;
+
+ UWORD8 *ref_pic_l0 = NULL, *ref_pic_l1 = NULL;
+
+ slice_header_t *ps_slice_hdr;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ pu_t *ps_pu;
+ codec_t *ps_codec;
+ WORD32 pu_indx;
+ WORD32 pu_x, pu_y;
+ WORD32 pu_wd, pu_ht;
+ WORD32 i4_pu_cnt;
+ WORD32 cur_ctb_idx;
+
+ WORD32 clr_indx;
+ WORD32 ntaps;
+
+
+
+ WORD32 ai2_xint[2] = { 0, 0 }, ai2_yint[2] = { 0, 0 };
+ WORD32 ai2_xfrac[2] = { 0, 0 }, ai2_yfrac[2] = { 0, 0 };
+
+ WORD32 weighted_pred, bi_pred;
+
+ WORD32 ref_strd;
+ UWORD8 *pu1_dst_luma, *pu1_dst_chroma;
+
+ UWORD8 *pu1_dst;
+
+ WORD16 *pi2_tmp1, *pi2_tmp2;
+
+ WORD32 luma_weight_l0, luma_weight_l1;
+ WORD32 chroma_weight_l0_cb, chroma_weight_l1_cb, chroma_weight_l0_cr, chroma_weight_l1_cr;
+ WORD32 luma_offset_l0, luma_offset_l1;
+ WORD32 chroma_offset_l0_cb, chroma_offset_l1_cb, chroma_offset_l0_cr, chroma_offset_l1_cr;
+ WORD32 shift, lvl_shift1, lvl_shift2;
+
+ pf_inter_pred func_ptr1, func_ptr2, func_ptr3, func_ptr4;
+ WORD32 func_indx1, func_indx2, func_indx3, func_indx4;
+ void *func_src;
+ void *func_dst;
+ WORD32 func_src_strd;
+ WORD32 func_dst_strd;
+ WORD8 *func_coeff;
+ WORD32 func_wd;
+ WORD32 func_ht;
+ WORD32 next_ctb_idx;
+ WORD8(*coeff)[8];
+ WORD32 chroma_yuv420sp_vu;
+
+ PROFILE_DISABLE_INTER_PRED();
+ ps_codec = ps_proc->ps_codec;
+ ps_slice_hdr = ps_proc->ps_slice_hdr;
+ ps_pps = ps_proc->ps_pps;
+ ps_sps = ps_proc->ps_sps;
+ cur_ctb_idx = ps_proc->i4_ctb_x
+ + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ /*
+ * In case of tiles, the next ctb belonging to the same tile must be used to get the PU index
+ */
+
+ next_ctb_idx = ps_proc->i4_next_pu_ctb_cnt;
+ i4_pu_cnt = ps_proc->pu4_pic_pu_idx[next_ctb_idx] - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+
+ ps_pu = ps_proc->ps_pu;
+ ref_strd = ps_codec->i4_strd;
+ pi2_tmp1 = ps_proc->pi2_inter_pred_tmp_buf1;
+ pi2_tmp2 = ps_proc->pi2_inter_pred_tmp_buf2;
+ pu1_dst_luma = ps_proc->pu1_cur_pic_luma;
+ pu1_dst_chroma = ps_proc->pu1_cur_pic_chroma;
+
+ chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+ ASSERT(PSLICE == ps_slice_hdr->i1_slice_type || BSLICE == ps_slice_hdr->i1_slice_type);
+
+ ref_pic_luma_l0 = NULL;
+ ref_pic_chroma_l0 = NULL;
+
+ luma_weight_l0 = 0;
+ chroma_weight_l0_cb = 0;
+ chroma_weight_l0_cr = 0;
+
+ luma_offset_l0 = 0;
+ chroma_offset_l0_cb = 0;
+ chroma_offset_l0_cr = 0;
+
+ ref_pic_luma_l1 = NULL;
+ ref_pic_chroma_l1 = NULL;
+
+ luma_weight_l1 = 0;
+ chroma_weight_l1_cb = 0;
+ chroma_weight_l1_cr = 0;
+
+ luma_offset_l1 = 0;
+ chroma_offset_l1_cb = 0;
+ chroma_offset_l1_cr = 0;
+
+ for(pu_indx = 0; pu_indx < i4_pu_cnt; pu_indx++, ps_pu++)
+ {
+ /* If the PU is intra then proceed to the next */
+ if(1 == ps_pu->b1_intra_flag)
+ continue;
+ pu_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (ps_pu->b4_pos_x << 2);
+ pu_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (ps_pu->b4_pos_y << 2);
+
+ pu_wd = (ps_pu->b4_wd + 1) << 2;
+ pu_ht = (ps_pu->b4_ht + 1) << 2;
+
+ weighted_pred = (ps_slice_hdr->i1_slice_type == PSLICE) ? ps_pps->i1_weighted_pred_flag :
+ ps_pps->i1_weighted_bipred_flag;
+ bi_pred = (ps_pu->b2_pred_mode == PRED_BI);
+
+#ifdef GPU_BUILD
+ if(ps_proc->u4_gpu_inter_flag == 1)
+ {
+ /* Only 16x16 PUs have been implemented on opencl device */
+ if((pu_wd % 16 == 0) && (pu_ht % 16 == 0) && (weighted_pred == 0))
+ {
+ //printf("Skipping Inter\n");
+ continue;
+ }
+ }
+#endif
+ if(ps_pu->b2_pred_mode != PRED_L1)
+ {
+ pic_buf_t *ps_pic_buf_l0;
+
+ ps_pic_buf_l0 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list0[ps_pu->mv.i1_l0_ref_idx].pv_pic_buf));
+
+ ref_pic_luma_l0 = ps_pic_buf_l0->pu1_luma;
+ ref_pic_chroma_l0 = ps_pic_buf_l0->pu1_chroma;
+
+ luma_weight_l0 = ps_slice_hdr->s_wt_ofst.i2_luma_weight_l0[ps_pu->mv.i1_l0_ref_idx];
+ chroma_weight_l0_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l0_cb[ps_pu->mv.i1_l0_ref_idx];
+ chroma_weight_l0_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l0_cr[ps_pu->mv.i1_l0_ref_idx];
+
+ luma_offset_l0 = ps_slice_hdr->s_wt_ofst.i2_luma_offset_l0[ps_pu->mv.i1_l0_ref_idx];
+ chroma_offset_l0_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l0_cb[ps_pu->mv.i1_l0_ref_idx];
+ chroma_offset_l0_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l0_cr[ps_pu->mv.i1_l0_ref_idx];
+ }
+
+ if(ps_pu->b2_pred_mode != PRED_L0)
+ {
+ pic_buf_t *ps_pic_buf_l1;
+ ps_pic_buf_l1 = (pic_buf_t *)((ps_slice_hdr->as_ref_pic_list1[ps_pu->mv.i1_l1_ref_idx].pv_pic_buf));
+ ref_pic_luma_l1 = ps_pic_buf_l1->pu1_luma;
+ ref_pic_chroma_l1 = ps_pic_buf_l1->pu1_chroma;
+
+ luma_weight_l1 = ps_slice_hdr->s_wt_ofst.i2_luma_weight_l1[ps_pu->mv.i1_l1_ref_idx];
+ chroma_weight_l1_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l1_cb[ps_pu->mv.i1_l1_ref_idx];
+ chroma_weight_l1_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_weight_l1_cr[ps_pu->mv.i1_l1_ref_idx];
+
+ luma_offset_l1 = ps_slice_hdr->s_wt_ofst.i2_luma_offset_l1[ps_pu->mv.i1_l1_ref_idx];
+ chroma_offset_l1_cb = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l1_cb[ps_pu->mv.i1_l1_ref_idx];
+ chroma_offset_l1_cr = ps_slice_hdr->s_wt_ofst.i2_chroma_offset_l1_cr[ps_pu->mv.i1_l1_ref_idx];
+ }
+
+ /*luma and chroma components*/
+ for(clr_indx = 0; clr_indx < 2; clr_indx++)
+ {
+ PROFILE_DISABLE_INTER_PRED_LUMA(clr_indx);
+ PROFILE_DISABLE_INTER_PRED_CHROMA(clr_indx);
+
+ if(clr_indx == 0)
+ {
+ WORD32 mv;
+ if(ps_pu->b2_pred_mode != PRED_L1)
+ {
+ mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+ ai2_xint[0] = pu_x + (mv >> 2);
+ ai2_xfrac[0] = mv & 3;
+
+ mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+ ai2_yint[0] = pu_y + (mv >> 2);
+ ai2_yfrac[0] = mv & 3;
+
+ ai2_xfrac[0] &= ps_codec->i4_mv_frac_mask;
+ ai2_yfrac[0] &= ps_codec->i4_mv_frac_mask;
+
+
+ ref_pic_l0 = ref_pic_luma_l0 + ai2_yint[0] * ref_strd
+ + ai2_xint[0];
+ }
+
+ if(ps_pu->b2_pred_mode != PRED_L0)
+ {
+
+ mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+ ai2_xint[1] = pu_x + (mv >> 2);
+ ai2_xfrac[1] = mv & 3;
+
+ mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+ ai2_yint[1] = pu_y + (mv >> 2);
+ ai2_yfrac[1] = mv & 3;
+
+ ref_pic_l1 = ref_pic_luma_l1 + ai2_yint[1] * ref_strd
+ + ai2_xint[1];
+ ai2_xfrac[1] &= ps_codec->i4_mv_frac_mask;
+ ai2_yfrac[1] &= ps_codec->i4_mv_frac_mask;
+
+ }
+
+ pu1_dst = pu1_dst_luma + pu_y * ref_strd + pu_x;
+
+ ntaps = NTAPS_LUMA;
+ coeff = luma_filter;
+ }
+
+ else
+ {
+ WORD32 mv;
+                /* xint is upshifted by 1 because the chroma components are */
+                /* interleaved, which is not the layout assumed by the standard */
+ if(ps_pu->b2_pred_mode != PRED_L1)
+ {
+ mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+ ai2_xint[0] = (pu_x / 2 + (mv >> 3)) << 1;
+ ai2_xfrac[0] = mv & 7;
+
+ mv = CLIP3(ps_pu->mv.s_l0_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+ ai2_yint[0] = pu_y / 2 + (mv >> 3);
+ ai2_yfrac[0] = mv & 7;
+
+ ref_pic_l0 = ref_pic_chroma_l0 + ai2_yint[0] * ref_strd
+ + ai2_xint[0];
+
+ ai2_xfrac[0] &= ps_codec->i4_mv_frac_mask;
+ ai2_yfrac[0] &= ps_codec->i4_mv_frac_mask;
+
+ }
+
+ if(ps_pu->b2_pred_mode != PRED_L0)
+ {
+ mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvx, (-((MAX_CTB_SIZE + pu_x + 7) << 2)), ((ps_sps->i2_pic_width_in_luma_samples - pu_x + 7) << 2));
+ ai2_xint[1] = (pu_x / 2 + (mv >> 3)) << 1;
+ ai2_xfrac[1] = mv & 7;
+
+ mv = CLIP3(ps_pu->mv.s_l1_mv.i2_mvy, (-((MAX_CTB_SIZE + pu_y + 7) << 2)), ((ps_sps->i2_pic_height_in_luma_samples - pu_y + 7) << 2));
+ ai2_yint[1] = pu_y / 2 + (mv >> 3);
+ ai2_yfrac[1] = mv & 7;
+
+ ref_pic_l1 = ref_pic_chroma_l1 + ai2_yint[1] * ref_strd
+ + ai2_xint[1];
+ ai2_xfrac[1] &= ps_codec->i4_mv_frac_mask;
+ ai2_yfrac[1] &= ps_codec->i4_mv_frac_mask;
+
+ }
+
+ pu1_dst = pu1_dst_chroma + pu_y * ref_strd / 2 + pu_x;
+
+ ntaps = NTAPS_CHROMA;
+ coeff = chroma_filter;
+ }
+
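+            /* Index arithmetic for the kernel table (an inference from the  */
+            /* expressions below, not a normative mapping): 11 * clr_indx    */
+            /* splits luma (0..10) from chroma (11..21); +4 picks the 16-bit */
+            /* output variants needed when weighting or averaging follows;   */
+            /* +2/+1 encode non-zero x/y fractions. When both fractions are  */
+            /* set, the second kernel (index 9 or 10) runs the vertical pass */
+            /* over the intermediate buffer.                                 */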
+ if(ps_pu->b2_pred_mode != PRED_L1)
+ {
+ func_indx1 = 4 * (weighted_pred || bi_pred) + 1 + 11 * clr_indx;
+ func_indx1 += ai2_xfrac[0] ? 2 : 0;
+ func_indx1 += ai2_yfrac[0] ? 1 : 0;
+
+ func_indx2 = (ai2_xfrac[0] && ai2_yfrac[0])
+ * (9 + (weighted_pred || bi_pred)) + 11 * clr_indx;
+
+ func_ptr1 = ps_codec->apf_inter_pred[func_indx1];
+ func_ptr2 = ps_codec->apf_inter_pred[func_indx2];
+ }
+ else
+ {
+ func_ptr1 = NULL;
+ func_ptr2 = NULL;
+ }
+ if(ps_pu->b2_pred_mode != PRED_L0)
+ {
+ func_indx3 = 4 * (weighted_pred || bi_pred) + 1 + 11 * clr_indx;
+ func_indx3 += ai2_xfrac[1] ? 2 : 0;
+ func_indx3 += ai2_yfrac[1] ? 1 : 0;
+
+ func_indx4 = (ai2_xfrac[1] && ai2_yfrac[1])
+ * (9 + (weighted_pred || bi_pred)) + 11 * clr_indx;
+
+ func_ptr3 = ps_codec->apf_inter_pred[func_indx3];
+ func_ptr4 = ps_codec->apf_inter_pred[func_indx4];
+ }
+ else
+ {
+ func_ptr3 = NULL;
+ func_ptr4 = NULL;
+ }
+
+ /*Function 1*/
+ if(func_ptr1 != NULL)
+ {
+ func_src_strd = ref_strd;
+ func_src = (ai2_xfrac[0] && ai2_yfrac[0]) ?
+ ref_pic_l0 - (ntaps / 2 - 1) * func_src_strd :
+ ref_pic_l0;
+ func_dst = (weighted_pred || bi_pred) ?
+ (void *)pi2_tmp1 : (void *)pu1_dst;
+ if(ai2_xfrac[0] && ai2_yfrac[0])
+ {
+ func_dst = pi2_tmp1;
+ }
+
+ func_dst_strd = (weighted_pred || bi_pred
+ || (ai2_xfrac[0] && ai2_yfrac[0])) ?
+ pu_wd : ref_strd;
+ func_coeff = ai2_xfrac[0] ?
+ coeff[ai2_xfrac[0]] : coeff[ai2_yfrac[0]];
+ func_wd = pu_wd >> clr_indx;
+ func_ht = pu_ht >> clr_indx;
+ func_ht += (ai2_xfrac[0] && ai2_yfrac[0]) ? ntaps - 1 : 0;
+ func_ptr1(func_src, func_dst, func_src_strd, func_dst_strd,
+ func_coeff, func_ht, func_wd);
+ }
+
+ /*Function 2*/
+ if(func_ptr2 != NULL)
+ {
+ func_src_strd = pu_wd;
+ func_src = pi2_tmp1 + (ntaps / 2 - 1) * func_src_strd;
+ func_dst = (weighted_pred || bi_pred) ?
+ (void *)pi2_tmp1 : (void *)pu1_dst;
+
+ func_dst_strd = (weighted_pred || bi_pred) ?
+ pu_wd : ref_strd;
+ func_coeff = coeff[ai2_yfrac[0]];
+ func_wd = pu_wd >> clr_indx;
+ func_ht = pu_ht >> clr_indx;
+ func_ptr2(func_src, func_dst, func_src_strd, func_dst_strd,
+ func_coeff, func_ht, func_wd);
+ }
+
+ if(func_ptr3 != NULL)
+ {
+ func_src_strd = ref_strd;
+ func_src = (ai2_xfrac[1] && ai2_yfrac[1]) ?
+ ref_pic_l1 - (ntaps / 2 - 1) * func_src_strd :
+ ref_pic_l1;
+
+ func_dst = (weighted_pred || bi_pred) ?
+ (void *)pi2_tmp2 : (void *)pu1_dst;
+ if(ai2_xfrac[1] && ai2_yfrac[1])
+ {
+ func_dst = pi2_tmp2;
+ }
+ func_dst_strd = (weighted_pred || bi_pred
+ || (ai2_xfrac[1] && ai2_yfrac[1])) ?
+ pu_wd : ref_strd;
+ func_coeff = ai2_xfrac[1] ?
+ coeff[ai2_xfrac[1]] : coeff[ai2_yfrac[1]];
+ func_wd = pu_wd >> clr_indx;
+ func_ht = pu_ht >> clr_indx;
+ func_ht += (ai2_xfrac[1] && ai2_yfrac[1]) ? ntaps - 1 : 0;
+ func_ptr3(func_src, func_dst, func_src_strd, func_dst_strd,
+ func_coeff, func_ht, func_wd);
+
+ }
+
+ if(func_ptr4 != NULL)
+ {
+ func_src_strd = pu_wd;
+ func_src = pi2_tmp2 + (ntaps / 2 - 1) * func_src_strd;
+
+ func_dst = (weighted_pred || bi_pred) ?
+ (void *)pi2_tmp2 : (void *)pu1_dst;
+ func_dst_strd = (weighted_pred || bi_pred) ?
+ pu_wd : ref_strd;
+ func_coeff = coeff[ai2_yfrac[1]];
+ func_wd = pu_wd >> clr_indx;
+ func_ht = pu_ht >> clr_indx;
+ func_ptr4(func_src, func_dst, func_src_strd, func_dst_strd,
+ func_coeff, func_ht, func_wd);
+
+ }
+
+ PROFILE_DISABLE_INTER_PRED_LUMA_AVERAGING(clr_indx);
+ PROFILE_DISABLE_INTER_PRED_CHROMA_AVERAGING(clr_indx);
+
+
+ if((weighted_pred != 0) && (bi_pred != 0))
+ {
+ lvl_shift1 = 0;
+ lvl_shift2 = 0;
+ if((0 == clr_indx) && (ai2_xfrac[0] && ai2_yfrac[0]))
+ lvl_shift1 = (1 << 13);
+
+ if((0 == clr_indx) && (ai2_xfrac[1] && ai2_yfrac[1]))
+ lvl_shift2 = (1 << 13);
+
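+                /* The (1 << 13) level shift appears to compensate the bias  */
+                /* of the 16-bit intermediate left by the two-stage luma     */
+                /* filtering; it is applied only when both MV fractions are  */
+                /* non-zero, i.e. when the vertical pass read pi2_tmp1/2.    */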
+
+ if(0 == clr_indx)
+ {
+ shift = ps_slice_hdr->s_wt_ofst.i1_luma_log2_weight_denom
+ + SHIFT_14_MINUS_BIT_DEPTH + 1;
+
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr(pi2_tmp1,
+ pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ pu_wd,
+ ref_strd,
+ luma_weight_l0,
+ luma_offset_l0,
+ luma_weight_l1,
+ luma_offset_l1,
+ shift,
+ lvl_shift1,
+ lvl_shift2,
+ pu_ht,
+ pu_wd);
+ }
+ else
+ {
+ shift = ps_slice_hdr->s_wt_ofst.i1_chroma_log2_weight_denom
+ + SHIFT_14_MINUS_BIT_DEPTH + 1;
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr(pi2_tmp1,
+ pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ pu_wd,
+ ref_strd,
+ chroma_weight_l0_cr,
+ chroma_weight_l0_cb,
+ chroma_offset_l0_cr,
+ chroma_offset_l0_cb,
+ chroma_weight_l1_cr,
+ chroma_weight_l1_cb,
+ chroma_offset_l1_cr,
+ chroma_offset_l1_cb,
+ shift,
+ lvl_shift1,
+ lvl_shift2,
+ pu_ht >> 1,
+ pu_wd >> 1);
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr(pi2_tmp1,
+ pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ pu_wd,
+ ref_strd,
+ chroma_weight_l0_cb,
+ chroma_weight_l0_cr,
+ chroma_offset_l0_cb,
+ chroma_offset_l0_cr,
+ chroma_weight_l1_cb,
+ chroma_weight_l1_cr,
+ chroma_offset_l1_cb,
+ chroma_offset_l1_cr,
+ shift,
+ lvl_shift1,
+ lvl_shift2,
+ pu_ht >> 1,
+ pu_wd >> 1);
+ }
+ }
+ }
+
+ else if((weighted_pred != 0) && (bi_pred == 0))
+ {
+ lvl_shift1 = 0;
+ if(ps_pu->b2_pred_mode == PRED_L0)
+ {
+ if((0 == clr_indx) && (ai2_xfrac[0] && ai2_yfrac[0]))
+ lvl_shift1 = (1 << 13);
+ }
+ else
+ {
+ if((0 == clr_indx) && (ai2_xfrac[1] && ai2_yfrac[1]))
+ lvl_shift1 = (1 << 13);
+ }
+
+ if(0 == clr_indx)
+ {
+ shift = ps_slice_hdr->s_wt_ofst.i1_luma_log2_weight_denom
+ + SHIFT_14_MINUS_BIT_DEPTH;
+
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr(ps_pu->b2_pred_mode == PRED_L0 ? pi2_tmp1 : pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ ref_strd,
+ ps_pu->b2_pred_mode == PRED_L0 ? luma_weight_l0 : luma_weight_l1,
+ ps_pu->b2_pred_mode == PRED_L0 ? luma_offset_l0 : luma_offset_l1,
+ shift,
+ lvl_shift1,
+ pu_ht,
+ pu_wd);
+ }
+ else
+ {
+ shift = ps_slice_hdr->s_wt_ofst.i1_chroma_log2_weight_denom
+ + SHIFT_14_MINUS_BIT_DEPTH;
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr(ps_pu->b2_pred_mode == PRED_L0 ? pi2_tmp1 : pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ ref_strd,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cr : chroma_weight_l1_cr,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cb : chroma_weight_l1_cb,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cr : chroma_offset_l1_cr,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cb : chroma_offset_l1_cb,
+ shift,
+ lvl_shift1,
+ pu_ht >> 1,
+ pu_wd >> 1);
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr(ps_pu->b2_pred_mode == PRED_L0 ? pi2_tmp1 : pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ ref_strd,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cb : chroma_weight_l1_cb,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_weight_l0_cr : chroma_weight_l1_cr,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cb : chroma_offset_l1_cb,
+ ps_pu->b2_pred_mode == PRED_L0 ? chroma_offset_l0_cr : chroma_offset_l1_cr,
+ shift,
+ lvl_shift1,
+ pu_ht >> 1,
+ pu_wd >> 1);
+ }
+ }
+ }
+
+ else if((weighted_pred == 0) && (bi_pred != 0))
+ {
+ lvl_shift1 = 0;
+ lvl_shift2 = 0;
+ if((0 == clr_indx) && (ai2_xfrac[0] && ai2_yfrac[0]))
+ lvl_shift1 = (1 << 13);
+
+ if((0 == clr_indx) && (ai2_xfrac[1] && ai2_yfrac[1]))
+ lvl_shift2 = (1 << 13);
+
+ if(clr_indx != 0)
+ {
+ pu_ht = (pu_ht >> 1);
+ }
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr(pi2_tmp1,
+ pi2_tmp2,
+ pu1_dst,
+ pu_wd,
+ pu_wd,
+ ref_strd,
+ lvl_shift1,
+ lvl_shift2,
+ pu_ht,
+ pu_wd);
+
+ }
+ }
+ }
+}
diff --git a/decoder/ihevcd_inter_pred.h b/decoder/ihevcd_inter_pred.h
new file mode 100644
index 0000000..0c510d2
--- /dev/null
+++ b/decoder/ihevcd_inter_pred.h
@@ -0,0 +1,43 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_inter_pred.h
+*
+* @brief
+* Inter prediction of a CTB
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* - ihevcd_inter_pred_ctb()
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef IHEVCD_INTER_PRED_H_
+#define IHEVCD_INTER_PRED_H_
+
+void ihevcd_inter_pred_ctb(process_ctxt_t *ps_proc);
+
+
+#endif /* IHEVCD_INTER_PRED_H_ */
diff --git a/decoder/ihevcd_intra_pred_mode_prediction.c b/decoder/ihevcd_intra_pred_mode_prediction.c
new file mode 100644
index 0000000..b5936c4
--- /dev/null
+++ b/decoder/ihevcd_intra_pred_mode_prediction.c
@@ -0,0 +1,323 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_intra_pred_mode_prediction.c
+ *
+ * @brief
+ * Contains functions for intra pred mode prediction
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_intra_pred_mode_prediction()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevcd_defs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+
+#include "ihevcd_bitstream.h"
+
+
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Availability check is not done inside the function */
+/* Whenever the top and left are not available, it is assumed that Intra DC */
+/* mode will be initialized in place of the non-available */
+/* neighbors */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief Computes intra prediction mode for a CU
+*
+* @par Description
+* Computes intra prediction mode for a CU
+*
+* @param[in,out] ps_cu
+*  Coding unit context
+*
+* @param[in] ps_parse
+* parse context
+*
+* @param[in] ps_codec
+* codec context
+*
+* @param[in] log2_cb_size
+* log of cb size base 2
+*  Log to base 2 of the coding block size
+* @returns none
+*
+* @remarks
+* Availability check is moved to CTB level. If the neighbors are
+* not available or if the pred mode of neighbor is not MODE_INTRA,
+* INTRA_DC mode will be updated in top and left buffers.
+*******************************************************************************
+*/
+void ihevcd_intra_pred_mode_prediction(codec_t *ps_codec,
+ WORD32 log2_cb_size,
+ WORD32 x0,
+ WORD32 y0)
+{
+ WORD32 i, j, num_pred_blocks;
+ WORD32 available_l, available_t;
+ WORD32 cand_intra_pred_mode_l, cand_intra_pred_mode_t;
+ WORD32 cand_mode_list[3];
+ WORD32 cb_size, block_offset_in_min_pu;
+ UWORD8 *pu1_luma_intra_pred_mode_top;
+ UWORD8 *pu1_luma_intra_pred_mode_left;
+
+ parse_ctxt_t *ps_parse = &ps_codec->s_parse;
+ parse_cu_t *ps_cu = &ps_codec->s_parse.s_cu;
+ sps_t *ps_sps = ps_parse->ps_sps;
+
+
+ available_t = 1;
+ available_l = 1;
+ /* i4_pos_x and i4_pos_y are in minCu units (8x8), convert them to 4x4 units by multiplying by 2 */
+ pu1_luma_intra_pred_mode_top = ps_parse->pu1_luma_intra_pred_mode_top
+ + (ps_cu->i4_pos_x * 2);
+
+ pu1_luma_intra_pred_mode_left = ps_parse->pu1_luma_intra_pred_mode_left
+ + (ps_cu->i4_pos_y * 2);
+
+/*
+ if(0 == ps_cu->i4_pos_y)
+ {
+ memset(pu1_luma_intra_pred_mode_top, INTRA_DC, 16);
+ }
+
+ if(0 == ps_cu->i4_pos_x)
+ {
+ memset(pu1_luma_intra_pred_mode_left, INTRA_DC, 16);
+ }
+*/
+ if(ps_cu->i4_pos_y)
+ {
+ UWORD8 *pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+ WORD32 top_intra_flag;
+
+ WORD32 numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+ pu1_pic_intra_flag += ((y0 - 8) / 8) * numbytes_row;
+ pu1_pic_intra_flag += (x0 / 64);
+ top_intra_flag = *pu1_pic_intra_flag;
+ top_intra_flag &= (1 << ((x0 / 8) % 8));
+
+ if(0 == top_intra_flag)
+ {
+ available_t = 0;
+ }
+ }
+ else
+ available_t = 0;
+
+
+ if((0 == ps_cu->i4_pos_x) && (((0 == ps_codec->s_parse.i4_ctb_slice_x) && (0 == ps_codec->s_parse.i4_ctb_slice_y)) ||
+ (0 == ps_codec->s_parse.i4_ctb_tile_x)))
+ {
+ available_l = 0;
+ }
+
+ if(available_l)
+ {
+ UWORD8 *pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+ WORD32 left_intra_flag;
+ WORD32 numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+ pu1_pic_intra_flag += (y0 / 8) * numbytes_row;
+ pu1_pic_intra_flag += ((x0 - 8) / 64);
+ left_intra_flag = *pu1_pic_intra_flag;
+ left_intra_flag &= (1 << (((x0 - 8) / 8) % 8));
+
+ if(0 == left_intra_flag)
+ {
+ available_l = 0;
+ }
+ }
+
+ cb_size = (1 << log2_cb_size);
+
+ block_offset_in_min_pu = (cb_size / 2) / MIN_PU_SIZE;
+
+ num_pred_blocks = (ps_cu->i4_part_mode == PART_NxN) ? 2 : 1;
+
+ for(i = 0; i < num_pred_blocks; i++)
+ {
+ WORD32 available_l_tmp;
+ available_l_tmp = available_l;
+ for(j = 0; j < num_pred_blocks; j++)
+ {
+ /* Computing Candidate intra pred mode left */
+ {
+ WORD32 block_offset;
+
+ block_offset = i * block_offset_in_min_pu;
+ cand_intra_pred_mode_l = INTRA_DC;
+ if(available_l_tmp)
+ {
+ cand_intra_pred_mode_l =
+ pu1_luma_intra_pred_mode_left[block_offset];
+ }
+
+ }
+
+ {
+ WORD32 block_offset;
+ block_offset = j * block_offset_in_min_pu;
+ cand_intra_pred_mode_t = INTRA_DC;
+ if(available_t)
+ {
+ cand_intra_pred_mode_t =
+ pu1_luma_intra_pred_mode_top[block_offset];
+ }
+ }
+
+ /* Computing Candidate mode list */
+ if(cand_intra_pred_mode_l == cand_intra_pred_mode_t)
+ {
+ if(cand_intra_pred_mode_l < 2)
+ {
+ cand_mode_list[0] = INTRA_PLANAR;
+ cand_mode_list[1] = INTRA_DC;
+ cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
+ }
+ else
+ {
+ cand_mode_list[0] = cand_intra_pred_mode_l;
+ cand_mode_list[1] = 2
+ + ((cand_intra_pred_mode_l + 29) % 32);
+ cand_mode_list[2] = 2
+ + ((cand_intra_pred_mode_l - 2 + 1) % 32);
+ }
+ }
+ else
+ {
+ cand_mode_list[0] = cand_intra_pred_mode_l;
+ cand_mode_list[1] = cand_intra_pred_mode_t;
+
+ if((cand_intra_pred_mode_l != INTRA_PLANAR)
+ && (cand_intra_pred_mode_t != INTRA_PLANAR))
+ {
+ cand_mode_list[2] = INTRA_PLANAR;
+ }
+ else if((cand_intra_pred_mode_l != INTRA_DC)
+ && (cand_intra_pred_mode_t != INTRA_DC))
+ {
+ cand_mode_list[2] = INTRA_DC;
+ }
+ else
+ {
+ cand_mode_list[2] = INTRA_ANGULAR(26);
+ }
+ }
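+            /* Worked example: if both neighbours report ANGULAR(10), the    */
+            /* list is {10, 2 + ((10 + 29) % 32), 2 + ((10 - 2 + 1) % 32)}   */
+            /* = {10, 9, 11}: the mode itself plus its two angular           */
+            /* neighbours.                                                   */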
+
+ /* Computing Intra pred mode */
+ if(ps_cu->ai4_prev_intra_luma_pred_flag[2 * i + j] == 1)
+ {
+ ps_cu->ai4_intra_luma_pred_mode[2 * i + j] =
+ cand_mode_list[ps_cu->ai4_mpm_idx[2 * i + j]];
+ }
+ else
+ {
+ WORD32 intra_pred_mode;
+ /* Arranging cand_mode_list in increasing order */
+ if(cand_mode_list[0] > cand_mode_list[1])
+ {
+ SWAP(cand_mode_list[0], cand_mode_list[1]);
+ }
+ if(cand_mode_list[0] > cand_mode_list[2])
+ {
+ SWAP(cand_mode_list[0], cand_mode_list[2]);
+ }
+ if(cand_mode_list[1] > cand_mode_list[2])
+ {
+ SWAP(cand_mode_list[1], cand_mode_list[2]);
+ }
+
+ intra_pred_mode = ps_cu->ai4_rem_intra_luma_pred_mode[2 * i + j];
+
+ if(intra_pred_mode >= cand_mode_list[0])
+ intra_pred_mode++;
+
+ if(intra_pred_mode >= cand_mode_list[1])
+ intra_pred_mode++;
+
+ if(intra_pred_mode >= cand_mode_list[2])
+ intra_pred_mode++;
+
+ ps_cu->ai4_intra_luma_pred_mode[2 * i + j] = intra_pred_mode;
+ }
+ /* Update Top and Left intra pred mode */
+ {
+ WORD32 intra_pred_mode;
+
+ intra_pred_mode = ps_cu->ai4_intra_luma_pred_mode[2 * i + j];
+
+ ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_left + i * block_offset_in_min_pu, intra_pred_mode, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+ ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_top + j * block_offset_in_min_pu, intra_pred_mode, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+
+ }
+ /* If partition is PART_NxN, then left is available for second column always */
+ available_l_tmp = 1;
+
+ }
+ /* If partition is PART_NxN, then top is available for bottom row always */
+ available_t = 1;
+ }
+
+ /* In case it is PART_2Nx2N partition, replicate intra pred mode in other three entries */
+ if(ps_cu->i4_part_mode == PART_2Nx2N)
+ {
+ ps_cu->ai4_intra_luma_pred_mode[1] = ps_cu->ai4_intra_luma_pred_mode[0];
+ ps_cu->ai4_intra_luma_pred_mode[2] = ps_cu->ai4_intra_luma_pred_mode[0];
+ ps_cu->ai4_intra_luma_pred_mode[3] = ps_cu->ai4_intra_luma_pred_mode[0];
+ }
+}
+
diff --git a/decoder/ihevcd_intra_pred_mode_prediction.h b/decoder/ihevcd_intra_pred_mode_prediction.h
new file mode 100644
index 0000000..683a7f4
--- /dev/null
+++ b/decoder/ihevcd_intra_pred_mode_prediction.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_intra_pred_mode_prediction.h
+ *
+ * @brief
+ * Contains functions for intra pred mode prediction
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_intra_pred_mode_prediction()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_INTRA_PRED_MODE_PREDICTION_H_
+#define _IHEVCD_INTRA_PRED_MODE_PREDICTION_H_
+void ihevcd_intra_pred_mode_prediction(codec_t *ps_codec,
+ WORD32 log2_cb_size,
+ WORD32 x0,
+ WORD32 y0);
+
+#endif /* _IHEVCD_INTRA_PRED_MODE_PREDICTION_H_ */
diff --git a/decoder/ihevcd_iquant_itrans_recon_ctb.c b/decoder/ihevcd_iquant_itrans_recon_ctb.c
new file mode 100644
index 0000000..1596660
--- /dev/null
+++ b/decoder/ihevcd_iquant_itrans_recon_ctb.c
@@ -0,0 +1,1273 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_iquant_itrans_recon_ctb.c
+ *
+ * @brief
+ * Contains functions for inverse quantization, inverse transform and recon
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_iquant_itrans_recon_ctb()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+#include "ihevc_common_tables.h"
+
+/* Intra pred includes */
+#include "ihevc_intra_pred.h"
+
+/* Inverse transform common module includes */
+#include "ihevc_trans_tables.h"
+#include "ihevc_trans_macros.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_chroma_recon.h"
+
+/* Decoder includes */
+#include "ihevcd_common_tables.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_statistics.h"
+#include "ihevcd_itrans_recon_dc.h"
+
+const UWORD32 gau4_ihevcd_4_bit_reverse[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+
+
+/* Globals */
+WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
+ { IP_FUNC_MODE_0, /* Mode 0 */
+ IP_FUNC_MODE_1, /* Mode 1 */
+ IP_FUNC_MODE_2, /* Mode 2 */
+ IP_FUNC_MODE_3TO9, /* Mode 3 */
+ IP_FUNC_MODE_3TO9, /* Mode 4 */
+ IP_FUNC_MODE_3TO9, /* Mode 5 */
+ IP_FUNC_MODE_3TO9, /* Mode 6 */
+ IP_FUNC_MODE_3TO9, /* Mode 7 */
+ IP_FUNC_MODE_3TO9, /* Mode 8 */
+ IP_FUNC_MODE_3TO9, /* Mode 9 */
+ IP_FUNC_MODE_10, /* Mode 10 */
+ IP_FUNC_MODE_11TO17, /* Mode 11 */
+ IP_FUNC_MODE_11TO17, /* Mode 12 */
+ IP_FUNC_MODE_11TO17, /* Mode 13 */
+ IP_FUNC_MODE_11TO17, /* Mode 14 */
+ IP_FUNC_MODE_11TO17, /* Mode 15 */
+ IP_FUNC_MODE_11TO17, /* Mode 16 */
+ IP_FUNC_MODE_11TO17, /* Mode 17 */
+ IP_FUNC_MODE_18_34, /* Mode 18 */
+ IP_FUNC_MODE_19TO25, /* Mode 19 */
+ IP_FUNC_MODE_19TO25, /* Mode 20 */
+ IP_FUNC_MODE_19TO25, /* Mode 21 */
+ IP_FUNC_MODE_19TO25, /* Mode 22 */
+ IP_FUNC_MODE_19TO25, /* Mode 23 */
+ IP_FUNC_MODE_19TO25, /* Mode 24 */
+ IP_FUNC_MODE_19TO25, /* Mode 25 */
+ IP_FUNC_MODE_26, /* Mode 26 */
+ IP_FUNC_MODE_27TO33, /* Mode 27 */
+      IP_FUNC_MODE_27TO33, /* Mode 28 */
+ IP_FUNC_MODE_27TO33, /* Mode 29 */
+ IP_FUNC_MODE_27TO33, /* Mode 30 */
+ IP_FUNC_MODE_27TO33, /* Mode 31 */
+ IP_FUNC_MODE_27TO33, /* Mode 32 */
+ IP_FUNC_MODE_27TO33, /* Mode 33 */
+ IP_FUNC_MODE_18_34, /* Mode 34 */
+};
+
+
+const WORD16 *g_ai2_ihevc_trans_tables[] =
+ { &g_ai2_ihevc_trans_dst_4[0][0],
+ &g_ai2_ihevc_trans_4[0][0],
+ &g_ai2_ihevc_trans_8[0][0],
+ &g_ai2_ihevc_trans_16[0][0],
+ &g_ai2_ihevc_trans_32[0][0]
+};
+
+
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+/* Unpacks the coded coefficients of one TU; returns the updated coefficient data pointer */
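+/* Packed layout of pu1_tu_coeff_data, as parsed below: one byte holding    */
+/* the number of coded sub-blocks, one byte whose bit 0 is transform_skip   */
+/* and whose upper bits give the scan type, then one tu_sblk_coeff_data_t   */
+/* per coded sub-block (position, significant-coefficient map, levels).     */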
+UWORD8* ihevcd_unpack_coeffs(WORD16 *pi2_tu_coeff,
+ WORD32 log2_trans_size,
+ UWORD8 *pu1_tu_coeff_data,
+ WORD16 *pi2_dequant_matrix,
+ WORD32 qp_rem,
+ WORD32 qp_div,
+ TRANSFORM_TYPE e_trans_type,
+ WORD32 trans_quant_bypass,
+ UWORD32 *pu4_zero_cols,
+ UWORD32 *pu4_zero_rows,
+ UWORD32 *pu4_coeff_type,
+ WORD16 *pi2_coeff_value)
+{
+ /* Generating coeffs from coeff-map */
+ WORD32 i;
+ WORD16 *pi2_sblk_ptr;
+ WORD32 subblk_pos_x, subblk_pos_y;
+ WORD32 sblk_scan_idx, coeff_raster_idx;
+ WORD32 sblk_non_zero_coeff_idx;
+ tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+ UWORD8 u1_num_coded_sblks, u1_scan_type;
+ UWORD8 *pu1_new_tu_coeff_data;
+ WORD32 trans_size;
+ WORD32 xs, ys;
+ WORD32 trans_skip;
+ WORD16 iquant_out;
+ WORD32 shift_iq;
+ {
+ WORD32 bit_depth;
+
+ bit_depth = 8 + 0;
+ shift_iq = bit_depth + log2_trans_size - 5;
+ }
+ trans_size = (1 << log2_trans_size);
+
+ /* First byte points to number of coded blocks */
+ u1_num_coded_sblks = *pu1_tu_coeff_data++;
+
+ /* Next byte points to scan type */
+ u1_scan_type = *pu1_tu_coeff_data++;
+ /* 0th bit has trans_skip */
+ trans_skip = u1_scan_type & 1;
+ u1_scan_type >>= 1;
+
+ pi2_sblk_ptr = pi2_tu_coeff;
+
+ /* Initially all columns are assumed to be zero */
+ *pu4_zero_cols = 0xFFFFFFFF;
+ /* Initially all rows are assumed to be zero */
+ *pu4_zero_rows = 0xFFFFFFFF;
+
+ ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)(pu1_tu_coeff_data);
+
+ if(trans_skip)
+ memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
+
+ STATS_INIT_SBLK_AND_COEFF_POS();
+
+ /* DC only case */
+ if((e_trans_type != DST_4x4) && (1 == u1_num_coded_sblks)
+ && (0 == ps_tu_sblk_coeff_data->u2_subblk_pos)
+ && (1 == ps_tu_sblk_coeff_data->u2_sig_coeff_map))
+ {
+ *pu4_coeff_type = 1;
+
+ if(!trans_quant_bypass)
+ {
+ if(4 == trans_size)
+ {
+ IQUANT_4x4(iquant_out,
+ ps_tu_sblk_coeff_data->ai2_level[0],
+ pi2_dequant_matrix[0]
+ * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ }
+ else
+ {
+ IQUANT(iquant_out, ps_tu_sblk_coeff_data->ai2_level[0],
+ pi2_dequant_matrix[0] * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ }
+ if(trans_skip)
+ iquant_out = (iquant_out + 16) >> 5;
+ }
+ else
+ {
+ /* setting the column to zero */
+ for(i = 0; i < trans_size; i++)
+ *(pi2_tu_coeff + i * trans_size) = 0;
+
+ iquant_out = ps_tu_sblk_coeff_data->ai2_level[0];
+ }
+ *pi2_coeff_value = iquant_out;
+ *pi2_tu_coeff = iquant_out;
+ *pu4_zero_cols &= ~0x1;
+ *pu4_zero_rows &= ~0x1;
+ ps_tu_sblk_coeff_data =
+ (void *)&ps_tu_sblk_coeff_data->ai2_level[1];
+
+ STATS_UPDATE_COEFF_COUNT();
+ STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass), 0, 0);
+ STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
+ return ((UWORD8 *)ps_tu_sblk_coeff_data);
+ }
+ else
+ {
+ *pu4_coeff_type = 0;
+ /* In case of trans skip, memset has already happened */
+ if(!trans_skip)
+ memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
+ }
+
+ for(i = 0; i < u1_num_coded_sblks; i++)
+ {
+ UWORD32 u4_sig_coeff_map;
+ subblk_pos_x = ps_tu_sblk_coeff_data->u2_subblk_pos & 0x00FF;
+ subblk_pos_y = (ps_tu_sblk_coeff_data->u2_subblk_pos & 0xFF00) >> 8;
+
+ STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass), subblk_pos_x, subblk_pos_y);
+
+ subblk_pos_x = subblk_pos_x * MIN_TU_SIZE;
+ subblk_pos_y = subblk_pos_y * MIN_TU_SIZE;
+
+ pi2_sblk_ptr = pi2_tu_coeff + subblk_pos_y * trans_size
+ + subblk_pos_x;
+
+ //*pu4_zero_cols &= ~(0xF << subblk_pos_x);
+
+ sblk_non_zero_coeff_idx = 0;
+ u4_sig_coeff_map = ps_tu_sblk_coeff_data->u2_sig_coeff_map;
+ //for(sblk_scan_idx = (31 - CLZ(u4_sig_coeff_map)); sblk_scan_idx >= 0; sblk_scan_idx--)
+ sblk_scan_idx = 31;
+ do
+ {
+ WORD32 clz = CLZ(u4_sig_coeff_map);
+
+ sblk_scan_idx -= clz;
+            /* when clz is 31, u4_sig_coeff_map << (clz+1) would be undefined behaviour in some cases */
+ /* Hence either use SHL which takes care of handling these issues based on platform or shift in two stages */
+ u4_sig_coeff_map = u4_sig_coeff_map << clz;
+ /* Copying coeffs and storing in reverse order */
+ {
+ STATS_UPDATE_COEFF_COUNT();
+ coeff_raster_idx =
+ gau1_ihevc_invscan4x4[u1_scan_type][sblk_scan_idx];
+
+ xs = coeff_raster_idx & 0x3;
+ ys = coeff_raster_idx >> 2;
+
+ if(!trans_quant_bypass)
+ {
+ if(4 == trans_size)
+ {
+ IQUANT_4x4(iquant_out,
+ ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
+ pi2_dequant_matrix[(subblk_pos_x + xs)
+ + (subblk_pos_y + ys)
+ * trans_size]
+ * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ sblk_non_zero_coeff_idx++;
+ }
+ else
+ {
+ IQUANT(iquant_out,
+ ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
+ pi2_dequant_matrix[(subblk_pos_x + xs)
+ + (subblk_pos_y + ys)
+ * trans_size]
+ * g_ihevc_iquant_scales[qp_rem],
+ shift_iq, qp_div);
+ sblk_non_zero_coeff_idx++;
+ }
+
+ if(trans_skip)
+ iquant_out = (iquant_out + 16) >> 5;
+ }
+ else
+ {
+ iquant_out = ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx++];
+ }
+ *pu4_zero_cols &= ~(0x1 << (subblk_pos_x + xs));
+ *pu4_zero_rows &= ~(0x1 << (subblk_pos_y + ys));
+ *(pi2_sblk_ptr + xs + ys * trans_size) = iquant_out;
+ }
+ sblk_scan_idx--;
+ u4_sig_coeff_map <<= 1;
+
+ }while(u4_sig_coeff_map);
+ /* Updating the sblk pointer */
+ ps_tu_sblk_coeff_data =
+ (void *)&ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx];
+ }
+
+ STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
+
+ pu1_new_tu_coeff_data = (UWORD8 *)ps_tu_sblk_coeff_data;
+
+ return pu1_new_tu_coeff_data;
+}
+
+WORD32 ihevcd_get_intra_nbr_flag(process_ctxt_t *ps_proc,
+ tu_t *ps_tu,
+ UWORD32 *pu4_intra_nbr_avail,
+ WORD16 i2_pic_width_in_luma_samples,
+ UWORD8 i1_constrained_intra_pred_flag,
+ WORD32 trans_size,
+ WORD32 ctb_size)
+{
+ sps_t *ps_sps;
+ UWORD8 u1_bot_lt_avail, u1_left_avail, u1_top_avail, u1_top_rt_avail,
+ u1_top_lt_avail;
+ WORD32 x_cur, y_cur, x_nbr, y_nbr;
+ UWORD8 *pu1_nbr_intra_flag;
+ UWORD8 *pu1_pic_intra_flag;
+ UWORD8 top_right, top, top_left, left, bot_left;
+ WORD32 intra_pos;
+ WORD32 num_8_blks, num_8_blks_in_bits;
+ WORD32 numbytes_row = (i2_pic_width_in_luma_samples + 63) / 64;
+ WORD32 cur_x, cur_y;
+ WORD32 i;
+ WORD32 nbr_flags;
+
+ ps_sps = ps_proc->ps_sps;
+ cur_x = ps_tu->b4_pos_x;
+ cur_y = ps_tu->b4_pos_y;
+
+ u1_bot_lt_avail = (pu4_intra_nbr_avail[1 + cur_y + trans_size / MIN_TU_SIZE]
+ >> (31 - (1 + cur_x - 1))) & 1;
+ u1_left_avail = (pu4_intra_nbr_avail[1 + cur_y] >> (31 - (1 + cur_x - 1)))
+ & 1;
+ u1_top_avail = (pu4_intra_nbr_avail[1 + cur_y - 1] >> (31 - (1 + cur_x)))
+ & 1;
+ u1_top_rt_avail = (pu4_intra_nbr_avail[1 + cur_y - 1]
+ >> (31 - (1 + cur_x + trans_size / MIN_TU_SIZE))) & 1;
+ u1_top_lt_avail = (pu4_intra_nbr_avail[1 + cur_y - 1]
+ >> (31 - (1 + cur_x - 1))) & 1;
+
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf(" Before constrained intra pred. BL:%d,L:%d,T:%d,TR:%d,TL:%d\n", u1_bot_lt_avail, u1_left_avail, u1_top_avail, u1_top_rt_avail, u1_top_lt_avail);
+#endif
+ x_cur = ps_proc->i4_ctb_x * ctb_size + cur_x * MIN_TU_SIZE;
+ y_cur = ps_proc->i4_ctb_y * ctb_size + cur_y * MIN_TU_SIZE;
+
+ pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
+
+ /* WORD32 nbr_flags as below MSB --> LSB */
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ * 1 4 4 4 4
+ */
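+    /* For example, a 4x4 TU with all five neighbours available packs to    */
+    /* 0x1FFFF, the fully-available pattern tested before intra reference   */
+    /* substitution in ihevcd_iquant_itrans_recon_ctb().                    */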
+ bot_left = 0;
+ left = 0;
+ top_right = 0;
+ top = 0;
+ top_left = 0;
+
+ num_8_blks = trans_size > 4 ? trans_size / 8 : 1;
+ num_8_blks_in_bits = ((1 << num_8_blks) - 1);
+
+ if(i1_constrained_intra_pred_flag)
+ {
+ /* TODO: constrained intra pred not tested */
+ if(u1_bot_lt_avail)
+ {
+ x_nbr = x_cur - 1;
+ y_nbr = y_cur + trans_size;
+
+ pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+ + x_nbr / 64;
+ intra_pos = ((x_nbr / 8) % 8);
+ for(i = 0; i < num_8_blks; i++)
+ {
+ bot_left |= ((*(pu1_nbr_intra_flag + i * numbytes_row)
+ >> intra_pos) & 1) << i;
+ }
+ bot_left &= num_8_blks_in_bits;
+ }
+ if(u1_left_avail)
+ {
+ x_nbr = x_cur - 1;
+ y_nbr = y_cur;
+
+ pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+ + x_nbr / 64;
+ intra_pos = ((x_nbr / 8) % 8);
+
+ for(i = 0; i < num_8_blks; i++)
+ {
+ left |= ((*(pu1_nbr_intra_flag + i * numbytes_row) >> intra_pos)
+ & 1) << i;
+ }
+ left &= num_8_blks_in_bits;
+ }
+ if(u1_top_avail)
+ {
+ x_nbr = x_cur;
+ y_nbr = y_cur - 1;
+
+ pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+ + x_nbr / 64;
+ intra_pos = ((x_nbr / 8) % 8);
+
+ top = (*pu1_nbr_intra_flag >> intra_pos);
+ top &= num_8_blks_in_bits;
+ /*
+ for(i=0;i<num_8_blks;i++)
+ {
+ top |= ( (*pu1_nbr_intra_flag >> (intra_pos+i)) & 1) << i;
+ }
+ */
+ }
+ if(u1_top_rt_avail)
+ {
+ x_nbr = x_cur + trans_size;
+ y_nbr = y_cur - 1;
+
+ pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+ + x_nbr / 64;
+ intra_pos = ((x_nbr / 8) % 8);
+
+ top_right = (*pu1_nbr_intra_flag >> intra_pos);
+ top_right &= num_8_blks_in_bits;
+ /*
+ for(i=0;i<num_8_blks;i++)
+ {
+ top_right |= ( (*pu1_nbr_intra_flag >> (intra_pos+i)) & 1) << i;
+ }
+ */
+ }
+ if(u1_top_lt_avail)
+ {
+ x_nbr = x_cur - 1;
+ y_nbr = y_cur - 1;
+
+ pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
+ + x_nbr / 64;
+ intra_pos = ((x_nbr / 8) % 8);
+
+ top_left = (*pu1_nbr_intra_flag >> intra_pos) & 1;
+ }
+ }
+ else
+ {
+ if(u1_top_avail)
+ top = 0xF;
+ if(u1_top_rt_avail)
+ top_right = 0xF;
+ if(u1_bot_lt_avail)
+ bot_left = 0xF;
+ if(u1_left_avail)
+ left = 0xF;
+ if(u1_top_lt_avail)
+ top_left = 0x1;
+ }
+
+ /* Handling incomplete CTBs */
+ {
+ WORD32 pu_size_limit = MIN(trans_size, 8);
+ WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples
+ - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size)
+ - (ps_tu->b4_pos_x * MIN_TU_SIZE)
+ - (1 << (ps_tu->b3_size + 2));
+ /* ctb_size_top gives number of valid pixels remaining in the current row */
+ WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
+ WORD32 ctb_size_top_bits = (1 << (ctb_size_top / pu_size_limit)) - 1;
+
+ WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+ - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size)
+ - (ps_tu->b4_pos_y * MIN_TU_SIZE)
+ - (1 << (ps_tu->b3_size + 2));
+ /* ctb_size_bot gives number of valid pixels remaining in the current column */
+ WORD32 ctb_size_bot = MIN(ctb_size, rows_remaining);
+ WORD32 ctb_size_bot_bits = (1 << (ctb_size_bot / pu_size_limit)) - 1;
+
+ top_right &= ctb_size_top_bits;
+ bot_left &= ctb_size_bot_bits;
+ }
+
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ * 1 4 4 4 4
+ */
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf(" After constrained intra pred. BL:%d,L:%d,T:%d,TR:%d,TL:%d\n", bot_left, left, top, top_right, top_left);
+#endif
+
+ /*
+ nbr_flags = (top_left << 16) | (gau4_ihevcd_4_bit_reverse[top_right] << 12) | (gau4_ihevcd_4_bit_reverse[top] << 8) | (gau4_ihevcd_4_bit_reverse[left] << 4)
+ | gau4_ihevcd_4_bit_reverse[bot_left];
+ */
+ nbr_flags = (top_left << 16) | (top_right << 12) | (top << 8) | (gau4_ihevcd_4_bit_reverse[left] << 4)
+ | gau4_ihevcd_4_bit_reverse[bot_left];
+
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n Luma nbr flags = %d", nbr_flags);
+#endif
+
+ return nbr_flags;
+
+}
+#if 0
+void ihevcd_itrans_recon_one_coeff(WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 log2_trans_size,
+ TRANSFORM_TYPE trans_type,
+ WORD32 coeff_x,
+ WORD32 coeff_y,
+ WORD16 i2_coeff_value,
+ WORD32 is_luma)
+{
+ WORD32 x, y;
+ WORD32 row, col;
+ WORD32 add, shift;
+ WORD32 quant_out;
+ WORD32 trans_size;
+ WORD16 *pi2_trans_table;
+ WORD32 trans_table_idx;
+ WORD32 itrans_out;
+ WORD32 col_mult = (is_luma == 1) ? 1 : 2;
+
+ x = coeff_x;
+ y = coeff_y;
+ trans_size = (1 << log2_trans_size);
+
+ if(DST_4x4 == trans_type)
+ {
+ trans_table_idx = 0;
+ }
+ else
+ {
+ trans_table_idx = log2_trans_size - 2 + 1;
+ }
+ pi2_trans_table = (WORD16 *)g_ai2_ihevc_trans_tables[trans_table_idx];
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+    /* Multiply trans table values in the yth row with quant_out and store in a temporary buffer */
+ for(col = 0; col < trans_size; col++)
+ {
+ pi2_tmp[col] = CLIP_S16(
+ (quant_out * pi2_trans_table[y * trans_size + col ] + add) >> shift);
+ }
+
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+
+    /* Multiply trans table values in the xth row with each value in the temporary buffer */
+ for(row = 0; row < trans_size; row++)
+ {
+ for(col = 0; col < trans_size; col++)
+ {
+ itrans_out = CLIP_S16(
+ (pi2_tmp[row] * pi2_trans_table[x * trans_size+ col ] + add)
+ >> shift);
+ pu1_dst[row * dst_strd + col * col_mult] = CLIP_U8( (pu1_pred[row * pred_strd + col * col_mult] + itrans_out));
+ }
+ }
+}
+#endif
+
+WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc)
+{
+ WORD16 *pi2_scaling_mat;
+ UWORD8 *pu1_y_dst_ctb;
+ UWORD8 *pu1_uv_dst_ctb;
+ WORD32 ctb_size;
+ codec_t *ps_codec;
+ slice_header_t *ps_slice_hdr;
+ tu_t *ps_tu;
+ WORD16 *pi2_ctb_coeff;
+ WORD32 tu_cnt;
+ WORD16 *pi2_tu_coeff;
+ WORD16 *pi2_tmp;
+ WORD32 pic_strd;
+ WORD32 luma_nbr_flags;
+ WORD32 chroma_nbr_flags = 0;
+ UWORD8 u1_luma_pred_mode_first_tu = 0;
+ /* Pointers for generating 2d coeffs from coeff-map */
+ UWORD8 *pu1_tu_coeff_data;
+ /* nbr avail map for CTB */
+ /* 1st bit points to neighbor (left/top_left/bot_left) */
+    /* TU availability starts at the 2nd bit from the MSB of each entry, with one bit per min TU in the CTB */
+    UWORD32 au4_intra_nbr_avail[MAX_CTB_SIZE / MIN_TU_SIZE
+                    + 2 /* Top nbr + bot nbr */];
+    UWORD32 top_avail_bits;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ WORD32 intra_flag;
+ UWORD8 *pu1_pic_intra_flag;
+ /*************************************************************************/
+    /* Contains scaling matrix offsets in the following order in a 1D buffer */
+ /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V */
+ /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V */
+ /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V */
+ /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V */
+ /* Intra 16x16 Y, 16x16 U, 16x16 V */
+ /* Inter 16x16 Y, 16x16 U, 16x16 V */
+ /* Intra 32x32 Y */
+ /* Inter 32x32 Y */
+ /*************************************************************************/
+ WORD32 scaling_mat_offset[] =
+ { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992,
+ 1248, 1504, 1760, 2016, 3040 };
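+    /* Example offset computation (mirroring the expressions used below):   */
+    /* intra 8x8 luma has log2_y_trans_size_minus_2 = 1 and c_idx = 0, so   */
+    /* offset = 1 * 6 + 0 + 0 = 6 and the matrix starts at element 96.      */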
+
+ PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED();
+
+ ps_sps = ps_proc->ps_sps;
+ ps_pps = ps_proc->ps_pps;
+ ps_slice_hdr = ps_proc->ps_slice_hdr;
+ ps_codec = ps_proc->ps_codec;
+
+ pu1_y_dst_ctb = ps_proc->pu1_cur_ctb_luma;
+ pu1_uv_dst_ctb = ps_proc->pu1_cur_ctb_chroma;
+
+ pi2_ctb_coeff = ps_proc->pi2_invscan_out;
+
+ ctb_size = (1 << ps_sps->i1_log2_ctb_size);
+ pu1_tu_coeff_data = (UWORD8 *)ps_proc->pv_tu_coeff_data;
+
+ pic_strd = ps_codec->i4_strd;
+
+ pi2_tmp = ps_proc->pi2_itrans_intrmd_buf;
+
+ pi2_tu_coeff = pi2_ctb_coeff;
+
+ ps_tu = ps_proc->ps_tu;
+
+ if((1 == ps_sps->i1_scaling_list_enable_flag) && (1 == ps_pps->i1_pps_scaling_list_data_present_flag))
+ {
+ pi2_scaling_mat = ps_pps->pi2_scaling_mat;
+ }
+ else
+ {
+ pi2_scaling_mat = ps_sps->pi2_scaling_mat;
+ }
+
+ {
+ /* Updating the initial availability map */
+ WORD32 i;
+ UWORD8 u1_left_ctb_avail, u1_top_lt_ctb_avail, u1_top_rt_ctb_avail,
+ u1_top_ctb_avail;
+
+ u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
+ u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
+ u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
+ u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
+
+ /* Initializing the availability array */
+ memset(au4_intra_nbr_avail, 0,
+ (MAX_CTB_SIZE / MIN_TU_SIZE + 2) * sizeof(UWORD32));
+ /* Initializing the availability array with CTB level availability flags */
+ {
+ WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
+ WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+ for(i = 0; i < ctb_size_left / MIN_TU_SIZE; i++)
+ {
+ au4_intra_nbr_avail[i + 1] = ((UWORD32)u1_left_ctb_avail << 31);
+ }
+ }
+ au4_intra_nbr_avail[0] |= (((UWORD32)u1_top_rt_ctb_avail << 31)
+ >> (1 + ctb_size / MIN_TU_SIZE)); /* 1+ctb_size/4 position bit pos from msb */
+
+ au4_intra_nbr_avail[0] |= ((UWORD32)u1_top_lt_ctb_avail << 31);
+
+ {
+ WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+ WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
+ WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));
+
+ /* ctb_size_top gives number of valid pixels remaining in the current row */
+ /* Since we need pattern of 1's starting from the MSB, an additional shift */
+ /* is needed */
+ shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);
+
+ top_avail_bits = ((1 << (ctb_size_top / MIN_TU_SIZE)) - 1)
+ << shift;
+ }
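+        /* e.g. a fully-inside 64x64 CTB with MIN_TU_SIZE of 4 gives        */
+        /* shift = 31 - 16 = 15 and top_avail_bits = 0xFFFF << 15: the 16   */
+        /* top flags occupy bits 30..15, leaving bit 31 for top-left.       */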
+ au4_intra_nbr_avail[0] |= (
+ (u1_top_ctb_avail == 1) ? top_avail_bits : 0x0);
+ /* Starting from msb 2nd bit to (1+ctb_size/4) bit, set 1 if top avail,or 0 */
+
+ }
+
+ /* Applying Inverse transform on all the TU's in CTB */
+ for(tu_cnt = 0; tu_cnt < ps_proc->i4_ctb_tu_cnt; tu_cnt++, ps_tu++)
+ {
+ WORD32 transform_skip_flag = 0;
+ WORD32 transform_skip_flag_v = 0;
+ WORD32 num_comp, c_idx, func_idx;
+ WORD32 src_strd, pred_strd, dst_strd;
+ WORD32 qp_div = 0, qp_rem = 0;
+ WORD32 qp_div_v = 0, qp_rem_v = 0;
+ UWORD32 zero_cols = 0, zero_cols_v = 0;
+ UWORD32 zero_rows = 0, zero_rows_v = 0;
+ UWORD32 coeff_type = 0, coeff_type_v = 0;
+ WORD16 i2_coeff_value, i2_coeff_value_v;
+ WORD32 trans_size = 0;
+ TRANSFORM_TYPE e_trans_type;
+ WORD32 log2_y_trans_size_minus_2, log2_uv_trans_size_minus_2;
+ WORD32 log2_trans_size;
+ WORD32 chroma_qp_idx;
+ WORD16 *pi2_src = NULL, *pi2_src_v = NULL;
+ UWORD8 *pu1_pred = NULL, *pu1_pred_v = NULL;
+ UWORD8 *pu1_dst = NULL, *pu1_dst_v = NULL;
+ WORD16 *pi2_dequant_matrix = NULL, *pi2_dequant_matrix_v = NULL;
+ WORD32 tu_x, tu_y;
+ WORD32 tu_y_offset, tu_uv_offset;
+ WORD8 i1_chroma_pic_qp_offset, i1_chroma_slice_qp_offset;
+ UWORD8 u1_cbf = 0, u1_cbf_v = 0, u1_luma_pred_mode, u1_chroma_pred_mode;
+ WORD32 luma_nbr_flags_4x4[4];
+ WORD32 offset;
+ WORD32 pcm_flag;
+ WORD32 chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+ /* If 420SP_VU is chroma format, pred and dst pointer */
+ /* will be added +1 to point to U */
+ WORD32 chroma_yuv420sp_vu_u_offset = 1 * chroma_yuv420sp_vu;
+ /* If 420SP_VU is chroma format, pred and dst pointer */
+ /* will be added U offset of +1 and subtracted 2 */
+ /* to point to V */
+ WORD32 chroma_yuv420sp_vu_v_offset = -2 * chroma_yuv420sp_vu;
+
+ tu_x = ps_tu->b4_pos_x * 4; /* Converting minTU unit to pixel unit */
+ tu_y = ps_tu->b4_pos_y * 4; /* Converting minTU unit to pixel unit */
+ {
+ WORD32 tu_abs_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (tu_x);
+ WORD32 tu_abs_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (tu_y);
+
+ WORD32 numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+
+ pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
+ pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
+ pu1_pic_intra_flag += (tu_abs_x >> 6);
+
+ intra_flag = *pu1_pic_intra_flag;
+ intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
+ }
+
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n tu_x = %d", tu_x);
+ printf("\n tu_y = %d", tu_y);
+#endif
+
+ u1_luma_pred_mode = ps_tu->b6_luma_intra_mode;
+ u1_chroma_pred_mode = ps_tu->b3_chroma_intra_mode_idx;
+
+ if(u1_chroma_pred_mode != 7)
+ num_comp = 2; /* Y and UV */
+ else
+ num_comp = 1; /* Y */
+
+
+ pcm_flag = 0;
+
+ if((intra_flag) && (u1_luma_pred_mode == INTRA_PRED_NONE))
+ {
+ UWORD8 *pu1_buf;
+ UWORD8 *pu1_y_dst = pu1_y_dst_ctb;
+ UWORD8 *pu1_uv_dst = pu1_uv_dst_ctb;
+ WORD32 i, j;
+ tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+ WORD32 cb_size = 1 << (ps_tu->b3_size + 2);
+
+ /* trans_size is used to update availability after reconstruction */
+ trans_size = cb_size;
+
+ pcm_flag = 1;
+
+ tu_y_offset = tu_x + tu_y * pic_strd;
+ pu1_y_dst += tu_x + tu_y * pic_strd;
+ pu1_uv_dst += tu_x + (tu_y >> 1) * pic_strd;
+
+ /* First byte points to number of coded blocks */
+ pu1_tu_coeff_data++;
+
+ /* Next byte points to scan type */
+ pu1_tu_coeff_data++;
+
+ ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)pu1_tu_coeff_data;
+
+ pu1_buf = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
+ {
+
+ for(i = 0; i < cb_size; i++)
+ {
+ //pu1_y_dst[i * pic_strd + j] = *pu1_buf++;
+ memcpy(&pu1_y_dst[i * pic_strd], pu1_buf, cb_size);
+ pu1_buf += cb_size;
+ }
+
+ pu1_uv_dst = pu1_uv_dst + chroma_yuv420sp_vu_u_offset;
+
+ /* U */
+ for(i = 0; i < cb_size / 2; i++)
+ {
+ for(j = 0; j < cb_size / 2; j++)
+ {
+ pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
+ }
+ }
+
+ pu1_uv_dst = pu1_uv_dst + 1 + chroma_yuv420sp_vu_v_offset;
+
+ /* V */
+ for(i = 0; i < cb_size / 2; i++)
+ {
+ for(j = 0; j < cb_size / 2; j++)
+ {
+ pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
+ }
+ }
+ }
+
+ pu1_tu_coeff_data = pu1_buf;
+
+ }
+
+
+
+
+
+ for(c_idx = 0; c_idx < num_comp; c_idx++)
+ {
+ if(0 == pcm_flag)
+ {
+ /* Initializing variables */
+ pred_strd = pic_strd;
+ dst_strd = pic_strd;
+
+ if(c_idx == 0) /* Y */
+ {
+ log2_y_trans_size_minus_2 = ps_tu->b3_size;
+ trans_size = 1 << (log2_y_trans_size_minus_2 + 2);
+ log2_trans_size = log2_y_trans_size_minus_2 + 2;
+
+ tu_y_offset = tu_x + tu_y * pic_strd;
+
+ pi2_src = pi2_tu_coeff;
+ pu1_pred = pu1_y_dst_ctb + tu_y_offset;
+ pu1_dst = pu1_y_dst_ctb + tu_y_offset;
+
+ /* Calculating scaling matrix offset */
+ offset = log2_y_trans_size_minus_2 * 6
+ + (!intra_flag)
+ * ((log2_y_trans_size_minus_2
+ == 3) ? 1 : 3)
+ + c_idx;
+ pi2_dequant_matrix = pi2_scaling_mat
+ + scaling_mat_offset[offset];
+
+ src_strd = trans_size;
+
+ /* 4x4 transform Luma in INTRA mode is DST */
+ if(log2_y_trans_size_minus_2 == 0 && intra_flag)
+ {
+ func_idx = log2_y_trans_size_minus_2;
+ e_trans_type = DST_4x4;
+ }
+ else
+ {
+ func_idx = log2_y_trans_size_minus_2 + 1;
+ e_trans_type = (TRANSFORM_TYPE)(log2_y_trans_size_minus_2 + 1);
+ }
+
+ qp_div = ps_tu->b7_qp / 6;
+ qp_rem = ps_tu->b7_qp % 6;
+
+ u1_cbf = ps_tu->b1_y_cbf;
+
+ transform_skip_flag = pu1_tu_coeff_data[1] & 1;
+ /* Unpacking coeffs */
+ if(1 == u1_cbf)
+ {
+ pu1_tu_coeff_data = ihevcd_unpack_coeffs(
+ pi2_src, log2_y_trans_size_minus_2 + 2,
+ pu1_tu_coeff_data, pi2_dequant_matrix,
+ qp_rem, qp_div, e_trans_type,
+ ps_tu->b1_transquant_bypass, &zero_cols,
+ &zero_rows, &coeff_type,
+ &i2_coeff_value);
+ }
+
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\nLuma Coeff \n");
+ print_coeff(pi2_src, trans_size);
+#endif
+ }
+ else /* UV interleaved */
+ {
+ /* Chroma :If Transform size is 4x4, keep 4x4 else do transform on (trans_size/2 x trans_size/2) */
+ if(ps_tu->b3_size == 0)
+ {
+ /* Chroma 4x4 is present with 4th luma 4x4 block. For this case chroma postion has to be (luma pos x- 4,luma pos y- 4) */
+ log2_uv_trans_size_minus_2 = ps_tu->b3_size;
+ tu_uv_offset = (tu_x - 4) + ((tu_y - 4) / 2) * pic_strd;
+ }
+ else
+ {
+ log2_uv_trans_size_minus_2 = ps_tu->b3_size - 1;
+ tu_uv_offset = tu_x + (tu_y >> 1) * pic_strd;
+ }
+ trans_size = 1 << (log2_uv_trans_size_minus_2 + 2);
+ log2_trans_size = log2_uv_trans_size_minus_2 + 2;
+
+ pi2_src = pi2_tu_coeff;
+ pi2_src_v = pi2_tu_coeff + trans_size * trans_size;
+ pu1_pred = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
+ pu1_pred_v = pu1_pred + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
+ pu1_dst = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
+ pu1_dst_v = pu1_dst + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
+
+ /*TODO: Add support for choosing different tables for U and V,
+ * change this to a single array to handle flat/default/custom, intra/inter, luma/chroma and various sizes
+ */
+ /* Calculating scaling matrix offset */
+ /* ((log2_uv_trans_size_minus_2 == 3) ? 1:3) condition check is not needed, since
+ * max uv trans size is 16x16
+ */
+ offset = log2_uv_trans_size_minus_2 * 6
+ + (!intra_flag) * 3 + c_idx;
+ pi2_dequant_matrix = pi2_scaling_mat
+ + scaling_mat_offset[offset];
+ pi2_dequant_matrix_v = pi2_scaling_mat
+ + scaling_mat_offset[offset + 1];
+
+ src_strd = trans_size;
+
+ func_idx = 1 + 4 + log2_uv_trans_size_minus_2; /* DST func + Y funcs + cur func index*/
+ e_trans_type = (TRANSFORM_TYPE)(log2_uv_trans_size_minus_2 + 1);
+ /* QP for U */
+ i1_chroma_pic_qp_offset = ps_pps->i1_pic_cb_qp_offset;
+ i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cb_qp_offset;
+ u1_cbf = ps_tu->b1_cb_cbf;
+
+ chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
+ + i1_chroma_slice_qp_offset;
+ chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
+ qp_div = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
+ qp_rem = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
+
+ /* QP for V */
+ i1_chroma_pic_qp_offset = ps_pps->i1_pic_cr_qp_offset;
+ i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cr_qp_offset;
+ u1_cbf_v = ps_tu->b1_cr_cbf;
+
+ chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
+ + i1_chroma_slice_qp_offset;
+ chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
+ qp_div_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
+ qp_rem_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
+
+ /* Unpacking coeffs */
+ transform_skip_flag = pu1_tu_coeff_data[1] & 1;
+ if(1 == u1_cbf)
+ {
+ pu1_tu_coeff_data = ihevcd_unpack_coeffs(
+ pi2_src, log2_uv_trans_size_minus_2 + 2,
+ pu1_tu_coeff_data, pi2_dequant_matrix,
+ qp_rem, qp_div, e_trans_type,
+ ps_tu->b1_transquant_bypass, &zero_cols,
+ &zero_rows, &coeff_type,
+ &i2_coeff_value);
+ }
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\nChroma Coeff U \n");
+ print_coeff(pi2_src, trans_size);
+#endif
+
+ transform_skip_flag_v = pu1_tu_coeff_data[1] & 1;
+ if(1 == u1_cbf_v)
+ {
+ pu1_tu_coeff_data = ihevcd_unpack_coeffs(
+ pi2_src_v, log2_uv_trans_size_minus_2 + 2,
+ pu1_tu_coeff_data, pi2_dequant_matrix_v,
+ qp_rem_v, qp_div_v, e_trans_type,
+ ps_tu->b1_transquant_bypass, &zero_cols_v,
+ &zero_rows_v, &coeff_type_v, &i2_coeff_value_v);
+ }
+ }
+ /***************************************************************/
+ /****************** Intra Prediction **************************/
+ /***************************************************************/
+ if(intra_flag) /* Intra */
+ {
+ UWORD8 au1_ref_sub_out[(MAX_TU_SIZE * 2 * 2) + 4];
+ UWORD8 *pu1_top_left, *pu1_top, *pu1_left;
+ WORD32 luma_pred_func_idx, chroma_pred_func_idx;
+
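+                    /* Intra pipeline: derive neighbour availability, substitute
+                     * any unavailable reference samples, optionally smooth the
+                     * reference array, then run the selected prediction kernel. */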
+ /* Get the neighbour availability flags */
+ /* Done for only Y */
+ if(c_idx == 0)
+ {
+ /* Get neighbor availability for Y only */
+ luma_nbr_flags = ihevcd_get_intra_nbr_flag(ps_proc,
+ ps_tu,
+ au4_intra_nbr_avail,
+ ps_sps->i2_pic_width_in_luma_samples,
+ ps_pps->i1_constrained_intra_pred_flag,
+ trans_size,
+ ctb_size);
+
+ if(trans_size == 4)
+ luma_nbr_flags_4x4[(ps_tu->b4_pos_x % 2) + (ps_tu->b4_pos_y % 2) * 2] = luma_nbr_flags;
+
+ if((ps_tu->b4_pos_x % 2 == 0) && (ps_tu->b4_pos_y % 2 == 0))
+ {
+ chroma_nbr_flags = luma_nbr_flags;
+ }
+
+ /* Initializing nbr pointers */
+ pu1_top = pu1_pred - pic_strd;
+ pu1_left = pu1_pred - 1;
+ pu1_top_left = pu1_pred - pic_strd - 1;
+
+ /* call reference array substitution */
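+                        /* luma_nbr_flags packs 17 availability bits (1 top-left
+                         * + 4 each for top-right, top, left and bottom-left), so
+                         * 0x1ffff means all neighbours are available and the
+                         * faster substitution path can be taken. */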
+ if(luma_nbr_flags == 0x1ffff)
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr(
+ pu1_top_left,
+ pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, au1_ref_sub_out, 1);
+ else
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr(
+ pu1_top_left,
+ pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, au1_ref_sub_out, 1);
+
+ /* call reference filtering */
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr(
+ au1_ref_sub_out, trans_size,
+ au1_ref_sub_out,
+ u1_luma_pred_mode, ps_sps->i1_strong_intra_smoothing_enable_flag);
+
+ /* use the look up to get the function idx */
+ luma_pred_func_idx = g_i4_ip_funcs[u1_luma_pred_mode];
+
+ /* call the intra prediction function */
+ ps_codec->apf_intra_pred_luma[luma_pred_func_idx](au1_ref_sub_out, 1, pu1_pred, pred_strd, trans_size, u1_luma_pred_mode);
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n Luma Pred mode = %d, qp = %d\n", u1_luma_pred_mode, qp_div * 6 + qp_rem);
+ print_dst(pu1_pred, pred_strd, trans_size, 1);
+#endif
+ }
+ else
+ {
+ /* In case of yuv420sp_vu, prediction happens as usual. */
+ /* So point the pu1_pred pointer to original prediction pointer */
+ UWORD8 *pu1_pred_orig = pu1_pred - chroma_yuv420sp_vu_u_offset;
+
+ /* Top-Left | Top-Right | Top | Left | Bottom-Left
+ * 1 4 4 4 4
+ *
+ * Generating chroma_nbr_flags depending upon the transform size */
+ if(ps_tu->b3_size == 0)
+ {
+ /* Take TL,T,L flags of First luma 4x4 block */
+ chroma_nbr_flags = (luma_nbr_flags_4x4[0] & 0x10FF0);
+ /* Take TR flags of Second luma 4x4 block */
+ chroma_nbr_flags |= (luma_nbr_flags_4x4[1] & 0x0F000);
+ /* Take BL flags of Third luma 4x4 block */
+ chroma_nbr_flags |= (luma_nbr_flags_4x4[2] & 0x0000F);
+ }
+
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n Chroma nbr flags = %d", chroma_nbr_flags);
+#endif
+ /* Initializing nbr pointers */
+ pu1_top = pu1_pred_orig - pic_strd;
+ pu1_left = pu1_pred_orig - 2;
+ pu1_top_left = pu1_pred_orig - pic_strd - 2;
+
+ /* Chroma pred mode derivation from luma pred mode */
+ {
+ tu_t *ps_tu_tmp = ps_tu;
+ while(!ps_tu_tmp->b1_first_tu_in_cu)
+ {
+ ps_tu_tmp--;
+ }
+ u1_luma_pred_mode_first_tu = ps_tu_tmp->b6_luma_intra_mode;
+ }
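+                        /* Chroma mode 4 is the derived mode (reuse the luma mode
+                         * of the first TU in the CU); otherwise map through the
+                         * table, substituting angular 34 when the mapped mode
+                         * would duplicate the luma mode. */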
+ if(4 == u1_chroma_pred_mode)
+ u1_chroma_pred_mode = u1_luma_pred_mode_first_tu;
+ else
+ {
+ u1_chroma_pred_mode = gau1_intra_pred_chroma_modes[u1_chroma_pred_mode];
+
+ if(u1_chroma_pred_mode ==
+ u1_luma_pred_mode_first_tu)
+ {
+ u1_chroma_pred_mode = INTRA_ANGULAR(34);
+ }
+ }
+
+ /* call the chroma reference array substitution */
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr(
+ pu1_top_left,
+ pu1_top, pu1_left, pic_strd, trans_size, chroma_nbr_flags, au1_ref_sub_out, 1);
+
+ /* use the look up to get the function idx */
+ chroma_pred_func_idx =
+ g_i4_ip_funcs[u1_chroma_pred_mode];
+
+ /* call the intra prediction function */
+ ps_codec->apf_intra_pred_chroma[chroma_pred_func_idx](au1_ref_sub_out, 1, pu1_pred_orig, pred_strd, trans_size, u1_chroma_pred_mode);
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n Chroma U Pred mode = %d,qp = %d \n", u1_chroma_pred_mode, qp_div * 6 + qp_rem);
+ print_dst(pu1_pred_orig, pred_strd, trans_size, 0);
+#endif
+ }
+ }
+
+ /* Updating number of transform types */
+ STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx);
+
+            /* IQ, IT and Recon for Y if c_idx == 0, and U if c_idx != 0 */
+ if(1 == u1_cbf)
+ {
+ if(ps_tu->b1_transquant_bypass || transform_skip_flag)
+ {
+ /* Recon */
+ ps_codec->apf_recon[func_idx](pi2_src, pu1_pred, pu1_dst,
+ src_strd, pred_strd, dst_strd,
+ zero_cols);
+ }
+ else
+ {
+
+                        /* Updating coded number of transform types (excluding transform skip and transquant bypass) */
+ STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
+
+                        /* iQuant, iTrans and Recon */
+                        if(0 == coeff_type)
+ {
+ ps_codec->apf_itrans_recon[func_idx](pi2_src, pi2_tmp,
+ pu1_pred, pu1_dst,
+ src_strd, pred_strd,
+ dst_strd, zero_cols,
+ zero_rows);
+ }
+ else /* DC only */
+ {
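+                            /* With only the DC coefficient coded, the inverse
+                             * transform reduces to adding one constant to every
+                             * predicted sample; the dedicated DC kernels
+                             * exploit this. */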
+ STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
+ ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred, pu1_dst,
+ pred_strd, dst_strd,
+ log2_trans_size,
+ i2_coeff_value);
+ }
+ }
+ }
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n Recon data \n");
+ print_dst(pu1_dst, dst_strd, trans_size, !c_idx);
+#endif
+ /* IQ, IT and Recon for V */
+ if(c_idx != 0)
+ {
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\nChroma Coeff V \n");
+ print_coeff(pi2_src_v, trans_size);
+ printf("\n Chroma V Pred mode = %d,qp = %d \n",
+ u1_chroma_pred_mode, qp_div_v * 6 + qp_rem_v);
+ print_dst(pu1_pred + 1, dst_strd, trans_size, 0);
+#endif
+ if(1 == u1_cbf_v)
+ {
+ if(ps_tu->b1_transquant_bypass || transform_skip_flag_v)
+ {
+ /* Recon */
+ ps_codec->apf_recon[func_idx](pi2_src_v, pu1_pred_v,
+ pu1_dst_v, src_strd,
+ pred_strd, dst_strd,
+ zero_cols_v);
+ }
+ else
+ {
+                            /* Updating coded number of transform types (excluding transform skip and transquant bypass) */
+ STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
+
+                            /* iQuant, iTrans and Recon */
+                            if(0 == coeff_type_v)
+ {
+ ps_codec->apf_itrans_recon[func_idx](pi2_src_v,
+ pi2_tmp,
+ pu1_pred_v,
+ pu1_dst_v,
+ src_strd,
+ pred_strd,
+ dst_strd,
+ zero_cols_v,
+ zero_rows_v);
+ }
+ else /* DC only */
+ {
+ STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
+ ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred_v, pu1_dst_v,
+ pred_strd, dst_strd,
+ log2_trans_size,
+ i2_coeff_value_v);
+ }
+ }
+ }
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\n Recon data \n");
+ print_dst(pu1_dst + 1, dst_strd, trans_size, 0);
+#endif
+ }
+ }
+
+ /* Neighbor availability inside CTB */
+        /* 1 bit per 4x4. Indicates whether that 4x4 block has been reconstructed (available) */
+ /* Used for neighbor availability in intra pred */
+ if(c_idx == 0)
+ {
+ WORD32 i;
+ WORD32 trans_in_min_tu;
+ UWORD32 cur_tu_in_bits;
+ UWORD32 cur_tu_avail_flag;
+
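+        /* Build a run of (trans_size / MIN_TU_SIZE) ones, align it to the TU's
+         * 4x4 column and OR it into every 4x4 row the TU covers; the arrays
+         * appear to keep a one-unit border for the top/left neighbours, hence
+         * the +1 offsets. */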
+ trans_in_min_tu = trans_size / MIN_TU_SIZE;
+ cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
+ cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);
+
+ cur_tu_avail_flag = cur_tu_in_bits >> (ps_tu->b4_pos_x + 1);
+
+ for(i = 0; i < trans_in_min_tu; i++)
+ au4_intra_nbr_avail[1 + ps_tu->b4_pos_y + i] |=
+ cur_tu_avail_flag;
+ }
+ }
+ }
+ ps_proc->pv_tu_coeff_data = pu1_tu_coeff_data;
+
+ return ps_proc->i4_ctb_tu_cnt;
+}
+
diff --git a/decoder/ihevcd_iquant_itrans_recon_ctb.h b/decoder/ihevcd_iquant_itrans_recon_ctb.h
new file mode 100644
index 0000000..fde647f
--- /dev/null
+++ b/decoder/ihevcd_iquant_itrans_recon_ctb.h
@@ -0,0 +1,67 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_iquant_itrans_recon_ctb.h
+ *
+ * @brief
+ * Definitions related to inverse transform functions
+ *
+ * @author
+ * Naveen S R
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_IQUANT_ITRANS_RECON_CTB_H_
+#define _IHEVCD_IQUANT_ITRANS_RECON_CTB_H_
+
+#define MAX_NUM_IP_MODES 35
+
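+/* Groups the 35 intra prediction modes by the kernel that implements them;
+ * angular mode ranges such as 3-9 share a single implementation. */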
+typedef enum
+{
+ IP_FUNC_MODE_0 = 1,
+ IP_FUNC_MODE_1,
+ IP_FUNC_MODE_2,
+ IP_FUNC_MODE_3TO9,
+ IP_FUNC_MODE_10,
+ IP_FUNC_MODE_11TO17,
+ IP_FUNC_MODE_18_34,
+ IP_FUNC_MODE_19TO25,
+ IP_FUNC_MODE_26,
+ IP_FUNC_MODE_27TO33,
+
+ NUM_IP_FUNCS
+
+}IP_FUNCS_T;
+
+
+typedef enum
+{
+ DST_4x4, DCT_4x4, DCT_8x8, DCT_16x16, DCT_32x32, SKIP_64x64
+}TRANSFORM_TYPE;
+
+WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc);
+
+#endif /* _IHEVCD_IQUANT_ITRANS_RECON_CTB_H_ */
diff --git a/decoder/ihevcd_itrans_recon_dc.c b/decoder/ihevcd_itrans_recon_dc.c
new file mode 100644
index 0000000..ae37e40
--- /dev/null
+++ b/decoder/ihevcd_itrans_recon_dc.c
@@ -0,0 +1,146 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_itrans_recon_dc.c
+ *
+ * @brief
+ * Contains functions for DC inverse transform and reconstruction
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_itrans_recon_dc_luma()
+ * - ihevcd_itrans_recon_dc_chroma()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_bitstream.h"
+#include "ihevc_common_tables.h"
+
+/* Intra pred includes */
+#include "ihevc_intra_pred.h"
+
+/* Inverse transform common module includes */
+#include "ihevc_trans_tables.h"
+#include "ihevc_trans_macros.h"
+#include "ihevc_itrans_recon.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_itrans_recon.h"
+#include "ihevc_chroma_recon.h"
+
+/* Decoder includes */
+#include "ihevcd_common_tables.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_statistics.h"
+#include "ihevcd_itrans_recon_dc.h"
+
+
+
+void ihevcd_itrans_recon_dc_luma(UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 log2_trans_size,
+ WORD16 i2_coeff_value)
+{
+ WORD32 row, col;
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
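+    /* Mirror the two rounding stages of the full inverse transform: each
+     * stage multiplies by the DC basis value 64 and rounds, so the DC-only
+     * result matches the generic inverse-transform path bit-exactly. */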
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ for(row = 0; row < trans_size; row++)
+ for(col = 0; col < trans_size; col++)
+ pu1_dst[row * dst_strd + col] = CLIP_U8((pu1_pred[row * pred_strd + col] + dc_value));
+
+}
+
+
+void ihevcd_itrans_recon_dc_chroma(UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 log2_trans_size,
+ WORD16 i2_coeff_value)
+{
+ WORD32 row, col;
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
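+    /* Semi-planar chroma is UV interleaved, so step the column index by 2
+     * to touch only the samples of the current plane. */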
+ for(row = 0; row < trans_size; row++)
+ for(col = 0; col < trans_size; col++)
+ pu1_dst[row * dst_strd + (col << 1)] = CLIP_U8((pu1_pred[row * pred_strd + (col << 1)] + dc_value));
+
+}
+
+
diff --git a/decoder/ihevcd_itrans_recon_dc.h b/decoder/ihevcd_itrans_recon_dc.h
new file mode 100644
index 0000000..0e64a9e
--- /dev/null
+++ b/decoder/ihevcd_itrans_recon_dc.h
@@ -0,0 +1,77 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_itrans_recon_dc.h
+*
+* @brief
+* Header for itrans recon dc functions
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_ITRANS_RECON_DC_H_
+#define _IHEVCD_ITRANS_RECON_DC_H_
+
+typedef void ihevcd_itrans_recon_dc_luma_ft(UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 log2_trans_size,
+ WORD16 i2_coeff_value);
+typedef void ihevcd_itrans_recon_dc_chroma_ft(UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 log2_trans_size,
+ WORD16 i2_coeff_value);
+
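+/* The per-architecture variants below share these signatures; one set is
+ * bound into the codec's function-pointer tables at init time. */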
+/* C function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma;
+
+/* A9Q function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_a9q;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_a9q;
+
+/* A9A function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_a9a;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_a9a;
+
+/* SSSE3 function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_ssse3;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_ssse3;
+
+/* SSE4.2 function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_sse42;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_sse42;
+
+/* armv8 function declarations */
+ihevcd_itrans_recon_dc_luma_ft ihevcd_itrans_recon_dc_luma_av8;
+ihevcd_itrans_recon_dc_chroma_ft ihevcd_itrans_recon_dc_chroma_av8;
+
+#endif /* _IHEVCD_ITRANS_RECON_DC_H_ */
diff --git a/decoder/ihevcd_ittiam_logo.c b/decoder/ihevcd_ittiam_logo.c
new file mode 100644
index 0000000..269585b
--- /dev/null
+++ b/decoder/ihevcd_ittiam_logo.c
@@ -0,0 +1,4636 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/* */
+/* File Name : ihevcd_ittiam_logo.c */
+/* */
+/* Description : This file contains all the necessary tables for */
+/*                          inserting the Ittiam logo into a YUV buffer      */
+/* */
+/* List of Functions : memcpy_2d */
+/* insert_logo */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 10 10 2005 Ittiam Draft */
+/* */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+
+/* User include files */
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ihevcd_ittiam_logo.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include <string.h>
+
+#ifdef LOGO_EN
+#define CODEC_LOGO 0
+
+const UWORD8 gau1_ihevcd_codec_logo_y[] =
+{
+ 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xD9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0xC1,
+ 0xC1, 0xC1, 0xC1, 0xD2, 0xFF, 0xFF, 0xFF, 0xFF, 0xEA, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
+ 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xF5, 0xE2, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xD0, 0xFE, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xCF, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xE4, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xEB, 0xCC, 0xB3, 0xA5, 0x9E, 0xA1, 0xAD, 0xC1, 0xDD, 0xF7,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xEC, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF3, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF3, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0xA1, 0xF1, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xEC, 0xB9, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xA5,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFE, 0xB6, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD4, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xD0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBA, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFE, 0xD9, 0xA2, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xDC, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB1, 0xFD, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC, 0xAD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xE2, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFE, 0xCE, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xF8, 0xA8, 0x9C, 0x9C, 0x9C, 0x9C, 0x9E, 0xEA, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE7, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0xAB, 0xFC, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xD5, 0x9E, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBD, 0xC9,
+ 0xC9, 0xC9, 0xC9, 0xC9, 0xC9, 0xF6, 0xFF, 0xFF, 0xC8, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC7, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC2, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xCF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xEC, 0xA3, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA1, 0xA4, 0xA4, 0xA1, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEC, 0x9E, 0x9C, 0x9C, 0x9C, 0x9C, 0xA8, 0xF9,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xA5, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xBA, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xD2, 0xF0, 0xFC, 0xFC, 0xF0, 0xD3, 0xA7, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB6, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xDE,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xDA, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBD, 0xFF, 0xFF, 0xFF, 0xFF, 0xEE,
+ 0xA1, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB1, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF6, 0xB6,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDC, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBA,
+ 0xFF, 0xFF, 0xFF, 0xFE, 0xB6, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xFF, 0xFF, 0xFF, 0xFF, 0xD2,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0xB3, 0x9E, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xAC, 0xB3,
+ 0xB3, 0xB3, 0xB3, 0xB3, 0xC1, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xA7, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2,
+ 0xF2, 0xFF, 0xFF, 0xEF, 0xA1, 0x9C, 0x9C, 0x9C, 0x9C, 0xAD, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xBA,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xCC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC8, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0xD3, 0xFF, 0xFF, 0xCC, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAB,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xEA, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEF, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0xAF, 0xFE, 0xFC, 0xAA, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB4, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9D, 0xEA, 0xE3, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0xBF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA4,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0xAF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDB, 0x9D, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0xC6, 0xBF, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xAB,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xEB, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBF, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xFA, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC6, 0xD7,
+ 0xD7, 0xD7, 0xD7, 0xD7, 0xDE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFA, 0xA5, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0xA4, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0xAD, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBA,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xCD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD4, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xD0,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xF2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEB, 0xA0, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA2, 0xF7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEE,
+ 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB1, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xB6,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xE2, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB4, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xBA, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xD3, 0xF0, 0xFC, 0xFD, 0xF1, 0xD4, 0xA7, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xBD, 0xCB,
+ 0xCB, 0xCB, 0xCB, 0xCB, 0xCB, 0xF6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDB, 0x9D, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9E, 0xE7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xEB, 0xA3, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xA1, 0xA5, 0xA5, 0xA1, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF7, 0xA7, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xB0, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xD4, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC6, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xD6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFE, 0xCD, 0x9D, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xEB, 0xA0,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0xA5, 0xF6, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFE, 0xD6, 0xA2, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xA5, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0xB7, 0xFF, 0xFF, 0xFF, 0xFF, 0xDD, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xB4,
+ 0x9C, 0x9C, 0x9C, 0x9C, 0xC4, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xEC, 0xB8, 0xA0, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9C, 0x9D, 0xA4,
+ 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xD7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC6, 0xC1,
+ 0xC1, 0xC1, 0xC1, 0xD2, 0xFF, 0xFF, 0xFF, 0xFF, 0xEA, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xC1,
+ 0xC1, 0xC1, 0xC1, 0xC1, 0xC1, 0xF5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE3,
+ 0xC1, 0xC1, 0xC1, 0xC1, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xEA, 0xCB, 0xB2, 0xA3, 0x9E, 0xA1, 0xAC, 0xC0, 0xDC, 0xF7,
+
+};
+const UWORD8 gau1_ihevcd_codec_logo_420p_u[] =
+{
+ 0x4D, 0x4D, 0x56, 0x80, 0x80, 0x80, 0x80, 0x4F, 0x4D, 0x53, 0x80, 0x80, 0x5E, 0x4D, 0x4D, 0x4D,
+ 0x4D, 0x4D, 0x62, 0x5F, 0x4D, 0x4D, 0x68, 0x80, 0x80, 0x80, 0x80, 0x67, 0x4D, 0x4D, 0x61, 0x80,
+ 0x80, 0x81, 0x70, 0x56, 0x46, 0x42, 0x4A, 0x5B, 0x40, 0x40, 0x4B, 0x80, 0x80, 0x80, 0x80, 0x42,
+ 0x40, 0x48, 0x80, 0x80, 0x55, 0x40, 0x40, 0x3E, 0x3E, 0x3E, 0x59, 0x6D, 0x3F, 0x40, 0x4C, 0x7F,
+ 0x80, 0x80, 0x7F, 0x4B, 0x40, 0x3F, 0x70, 0x80, 0x80, 0x62, 0x42, 0x3F, 0x40, 0x40, 0x3F, 0x3F,
+ 0x41, 0x41, 0x4C, 0x80, 0x80, 0x80, 0x80, 0x44, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x4F,
+ 0x51, 0x51, 0x65, 0x7F, 0x49, 0x41, 0x41, 0x73, 0x80, 0x80, 0x72, 0x41, 0x41, 0x4B, 0x7F, 0x80,
+ 0x67, 0x40, 0x40, 0x3F, 0x43, 0x47, 0x43, 0x3F, 0x41, 0x41, 0x4D, 0x82, 0x82, 0x82, 0x82, 0x44,
+ 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x78, 0x82, 0x82, 0x80, 0x80, 0x61, 0x41, 0x41, 0x5C,
+ 0x80, 0x80, 0x5A, 0x41, 0x41, 0x64, 0x80, 0x7D, 0x46, 0x41, 0x40, 0x54, 0x73, 0x7B, 0x73, 0x56,
+ 0x41, 0x41, 0x48, 0x66, 0x66, 0x66, 0x66, 0x42, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x61,
+ 0x67, 0x67, 0x76, 0x80, 0x79, 0x42, 0x41, 0x46, 0x7E, 0x7D, 0x45, 0x41, 0x43, 0x7B, 0x80, 0x6D,
+ 0x40, 0x41, 0x4A, 0x7E, 0x81, 0x80, 0x81, 0x7F, 0x41, 0x41, 0x41, 0x3E, 0x3E, 0x3E, 0x3E, 0x41,
+ 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x3F, 0x3F, 0x3F, 0x66, 0x80, 0x80, 0x54, 0x41, 0x40,
+ 0x6C, 0x6A, 0x41, 0x41, 0x58, 0x80, 0x80, 0x64, 0x41, 0x41, 0x5C, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x41, 0x41, 0x42, 0x45, 0x45, 0x45, 0x45, 0x41, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x42,
+ 0x42, 0x42, 0x67, 0x80, 0x80, 0x6F, 0x40, 0x41, 0x54, 0x51, 0x41, 0x41, 0x72, 0x80, 0x80, 0x63,
+ 0x41, 0x41, 0x5C, 0x80, 0x80, 0x80, 0x80, 0x80, 0x41, 0x41, 0x4B, 0x7A, 0x7A, 0x7A, 0x7A, 0x43,
+ 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x6A, 0x71, 0x71, 0x7A, 0x80, 0x80, 0x7F, 0x49, 0x41,
+ 0x42, 0x41, 0x41, 0x4B, 0x80, 0x80, 0x80, 0x6D, 0x40, 0x41, 0x4A, 0x7E, 0x81, 0x80, 0x81, 0x7F,
+ 0x41, 0x41, 0x4C, 0x81, 0x81, 0x81, 0x81, 0x44, 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x78,
+ 0x81, 0x81, 0x80, 0x80, 0x80, 0x80, 0x61, 0x41, 0x41, 0x41, 0x41, 0x66, 0x80, 0x80, 0x80, 0x7D,
+ 0x46, 0x41, 0x40, 0x54, 0x73, 0x7B, 0x74, 0x56, 0x41, 0x41, 0x4C, 0x80, 0x80, 0x80, 0x80, 0x44,
+ 0x41, 0x49, 0x80, 0x80, 0x56, 0x41, 0x41, 0x4F, 0x51, 0x51, 0x65, 0x80, 0x80, 0x80, 0x79, 0x42,
+ 0x41, 0x41, 0x44, 0x7B, 0x80, 0x80, 0x80, 0x80, 0x67, 0x40, 0x40, 0x3F, 0x43, 0x47, 0x43, 0x3F,
+ 0x40, 0x40, 0x4B, 0x80, 0x80, 0x80, 0x80, 0x42, 0x40, 0x48, 0x80, 0x80, 0x55, 0x40, 0x40, 0x3E,
+ 0x3E, 0x3E, 0x59, 0x80, 0x80, 0x80, 0x80, 0x55, 0x40, 0x40, 0x59, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x61, 0x42, 0x3F, 0x40, 0x40, 0x3F, 0x3F, 0x4D, 0x4D, 0x56, 0x80, 0x80, 0x80, 0x80, 0x4F,
+ 0x4D, 0x54, 0x80, 0x80, 0x5E, 0x4D, 0x4D, 0x4D, 0x4D, 0x4D, 0x62, 0x80, 0x80, 0x80, 0x80, 0x70,
+ 0x4D, 0x4D, 0x74, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x70, 0x55, 0x46, 0x42, 0x4A, 0x5B,
+};
+
+const UWORD8 gau1_ihevcd_codec_logo_420p_v[] =
+{
+ 0xB9, 0xB9, 0xAE, 0x80, 0x80, 0x80, 0x80, 0xB6, 0xB9, 0xB1, 0x80, 0x80, 0xA6, 0xB9, 0xB9, 0xB9,
+ 0xB9, 0xB9, 0xA1, 0xA4, 0xB9, 0xB9, 0x9B, 0x7F, 0x80, 0x80, 0x7F, 0x9C, 0xB9, 0xB9, 0xA3, 0x80,
+ 0x7F, 0x7E, 0x92, 0xAF, 0xC1, 0xC5, 0xBD, 0xA9, 0xC8, 0xC8, 0xBB, 0x80, 0x80, 0x80, 0x80, 0xC4,
+ 0xC8, 0xBD, 0x80, 0x80, 0xAF, 0xC8, 0xC8, 0xCA, 0xCA, 0xCA, 0xAB, 0x94, 0xC8, 0xC8, 0xB9, 0x80,
+ 0x80, 0x80, 0x80, 0xBA, 0xC8, 0xC8, 0x92, 0x80, 0x7F, 0xA1, 0xC4, 0xC9, 0xC8, 0xC7, 0xC8, 0xC9,
+ 0xC6, 0xC6, 0xBA, 0x80, 0x80, 0x80, 0x80, 0xC3, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xB6,
+ 0xB4, 0xB4, 0x9E, 0x80, 0xBC, 0xC6, 0xC6, 0x8D, 0x80, 0x80, 0x8E, 0xC6, 0xC6, 0xBA, 0x80, 0x7F,
+ 0x9B, 0xC7, 0xC7, 0xC8, 0xC3, 0xBF, 0xC3, 0xC8, 0xC6, 0xC6, 0xB9, 0x7C, 0x7C, 0x7C, 0x7C, 0xC2,
+ 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0x88, 0x7D, 0x7D, 0x7F, 0x7F, 0xA1, 0xC6, 0xC6, 0xA8,
+ 0x7F, 0x7F, 0xAA, 0xC6, 0xC6, 0x9E, 0x7F, 0x83, 0xC0, 0xC6, 0xC7, 0xB0, 0x8D, 0x84, 0x8D, 0xAF,
+ 0xC6, 0xC6, 0xBE, 0x9B, 0x9B, 0x9B, 0x9B, 0xC4, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xA1,
+ 0x9B, 0x9B, 0x8A, 0x80, 0x87, 0xC5, 0xC6, 0xBF, 0x82, 0x82, 0xC1, 0xC6, 0xC2, 0x85, 0x80, 0x94,
+ 0xC7, 0xC6, 0xBC, 0x82, 0x7E, 0x7F, 0x7E, 0x80, 0xC6, 0xC6, 0xC6, 0xC9, 0xC9, 0xC9, 0xC9, 0xC6,
+ 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xC8, 0xC9, 0xC9, 0x9D, 0x80, 0x7F, 0xB0, 0xC6, 0xC7,
+ 0x96, 0x98, 0xC6, 0xC6, 0xAC, 0x7F, 0x80, 0x9F, 0xC6, 0xC6, 0xA7, 0x7F, 0x80, 0x80, 0x80, 0x7F,
+ 0xC6, 0xC6, 0xC5, 0xC2, 0xC2, 0xC2, 0xC2, 0xC6, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xC5,
+ 0xC5, 0xC5, 0x9C, 0x80, 0x80, 0x93, 0xC7, 0xC6, 0xB1, 0xB4, 0xC6, 0xC7, 0x8F, 0x80, 0x80, 0x9F,
+ 0xC6, 0xC6, 0xA7, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0xC6, 0xC6, 0xBA, 0x86, 0x86, 0x86, 0x86, 0xC3,
+ 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0x98, 0x90, 0x90, 0x86, 0x80, 0x80, 0x80, 0xBD, 0xC6,
+ 0xC5, 0xC6, 0xC6, 0xB9, 0x7F, 0x80, 0x80, 0x95, 0xC7, 0xC6, 0xBC, 0x82, 0x7E, 0x7F, 0x7E, 0x80,
+ 0xC6, 0xC6, 0xB9, 0x7E, 0x7E, 0x7E, 0x7E, 0xC3, 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0x88,
+ 0x7E, 0x7E, 0x7F, 0x80, 0x80, 0x7F, 0xA2, 0xC6, 0xC6, 0xC6, 0xC6, 0x9C, 0x7F, 0x80, 0x80, 0x83,
+ 0xC0, 0xC6, 0xC7, 0xB0, 0x8D, 0x84, 0x8D, 0xAE, 0xC6, 0xC6, 0xBA, 0x80, 0x80, 0x80, 0x80, 0xC3,
+ 0xC6, 0xBC, 0x80, 0x80, 0xAE, 0xC6, 0xC6, 0xB6, 0xB3, 0xB3, 0x9D, 0x80, 0x80, 0x80, 0x87, 0xC5,
+ 0xC6, 0xC6, 0xC2, 0x84, 0x80, 0x80, 0x80, 0x7F, 0x9B, 0xC7, 0xC7, 0xC8, 0xC3, 0xBF, 0xC3, 0xC8,
+ 0xC8, 0xC8, 0xBB, 0x80, 0x80, 0x80, 0x80, 0xC4, 0xC8, 0xBD, 0x80, 0x80, 0xAF, 0xC8, 0xC8, 0xCA,
+ 0xCA, 0xCA, 0xAB, 0x80, 0x80, 0x80, 0x7F, 0xB1, 0xC8, 0xC8, 0xAB, 0x7F, 0x80, 0x80, 0x80, 0x80,
+ 0x7F, 0xA1, 0xC4, 0xC8, 0xC8, 0xC7, 0xC8, 0xC9, 0xB9, 0xB9, 0xAF, 0x80, 0x80, 0x80, 0x80, 0xB7,
+ 0xB9, 0xB2, 0x80, 0x80, 0xA6, 0xB9, 0xB9, 0xB9, 0xB9, 0xB9, 0xA2, 0x80, 0x80, 0x80, 0x80, 0x93,
+ 0xB9, 0xB9, 0x8E, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x92, 0xB0, 0xC1, 0xC5, 0xBD, 0xAA,
+};
+
+const UWORD8 gau1_ihevcd_codec_logo_420sp_uv[] =
+{
+ 0x4D, 0xB9, 0x4D, 0xB9, 0x56, 0xAE, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x4F, 0xB6,
+ 0x4D, 0xB9, 0x53, 0xB1, 0x80, 0x80, 0x80, 0x80, 0x5E, 0xA6, 0x4D, 0xB9, 0x4D, 0xB9, 0x4D, 0xB9,
+ 0x4D, 0xB9, 0x4D, 0xB9, 0x62, 0xA1, 0x5F, 0xA4, 0x4D, 0xB9, 0x4D, 0xB9, 0x68, 0x9B, 0x80, 0x7F,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x67, 0x9C, 0x4D, 0xB9, 0x4D, 0xB9, 0x61, 0xA3, 0x80, 0x80,
+ 0x80, 0x7F, 0x81, 0x7E, 0x70, 0x92, 0x56, 0xAF, 0x46, 0xC1, 0x42, 0xC5, 0x4A, 0xBD, 0x5B, 0xA9,
+ 0x40, 0xC8, 0x40, 0xC8, 0x4B, 0xBB, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x42, 0xC4,
+ 0x40, 0xC8, 0x48, 0xBD, 0x80, 0x80, 0x80, 0x80, 0x55, 0xAF, 0x40, 0xC8, 0x40, 0xC8, 0x3E, 0xCA,
+ 0x3E, 0xCA, 0x3E, 0xCA, 0x59, 0xAB, 0x6D, 0x94, 0x3F, 0xC8, 0x40, 0xC8, 0x4C, 0xB9, 0x7F, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x4B, 0xBA, 0x40, 0xC8, 0x3F, 0xC8, 0x70, 0x92, 0x80, 0x80,
+ 0x80, 0x7F, 0x62, 0xA1, 0x42, 0xC4, 0x3F, 0xC9, 0x40, 0xC8, 0x40, 0xC7, 0x3F, 0xC8, 0x3F, 0xC9,
+ 0x41, 0xC6, 0x41, 0xC6, 0x4C, 0xBA, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x44, 0xC3,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x4F, 0xB6,
+ 0x51, 0xB4, 0x51, 0xB4, 0x65, 0x9E, 0x7F, 0x80, 0x49, 0xBC, 0x41, 0xC6, 0x41, 0xC6, 0x73, 0x8D,
+ 0x80, 0x80, 0x80, 0x80, 0x72, 0x8E, 0x41, 0xC6, 0x41, 0xC6, 0x4B, 0xBA, 0x7F, 0x80, 0x80, 0x7F,
+ 0x67, 0x9B, 0x40, 0xC7, 0x40, 0xC7, 0x3F, 0xC8, 0x43, 0xC3, 0x47, 0xBF, 0x43, 0xC3, 0x3F, 0xC8,
+ 0x41, 0xC6, 0x41, 0xC6, 0x4D, 0xB9, 0x82, 0x7C, 0x82, 0x7C, 0x82, 0x7C, 0x82, 0x7C, 0x44, 0xC2,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x78, 0x88,
+ 0x82, 0x7D, 0x82, 0x7D, 0x80, 0x7F, 0x80, 0x7F, 0x61, 0xA1, 0x41, 0xC6, 0x41, 0xC6, 0x5C, 0xA8,
+ 0x80, 0x7F, 0x80, 0x7F, 0x5A, 0xAA, 0x41, 0xC6, 0x41, 0xC6, 0x64, 0x9E, 0x80, 0x7F, 0x7D, 0x83,
+ 0x46, 0xC0, 0x41, 0xC6, 0x40, 0xC7, 0x54, 0xB0, 0x73, 0x8D, 0x7B, 0x84, 0x73, 0x8D, 0x56, 0xAF,
+ 0x41, 0xC6, 0x41, 0xC6, 0x48, 0xBE, 0x66, 0x9B, 0x66, 0x9B, 0x66, 0x9B, 0x66, 0x9B, 0x42, 0xC4,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x61, 0xA1,
+ 0x67, 0x9B, 0x67, 0x9B, 0x76, 0x8A, 0x80, 0x80, 0x79, 0x87, 0x42, 0xC5, 0x41, 0xC6, 0x46, 0xBF,
+ 0x7E, 0x82, 0x7D, 0x82, 0x45, 0xC1, 0x41, 0xC6, 0x43, 0xC2, 0x7B, 0x85, 0x80, 0x80, 0x6D, 0x94,
+ 0x40, 0xC7, 0x41, 0xC6, 0x4A, 0xBC, 0x7E, 0x82, 0x81, 0x7E, 0x80, 0x7F, 0x81, 0x7E, 0x7F, 0x80,
+ 0x41, 0xC6, 0x41, 0xC6, 0x41, 0xC6, 0x3E, 0xC9, 0x3E, 0xC9, 0x3E, 0xC9, 0x3E, 0xC9, 0x41, 0xC6,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x3F, 0xC8,
+ 0x3F, 0xC9, 0x3F, 0xC9, 0x66, 0x9D, 0x80, 0x80, 0x80, 0x7F, 0x54, 0xB0, 0x41, 0xC6, 0x40, 0xC7,
+ 0x6C, 0x96, 0x6A, 0x98, 0x41, 0xC6, 0x41, 0xC6, 0x58, 0xAC, 0x80, 0x7F, 0x80, 0x80, 0x64, 0x9F,
+ 0x41, 0xC6, 0x41, 0xC6, 0x5C, 0xA7, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F,
+ 0x41, 0xC6, 0x41, 0xC6, 0x42, 0xC5, 0x45, 0xC2, 0x45, 0xC2, 0x45, 0xC2, 0x45, 0xC2, 0x41, 0xC6,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x42, 0xC5,
+ 0x42, 0xC5, 0x42, 0xC5, 0x67, 0x9C, 0x80, 0x80, 0x80, 0x80, 0x6F, 0x93, 0x40, 0xC7, 0x41, 0xC6,
+ 0x54, 0xB1, 0x51, 0xB4, 0x41, 0xC6, 0x41, 0xC7, 0x72, 0x8F, 0x80, 0x80, 0x80, 0x80, 0x63, 0x9F,
+ 0x41, 0xC6, 0x41, 0xC6, 0x5C, 0xA7, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F,
+ 0x41, 0xC6, 0x41, 0xC6, 0x4B, 0xBA, 0x7A, 0x86, 0x7A, 0x86, 0x7A, 0x86, 0x7A, 0x86, 0x43, 0xC3,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x6A, 0x98,
+ 0x71, 0x90, 0x71, 0x90, 0x7A, 0x86, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x49, 0xBD, 0x41, 0xC6,
+ 0x42, 0xC5, 0x41, 0xC6, 0x41, 0xC6, 0x4B, 0xB9, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x6D, 0x95,
+ 0x40, 0xC7, 0x41, 0xC6, 0x4A, 0xBC, 0x7E, 0x82, 0x81, 0x7E, 0x80, 0x7F, 0x81, 0x7E, 0x7F, 0x80,
+ 0x41, 0xC6, 0x41, 0xC6, 0x4C, 0xB9, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7E, 0x44, 0xC3,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x78, 0x88,
+ 0x81, 0x7E, 0x81, 0x7E, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x61, 0xA2, 0x41, 0xC6,
+ 0x41, 0xC6, 0x41, 0xC6, 0x41, 0xC6, 0x66, 0x9C, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7D, 0x83,
+ 0x46, 0xC0, 0x41, 0xC6, 0x40, 0xC7, 0x54, 0xB0, 0x73, 0x8D, 0x7B, 0x84, 0x74, 0x8D, 0x56, 0xAE,
+ 0x41, 0xC6, 0x41, 0xC6, 0x4C, 0xBA, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x44, 0xC3,
+ 0x41, 0xC6, 0x49, 0xBC, 0x80, 0x80, 0x80, 0x80, 0x56, 0xAE, 0x41, 0xC6, 0x41, 0xC6, 0x4F, 0xB6,
+ 0x51, 0xB3, 0x51, 0xB3, 0x65, 0x9D, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x79, 0x87, 0x42, 0xC5,
+ 0x41, 0xC6, 0x41, 0xC6, 0x44, 0xC2, 0x7B, 0x84, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F,
+ 0x67, 0x9B, 0x40, 0xC7, 0x40, 0xC7, 0x3F, 0xC8, 0x43, 0xC3, 0x47, 0xBF, 0x43, 0xC3, 0x3F, 0xC8,
+ 0x40, 0xC8, 0x40, 0xC8, 0x4B, 0xBB, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x42, 0xC4,
+ 0x40, 0xC8, 0x48, 0xBD, 0x80, 0x80, 0x80, 0x80, 0x55, 0xAF, 0x40, 0xC8, 0x40, 0xC8, 0x3E, 0xCA,
+ 0x3E, 0xCA, 0x3E, 0xCA, 0x59, 0xAB, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x55, 0xB1,
+ 0x40, 0xC8, 0x40, 0xC8, 0x59, 0xAB, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x7F, 0x61, 0xA1, 0x42, 0xC4, 0x3F, 0xC8, 0x40, 0xC8, 0x40, 0xC7, 0x3F, 0xC8, 0x3F, 0xC9,
+ 0x4D, 0xB9, 0x4D, 0xB9, 0x56, 0xAF, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x4F, 0xB7,
+ 0x4D, 0xB9, 0x54, 0xB2, 0x80, 0x80, 0x80, 0x80, 0x5E, 0xA6, 0x4D, 0xB9, 0x4D, 0xB9, 0x4D, 0xB9,
+ 0x4D, 0xB9, 0x4D, 0xB9, 0x62, 0xA2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x70, 0x93,
+ 0x4D, 0xB9, 0x4D, 0xB9, 0x74, 0x8E, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x81, 0x7F, 0x70, 0x92, 0x55, 0xB0, 0x46, 0xC1, 0x42, 0xC5, 0x4A, 0xBD, 0x5B, 0xAA,
+};
+const UWORD8 gau1_ihevcd_codec_logo_420sp_vu[] =
+{
+ 0XB9, 0X4D, 0XB9, 0X4D, 0XAE, 0X56, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0XB6, 0X4F,
+ 0XB9, 0X4D, 0XB1, 0X53, 0X80, 0X80, 0X80, 0X80, 0XA6, 0X5E, 0XB9, 0X4D, 0XB9, 0X4D, 0XB9, 0X4D,
+ 0XB9, 0X4D, 0XB9, 0X4D, 0XA1, 0X62, 0XA4, 0X5F, 0XB9, 0X4D, 0XB9, 0X4D, 0X9B, 0X68, 0X7F, 0X80,
+ 0X80, 0X80, 0X80, 0X80, 0X7F, 0X80, 0X9C, 0X67, 0XB9, 0X4D, 0XB9, 0X4D, 0XA3, 0X61, 0X80, 0X80,
+ 0X7F, 0X80, 0X7E, 0X81, 0X92, 0X70, 0XAF, 0X56, 0XC1, 0X46, 0XC5, 0X42, 0XBD, 0X4A, 0XA9, 0X5B,
+ 0XC8, 0X40, 0XC8, 0X40, 0XBB, 0X4B, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0XC4, 0X42,
+ 0XC8, 0X40, 0XBD, 0X48, 0X80, 0X80, 0X80, 0X80, 0XAF, 0X55, 0XC8, 0X40, 0XC8, 0X40, 0XCA, 0X3E,
+ 0XCA, 0X3E, 0XCA, 0X3E, 0XAB, 0X59, 0X94, 0X6D, 0XC8, 0X3F, 0XC8, 0X40, 0XB9, 0X4C, 0X80, 0X7F,
+ 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0XBA, 0X4B, 0XC8, 0X40, 0XC8, 0X3F, 0X92, 0X70, 0X80, 0X80,
+ 0X7F, 0X80, 0XA1, 0X62, 0XC4, 0X42, 0XC9, 0X3F, 0XC8, 0X40, 0XC7, 0X40, 0XC8, 0X3F, 0XC9, 0X3F,
+ 0XC6, 0X41, 0XC6, 0X41, 0XBA, 0X4C, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0XC3, 0X44,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0XB6, 0X4F,
+ 0XB4, 0X51, 0XB4, 0X51, 0X9E, 0X65, 0X80, 0X7F, 0XBC, 0X49, 0XC6, 0X41, 0XC6, 0X41, 0X8D, 0X73,
+ 0X80, 0X80, 0X80, 0X80, 0X8E, 0X72, 0XC6, 0X41, 0XC6, 0X41, 0XBA, 0X4B, 0X80, 0X7F, 0X7F, 0X80,
+ 0X9B, 0X67, 0XC7, 0X40, 0XC7, 0X40, 0XC8, 0X3F, 0XC3, 0X43, 0XBF, 0X47, 0XC3, 0X43, 0XC8, 0X3F,
+ 0XC6, 0X41, 0XC6, 0X41, 0XB9, 0X4D, 0X7C, 0X82, 0X7C, 0X82, 0X7C, 0X82, 0X7C, 0X82, 0XC2, 0X44,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0X88, 0X78,
+ 0X7D, 0X82, 0X7D, 0X82, 0X7F, 0X80, 0X7F, 0X80, 0XA1, 0X61, 0XC6, 0X41, 0XC6, 0X41, 0XA8, 0X5C,
+ 0X7F, 0X80, 0X7F, 0X80, 0XAA, 0X5A, 0XC6, 0X41, 0XC6, 0X41, 0X9E, 0X64, 0X7F, 0X80, 0X83, 0X7D,
+ 0XC0, 0X46, 0XC6, 0X41, 0XC7, 0X40, 0XB0, 0X54, 0X8D, 0X73, 0X84, 0X7B, 0X8D, 0X73, 0XAF, 0X56,
+ 0XC6, 0X41, 0XC6, 0X41, 0XBE, 0X48, 0X9B, 0X66, 0X9B, 0X66, 0X9B, 0X66, 0X9B, 0X66, 0XC4, 0X42,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0XA1, 0X61,
+ 0X9B, 0X67, 0X9B, 0X67, 0X8A, 0X76, 0X80, 0X80, 0X87, 0X79, 0XC5, 0X42, 0XC6, 0X41, 0XBF, 0X46,
+ 0X82, 0X7E, 0X82, 0X7D, 0XC1, 0X45, 0XC6, 0X41, 0XC2, 0X43, 0X85, 0X7B, 0X80, 0X80, 0X94, 0X6D,
+ 0XC7, 0X40, 0XC6, 0X41, 0XBC, 0X4A, 0X82, 0X7E, 0X7E, 0X81, 0X7F, 0X80, 0X7E, 0X81, 0X80, 0X7F,
+ 0XC6, 0X41, 0XC6, 0X41, 0XC6, 0X41, 0XC9, 0X3E, 0XC9, 0X3E, 0XC9, 0X3E, 0XC9, 0X3E, 0XC6, 0X41,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0XC8, 0X3F,
+ 0XC9, 0X3F, 0XC9, 0X3F, 0X9D, 0X66, 0X80, 0X80, 0X7F, 0X80, 0XB0, 0X54, 0XC6, 0X41, 0XC7, 0X40,
+ 0X96, 0X6C, 0X98, 0X6A, 0XC6, 0X41, 0XC6, 0X41, 0XAC, 0X58, 0X7F, 0X80, 0X80, 0X80, 0X9F, 0X64,
+ 0XC6, 0X41, 0XC6, 0X41, 0XA7, 0X5C, 0X7F, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0X80,
+ 0XC6, 0X41, 0XC6, 0X41, 0XC5, 0X42, 0XC2, 0X45, 0XC2, 0X45, 0XC2, 0X45, 0XC2, 0X45, 0XC6, 0X41,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0XC5, 0X42,
+ 0XC5, 0X42, 0XC5, 0X42, 0X9C, 0X67, 0X80, 0X80, 0X80, 0X80, 0X93, 0X6F, 0XC7, 0X40, 0XC6, 0X41,
+ 0XB1, 0X54, 0XB4, 0X51, 0XC6, 0X41, 0XC7, 0X41, 0X8F, 0X72, 0X80, 0X80, 0X80, 0X80, 0X9F, 0X63,
+ 0XC6, 0X41, 0XC6, 0X41, 0XA7, 0X5C, 0X7F, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0X80,
+ 0XC6, 0X41, 0XC6, 0X41, 0XBA, 0X4B, 0X86, 0X7A, 0X86, 0X7A, 0X86, 0X7A, 0X86, 0X7A, 0XC3, 0X43,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0X98, 0X6A,
+ 0X90, 0X71, 0X90, 0X71, 0X86, 0X7A, 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0XBD, 0X49, 0XC6, 0X41,
+ 0XC5, 0X42, 0XC6, 0X41, 0XC6, 0X41, 0XB9, 0X4B, 0X7F, 0X80, 0X80, 0X80, 0X80, 0X80, 0X95, 0X6D,
+ 0XC7, 0X40, 0XC6, 0X41, 0XBC, 0X4A, 0X82, 0X7E, 0X7E, 0X81, 0X7F, 0X80, 0X7E, 0X81, 0X80, 0X7F,
+ 0XC6, 0X41, 0XC6, 0X41, 0XB9, 0X4C, 0X7E, 0X81, 0X7E, 0X81, 0X7E, 0X81, 0X7E, 0X81, 0XC3, 0X44,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0X88, 0X78,
+ 0X7E, 0X81, 0X7E, 0X81, 0X7F, 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0X80, 0XA2, 0X61, 0XC6, 0X41,
+ 0XC6, 0X41, 0XC6, 0X41, 0XC6, 0X41, 0X9C, 0X66, 0X7F, 0X80, 0X80, 0X80, 0X80, 0X80, 0X83, 0X7D,
+ 0XC0, 0X46, 0XC6, 0X41, 0XC7, 0X40, 0XB0, 0X54, 0X8D, 0X73, 0X84, 0X7B, 0X8D, 0X74, 0XAE, 0X56,
+ 0XC6, 0X41, 0XC6, 0X41, 0XBA, 0X4C, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0XC3, 0X44,
+ 0XC6, 0X41, 0XBC, 0X49, 0X80, 0X80, 0X80, 0X80, 0XAE, 0X56, 0XC6, 0X41, 0XC6, 0X41, 0XB6, 0X4F,
+ 0XB3, 0X51, 0XB3, 0X51, 0X9D, 0X65, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X87, 0X79, 0XC5, 0X42,
+ 0XC6, 0X41, 0XC6, 0X41, 0XC2, 0X44, 0X84, 0X7B, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0X80,
+ 0X9B, 0X67, 0XC7, 0X40, 0XC7, 0X40, 0XC8, 0X3F, 0XC3, 0X43, 0XBF, 0X47, 0XC3, 0X43, 0XC8, 0X3F,
+ 0XC8, 0X40, 0XC8, 0X40, 0XBB, 0X4B, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0XC4, 0X42,
+ 0XC8, 0X40, 0XBD, 0X48, 0X80, 0X80, 0X80, 0X80, 0XAF, 0X55, 0XC8, 0X40, 0XC8, 0X40, 0XCA, 0X3E,
+ 0XCA, 0X3E, 0XCA, 0X3E, 0XAB, 0X59, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X7F, 0X80, 0XB1, 0X55,
+ 0XC8, 0X40, 0XC8, 0X40, 0XAB, 0X59, 0X7F, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80,
+ 0X7F, 0X80, 0XA1, 0X61, 0XC4, 0X42, 0XC8, 0X3F, 0XC8, 0X40, 0XC7, 0X40, 0XC8, 0X3F, 0XC9, 0X3F,
+ 0XB9, 0X4D, 0XB9, 0X4D, 0XAF, 0X56, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0XB7, 0X4F,
+ 0XB9, 0X4D, 0XB2, 0X54, 0X80, 0X80, 0X80, 0X80, 0XA6, 0X5E, 0XB9, 0X4D, 0XB9, 0X4D, 0XB9, 0X4D,
+ 0XB9, 0X4D, 0XB9, 0X4D, 0XA2, 0X62, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X93, 0X70,
+ 0XB9, 0X4D, 0XB9, 0X4D, 0X8E, 0X74, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80, 0X80,
+ 0X80, 0X80, 0X7F, 0X81, 0X92, 0X70, 0XB0, 0X55, 0XC1, 0X46, 0XC5, 0X42, 0XBD, 0X4A, 0XAA, 0X5B,
+};
+
+const UWORD8 gau1_ihevcd_logo_y[] =
+{
+ 0xfd, 0xfd, 0xfd, 0xfb, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfe, 0xfa, 0xfa, 0xfb, 0xfc, 0xfc, 0xfb, 0xfc,
+ 0xfd, 0xfc, 0xfa, 0xfd, 0xfc, 0xfb, 0xfb, 0xfd,
+ 0xfd, 0xfd, 0xfb, 0xfd, 0xfb, 0xfb, 0xfc, 0xfc,
+ 0xfa, 0xfa, 0xfb, 0xfd, 0xfb, 0xfd, 0xfd, 0xfc,
+ 0xfd, 0xfe, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfe, 0xfe, 0xfd, 0xfb, 0xfc, 0xfc,
+ 0xfc, 0xfd, 0xfc, 0xfc, 0xfb, 0xfc, 0xfc, 0xfe,
+ 0xfd, 0xfc, 0xfc, 0xfc, 0xfb, 0xfb, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfb, 0xfb, 0xfc, 0xfd, 0xfd,
+ 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfe, 0xfe,
+ 0xfd, 0xfe, 0xfd, 0xfd, 0xfe, 0xfc, 0xfc, 0xfc,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfe, 0xfe, 0xfd, 0xfc,
+ 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfb, 0xfb, 0xfc,
+ 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+ 0xfd, 0xfb, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfb,
+ 0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc,
+ 0xfc, 0xfc, 0xfb, 0xfb, 0xfb, 0xfb, 0xfc, 0xfc,
+ 0xfd, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfe,
+ 0xfc, 0xfb, 0xfb, 0xf9, 0xfa, 0xfb, 0xfa, 0xfb,
+ 0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfb,
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfa, 0xfb, 0xfc, 0xfd,
+ 0xfc, 0xfb, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfc,
+ 0xfd, 0xfc, 0xf2, 0xde, 0xd9, 0xe9, 0xf5, 0xf8,
+ 0xfb, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfc, 0xfa, 0xfe, 0xfd, 0xfa, 0xf7, 0xf6, 0xf9,
+ 0xfa, 0xfc, 0xfc, 0xfa, 0xfc, 0xfc, 0xfc, 0xfa,
+ 0xf9, 0xfa, 0xfc, 0xfb, 0xfd, 0xfb, 0xfc, 0xfd,
+ 0xfc, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc, 0xfe, 0xfa,
+ 0xfd, 0xfa, 0xfa, 0xf3, 0xb3, 0x9f, 0x97, 0xaa,
+ 0xce, 0xf4, 0xfa, 0xfd, 0xfc, 0xfe, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfc, 0xfb, 0xfd, 0xf9, 0xdb, 0xd5,
+ 0xd6, 0xd7, 0xd7, 0xdb, 0xf7, 0xfc, 0xfd, 0xfd,
+ 0xfc, 0xfc, 0xfc, 0xfa, 0xf9, 0xfb, 0xfe, 0xfd,
+ 0xfc, 0xfd, 0xfe, 0xfb, 0xfb, 0xfa, 0xfc, 0xfc,
+ 0xfb, 0xfd, 0xfb, 0xf9, 0xf5, 0xba, 0x98, 0xa0,
+ 0xa3, 0x9b, 0x96, 0xde, 0xfc, 0xf8, 0xfc, 0xfd,
+ 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfe, 0xfe, 0xfd, 0xf5,
+ 0xab, 0x9a, 0xa0, 0x9f, 0x99, 0xab, 0xf0, 0xfa,
+ 0xfc, 0xfd, 0xfa, 0xf8, 0xfb, 0xfc, 0xfb, 0xfc,
+ 0xfb, 0xfc, 0xfa, 0xf9, 0xfb, 0xfd, 0xfb, 0xfd,
+ 0xfa, 0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xf0, 0xa9,
+ 0xa1, 0xa1, 0xa2, 0xa1, 0x99, 0xd6, 0xf9, 0xfa,
+ 0xfb, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfe,
+ 0xfb, 0xf4, 0xaa, 0x9f, 0xa4, 0xa3, 0x9e, 0xaf,
+ 0xf3, 0xfd, 0xfa, 0xfd, 0xf8, 0xf2, 0xf3, 0xf0,
+ 0xf2, 0xf2, 0xf8, 0xfb, 0xfe, 0xf8, 0xfa, 0xf5,
+ 0xf3, 0xf4, 0xf6, 0xf2, 0xf5, 0xfc, 0xfa, 0xfd,
+ 0xf1, 0xaa, 0x9d, 0xa0, 0xa0, 0x9f, 0x99, 0xd6,
+ 0xf8, 0xfb, 0xfa, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfc, 0xfe, 0xfa, 0xf4, 0xac, 0x9e, 0xa0, 0xa0,
+ 0x9f, 0xb0, 0xf3, 0xfa, 0xfb, 0xfc, 0xdb, 0xb8,
+ 0xb9, 0xb7, 0xb9, 0xb0, 0xe4, 0xf6, 0xfc, 0xfb,
+ 0xeb, 0xbf, 0xb8, 0xba, 0xb9, 0xb5, 0xc2, 0xfd,
+ 0xfb, 0xfc, 0xf7, 0xbe, 0x97, 0x9e, 0xa1, 0x9b,
+ 0x9e, 0xe1, 0xf9, 0xfb, 0xfa, 0xfb, 0xfd, 0xfc,
+ 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc,
+ 0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc,
+ 0xfc, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfd,
+ 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+ 0xfc, 0xfd, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+ 0xfd, 0xfd, 0xfc, 0xfe, 0xfd, 0xf7, 0xac, 0x9e,
+ 0xa2, 0xa2, 0x9e, 0xaf, 0xf3, 0xfd, 0xfc, 0xfa,
+ 0xd3, 0x9b, 0x9d, 0x9f, 0x9b, 0x97, 0xd6, 0xf7,
+ 0xfc, 0xfa, 0xe3, 0xa5, 0x9a, 0xa0, 0x9d, 0x99,
+ 0xaf, 0xf6, 0xfc, 0xfd, 0xfc, 0xf6, 0xb4, 0xa5,
+ 0xa1, 0xac, 0xd4, 0xf7, 0xfa, 0xfc, 0xfc, 0xfd,
+ 0xfd, 0xfd, 0xfa, 0xfb, 0xfc, 0xfc, 0xfc, 0xfb,
+ 0xfb, 0xfb, 0xfd, 0xfa, 0xfd, 0xfc, 0xfa, 0xfa,
+ 0xfb, 0xfb, 0xfb, 0xfc, 0xfe, 0xfc, 0xfe, 0xfd,
+ 0xfb, 0xfc, 0xfd, 0xfd, 0xfa, 0xfa, 0xfa, 0xfc,
+ 0xfd, 0xfb, 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd,
+ 0xfc, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfe, 0xf8,
+ 0xac, 0x9e, 0xa2, 0xa3, 0x9e, 0xb0, 0xf4, 0xfc,
+ 0xfd, 0xfd, 0xd6, 0x9f, 0xa0, 0xa1, 0xa0, 0x9b,
+ 0xda, 0xf7, 0xfa, 0xfa, 0xe5, 0xa8, 0xa0, 0x9f,
+ 0xa0, 0x9c, 0xb0, 0xf4, 0xfb, 0xfd, 0xfd, 0xf9,
+ 0xf2, 0xe0, 0xd9, 0xe7, 0xf7, 0xfb, 0xfa, 0xfd,
+ 0xfa, 0xfd, 0xfb, 0xfe, 0xfe, 0xfc, 0xfb, 0xfc,
+ 0xfc, 0xfc, 0xfb, 0xfb, 0xfc, 0xfa, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfb, 0xfe, 0xfb, 0xfa, 0xfc,
+ 0xfc, 0xfb, 0xfe, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc,
+ 0xfd, 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc,
+ 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc, 0xfc,
+ 0xfe, 0xf7, 0xad, 0x9e, 0xa2, 0xa3, 0x9e, 0xaf,
+ 0xf3, 0xfc, 0xfc, 0xfd, 0xd5, 0xa1, 0xa2, 0xa0,
+ 0xa1, 0x9d, 0xdb, 0xf9, 0xf9, 0xfa, 0xe6, 0xa9,
+ 0x9e, 0xa2, 0xa2, 0x9e, 0xb1, 0xf6, 0xfd, 0xfb,
+ 0xfc, 0xfc, 0xf9, 0xf9, 0xf8, 0xf8, 0xfa, 0xfa,
+ 0xfa, 0xfc, 0xfb, 0xfd, 0xfc, 0xfa, 0xf9, 0xfa,
+ 0xf9, 0xf8, 0xfb, 0xfc, 0xfc, 0xfa, 0xfb, 0xfa,
+ 0xfb, 0xf9, 0xfb, 0xfa, 0xfc, 0xfc, 0xf9, 0xfb,
+ 0xfa, 0xfb, 0xfa, 0xf9, 0xfa, 0xfa, 0xfb, 0xfc,
+ 0xfc, 0xfb, 0xfb, 0xfa, 0xfc, 0xf7, 0xf8, 0xfa,
+ 0xfb, 0xfd, 0xfc, 0xfc, 0xfe, 0xfd, 0xfe, 0xfd,
+ 0xfc, 0xfd, 0xfe, 0xf8, 0xad, 0x9e, 0xa3, 0xa4,
+ 0x9f, 0xad, 0xf2, 0xfd, 0xfb, 0xf9, 0xd4, 0xa1,
+ 0xa1, 0xa0, 0xa2, 0x9d, 0xd8, 0xf7, 0xfb, 0xf8,
+ 0xe4, 0xab, 0xa1, 0xa4, 0xa3, 0x9e, 0xaf, 0xf3,
+ 0xfb, 0xfb, 0xfb, 0xfb, 0xf9, 0xf9, 0xf7, 0xf7,
+ 0xfa, 0xfa, 0xfb, 0xfd, 0xfc, 0xf9, 0xfd, 0xf9,
+ 0xf4, 0xe1, 0xda, 0xd8, 0xd9, 0xdd, 0xec, 0xf6,
+ 0xf5, 0xed, 0xef, 0xee, 0xee, 0xf0, 0xf0, 0xfb,
+ 0xf9, 0xef, 0xd8, 0xd6, 0xd8, 0xda, 0xe6, 0xf7,
+ 0xfc, 0xfc, 0xfc, 0xf9, 0xf3, 0xde, 0xd8, 0xd5,
+ 0xd7, 0xe7, 0xf7, 0xfb, 0xfb, 0xfe, 0xfe, 0xfd,
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfd, 0xf8, 0xad, 0x9e,
+ 0xa2, 0xa3, 0xa0, 0xaf, 0xef, 0xf0, 0xc0, 0xc2,
+ 0xb4, 0xa1, 0xa5, 0xa5, 0xa3, 0x9e, 0xb7, 0xc3,
+ 0xc4, 0xc4, 0xbe, 0xa6, 0xa3, 0xa2, 0xa3, 0xa0,
+ 0xa6, 0xc2, 0xc4, 0xc4, 0xc3, 0xc2, 0xc4, 0xc4,
+ 0xc1, 0xc2, 0xc3, 0xee, 0xfc, 0xfc, 0xfe, 0xfc,
+ 0xf5, 0xdd, 0xb7, 0xa3, 0x96, 0x98, 0x96, 0x97,
+ 0xaf, 0xc9, 0xdf, 0xb6, 0xb6, 0xb2, 0xb4, 0xb4,
+ 0xbd, 0xee, 0xd4, 0xb0, 0x9c, 0x98, 0x97, 0x96,
+ 0xa4, 0xbd, 0xf4, 0xf8, 0xf4, 0xd6, 0xb8, 0x9b,
+ 0x92, 0x98, 0x97, 0xa2, 0xc0, 0xea, 0xf8, 0xfe,
+ 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfe, 0xf7,
+ 0xab, 0x9f, 0xa2, 0xa2, 0x9f, 0xad, 0xeb, 0xe4,
+ 0x96, 0x9f, 0xa0, 0xa2, 0xa3, 0xa2, 0xa1, 0xa2,
+ 0x9f, 0x9e, 0x9d, 0x9c, 0x9e, 0xa2, 0xa0, 0xa1,
+ 0xa0, 0xa1, 0xa0, 0x9e, 0x9c, 0x9c, 0x9b, 0x9b,
+ 0x9b, 0x9c, 0x9b, 0x9a, 0x95, 0xdf, 0xfb, 0xfb,
+ 0xfb, 0xf0, 0xcc, 0x9f, 0xa0, 0x9f, 0x9d, 0x9f,
+ 0xa0, 0xa1, 0x9a, 0x9d, 0xb6, 0x9c, 0xa0, 0x9b,
+ 0x9f, 0x9e, 0xac, 0xc4, 0x9d, 0x9e, 0x9e, 0xa0,
+ 0xa1, 0xa0, 0x9e, 0x9c, 0xad, 0xe3, 0xce, 0xa5,
+ 0x9d, 0x9e, 0xa2, 0xa1, 0x9e, 0x9b, 0x9a, 0xb7,
+ 0xed, 0xfc, 0xfe, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+ 0xfd, 0xf7, 0xab, 0x9f, 0xa1, 0xa1, 0x9f, 0xaf,
+ 0xeb, 0xe5, 0x9c, 0xa2, 0xa2, 0xa2, 0xa2, 0xa2,
+ 0xa0, 0xa2, 0xa1, 0xa1, 0xa2, 0xa3, 0xa3, 0xa5,
+ 0xa3, 0xa4, 0xa1, 0xa1, 0xa3, 0xa3, 0xa4, 0xa5,
+ 0xa3, 0xa3, 0xa2, 0xa0, 0x9f, 0xa1, 0x9d, 0xe0,
+ 0xf9, 0xf8, 0xf9, 0xcd, 0x9b, 0x9f, 0xa0, 0xa2,
+ 0x9e, 0xa0, 0xa3, 0xa0, 0xa4, 0xa2, 0xa1, 0xa3,
+ 0xa5, 0xa3, 0xa5, 0xa1, 0xa5, 0xa0, 0xa0, 0xa0,
+ 0xa1, 0xa4, 0xa2, 0xa1, 0xa1, 0xa3, 0x9c, 0xb2,
+ 0xa8, 0x9c, 0xa2, 0xa1, 0xa3, 0xa3, 0xa4, 0xa0,
+ 0xa2, 0x9b, 0xdb, 0xfa, 0xfe, 0xfd, 0xfd, 0xfe,
+ 0xfc, 0xfd, 0xfe, 0xf8, 0xab, 0x9e, 0xa1, 0xa1,
+ 0x9e, 0xb0, 0xeb, 0xe7, 0x9c, 0xa1, 0xa0, 0xa2,
+ 0xa3, 0xa2, 0xa1, 0xa4, 0xa1, 0xa2, 0xa2, 0xa2,
+ 0xa2, 0xa3, 0xa3, 0xa2, 0xa0, 0xa2, 0xa2, 0xa1,
+ 0xa1, 0xa2, 0xa2, 0xa2, 0xa2, 0xa2, 0xa2, 0xa0,
+ 0x9b, 0xde, 0xf7, 0xf8, 0xed, 0x9d, 0xa0, 0xa0,
+ 0xa3, 0xa0, 0xa2, 0x9f, 0xa2, 0x9f, 0xa0, 0x9f,
+ 0xa5, 0xa1, 0xa3, 0xa2, 0xa4, 0xa2, 0xa2, 0xa1,
+ 0x9b, 0x9f, 0x9e, 0xa3, 0xa4, 0xa3, 0xa3, 0xa0,
+ 0xa6, 0x9f, 0xa2, 0x9e, 0x9a, 0x9e, 0xa0, 0xa3,
+ 0xa1, 0x9f, 0xa1, 0x9d, 0xbc, 0xfb, 0xfe, 0xff,
+ 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xf8, 0xac, 0x9f,
+ 0xa2, 0xa2, 0x9f, 0xab, 0xea, 0xe5, 0x94, 0x9c,
+ 0x9c, 0xa1, 0xa3, 0xa2, 0xa0, 0xa1, 0x9d, 0x99,
+ 0x9a, 0x98, 0x9c, 0xa1, 0xa0, 0xa1, 0xa1, 0xa2,
+ 0xa0, 0x9b, 0x9a, 0x9a, 0x9b, 0x9c, 0xa3, 0xa2,
+ 0xa2, 0x9f, 0x9c, 0xe1, 0xf6, 0xf6, 0xbb, 0x9b,
+ 0x9e, 0xa3, 0xa0, 0xa4, 0x9e, 0x9f, 0xb3, 0xc0,
+ 0xa9, 0x9b, 0xa0, 0xa2, 0xa0, 0xa2, 0xa2, 0xa3,
+ 0x9e, 0xa8, 0xcf, 0xdd, 0xd1, 0xa8, 0xa1, 0xa5,
+ 0xa4, 0xa4, 0x9e, 0xa4, 0x99, 0xc9, 0xd7, 0xcf,
+ 0xa5, 0x9e, 0xa1, 0xa1, 0xa1, 0x9e, 0xa4, 0xfc,
+ 0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xf8,
+ 0xac, 0x9f, 0xa3, 0xa3, 0x9e, 0xad, 0xf1, 0xf9,
+ 0xf6, 0xf3, 0xcd, 0x9f, 0xa3, 0xa0, 0xa2, 0x9c,
+ 0xd6, 0xf1, 0xf2, 0xf3, 0xe3, 0xa8, 0xa0, 0xa4,
+ 0xa4, 0x9e, 0xae, 0xf0, 0xf3, 0xf3, 0xf0, 0xb5,
+ 0x9e, 0xa2, 0xa2, 0x9f, 0x99, 0xe1, 0xf9, 0xe9,
+ 0xac, 0x9e, 0x9f, 0xa1, 0xa1, 0x9e, 0xb1, 0xe9,
+ 0xf6, 0xf7, 0xf1, 0xda, 0xa7, 0x9f, 0xa4, 0xa3,
+ 0xa0, 0xa2, 0x9a, 0xe3, 0xf7, 0xf5, 0xf6, 0xd2,
+ 0x9f, 0xa2, 0xa0, 0xa5, 0xa1, 0x9b, 0xd4, 0xf9,
+ 0xfa, 0xf8, 0xdc, 0x9c, 0xa1, 0xa3, 0xa2, 0x9f,
+ 0xa7, 0xfb, 0xff, 0xfe, 0xfe, 0xfe, 0xfd, 0xfc,
+ 0xfd, 0xf8, 0xac, 0x9f, 0xa3, 0xa4, 0x9e, 0xaf,
+ 0xf4, 0xfd, 0xfe, 0xfb, 0xd6, 0xa0, 0xa3, 0xa1,
+ 0xa0, 0x9c, 0xd9, 0xf8, 0xfb, 0xfa, 0xe6, 0xaa,
+ 0xa0, 0xa3, 0xa3, 0x9f, 0xb0, 0xf6, 0xf9, 0xfb,
+ 0xf4, 0xb9, 0x9d, 0xa1, 0xa3, 0x9f, 0x9a, 0xe1,
+ 0xf7, 0xda, 0xa3, 0xa1, 0xa1, 0xa1, 0xa0, 0xa5,
+ 0xe5, 0xf6, 0xfa, 0xf9, 0xf8, 0xf2, 0xd4, 0x9d,
+ 0xa4, 0xa3, 0xa3, 0xa2, 0x9b, 0xf1, 0xf8, 0xf9,
+ 0xfa, 0xe4, 0xaa, 0x9f, 0xa1, 0xa6, 0x9f, 0xaa,
+ 0xe5, 0xfa, 0xfe, 0xf9, 0xe5, 0xa5, 0x9e, 0xa4,
+ 0xa2, 0x9f, 0xa5, 0xfa, 0xfe, 0xfe, 0xfe, 0xfe,
+ 0xfd, 0xfc, 0xfd, 0xf7, 0xac, 0x9e, 0xa2, 0xa4,
+ 0x9f, 0xaf, 0xf5, 0xfd, 0xfd, 0xfd, 0xd6, 0x9e,
+ 0xa3, 0xa4, 0xa0, 0x9c, 0xdb, 0xfa, 0xfa, 0xfb,
+ 0xe8, 0xaa, 0xa0, 0xa4, 0xa2, 0x9e, 0xaf, 0xf6,
+ 0xfa, 0xfc, 0xf6, 0xb7, 0x9d, 0xa0, 0xa2, 0x9f,
+ 0x9d, 0xe1, 0xf6, 0xcd, 0x9f, 0xa3, 0xa3, 0xa2,
+ 0x9e, 0xbd, 0xf0, 0xf7, 0xfc, 0xfa, 0xfb, 0xfa,
+ 0xe7, 0xa3, 0xa2, 0xa2, 0xa4, 0xa3, 0xa1, 0xf5,
+ 0xf8, 0xfb, 0xfb, 0xed, 0xac, 0x9d, 0xa1, 0xa3,
+ 0x9e, 0xb6, 0xf0, 0xf9, 0xfc, 0xfc, 0xec, 0xaf,
+ 0x9e, 0xa5, 0xa1, 0x9f, 0xa5, 0xfb, 0xfe, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xf7, 0xac, 0x9e,
+ 0xa2, 0xa3, 0x9f, 0xaf, 0xf5, 0xfe, 0xfd, 0xfc,
+ 0xd3, 0x9f, 0xa1, 0xa2, 0xa0, 0x97, 0xda, 0xf8,
+ 0xf9, 0xf8, 0xe5, 0xa7, 0x9e, 0xa3, 0xa0, 0x9c,
+ 0xaf, 0xf7, 0xfa, 0xfd, 0xf5, 0xb6, 0x9e, 0xa1,
+ 0xa3, 0x9f, 0x99, 0xe2, 0xf5, 0xca, 0x9a, 0xa2,
+ 0xa3, 0xa3, 0xa0, 0xbf, 0xf5, 0xfb, 0xfb, 0xf8,
+ 0xfa, 0xf6, 0xe9, 0xa3, 0xa3, 0xa2, 0xa3, 0xa1,
+ 0xa9, 0xf5, 0xfa, 0xfc, 0xfb, 0xf1, 0xad, 0x9c,
+ 0xa3, 0xa4, 0x9d, 0xb9, 0xf0, 0xfb, 0xfc, 0xfb,
+ 0xec, 0xb3, 0x9d, 0xa3, 0xa2, 0x9e, 0xa5, 0xfb,
+ 0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xf7,
+ 0xad, 0x9e, 0xa2, 0xa3, 0x9e, 0xb0, 0xf5, 0xfd,
+ 0xfd, 0xfd, 0xd5, 0xa0, 0xa2, 0xa2, 0xa2, 0x9a,
+ 0xdb, 0xfb, 0xfb, 0xfa, 0xe8, 0xaa, 0xa0, 0xa4,
+ 0xa2, 0x9c, 0xb1, 0xf8, 0xfa, 0xfc, 0xf7, 0xb8,
+ 0x9e, 0xa2, 0xa3, 0x9f, 0x99, 0xe5, 0xf7, 0xd2,
+ 0xa1, 0xa2, 0xa2, 0xa4, 0x9f, 0xbe, 0xf0, 0xfa,
+ 0xfa, 0xf9, 0xfa, 0xf8, 0xe6, 0x9f, 0xa3, 0xa3,
+ 0xa2, 0xa2, 0xad, 0xf7, 0xfc, 0xfa, 0xfc, 0xf5,
+ 0xae, 0x9f, 0xa3, 0xa4, 0x9d, 0xb7, 0xed, 0xfd,
+ 0xfc, 0xfd, 0xec, 0xb3, 0x9f, 0xa4, 0xa2, 0x9e,
+ 0xa5, 0xfb, 0xfe, 0xfe, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfe, 0xf7, 0xad, 0x9f, 0xa2, 0xa3, 0x9f, 0xb0,
+ 0xf4, 0xfd, 0xfd, 0xfd, 0xd5, 0xa0, 0xa3, 0xa4,
+ 0xa0, 0x9b, 0xdb, 0xfa, 0xfc, 0xfb, 0xe8, 0xaa,
+ 0xa0, 0xa4, 0xa1, 0x9c, 0xb0, 0xf8, 0xfb, 0xfb,
+ 0xf7, 0xb8, 0x9e, 0xa2, 0xa4, 0xa0, 0x9a, 0xe5,
+ 0xf6, 0xdf, 0xa9, 0xa3, 0xa2, 0xa1, 0x9e, 0x9d,
+ 0xdc, 0xf5, 0xfa, 0xf7, 0xf7, 0xf2, 0xc3, 0x9d,
+ 0xa3, 0xa4, 0xa0, 0xa2, 0xac, 0xf6, 0xfb, 0xfa,
+ 0xfc, 0xf5, 0xad, 0x9d, 0xa3, 0xa3, 0x9d, 0xb5,
+ 0xef, 0xfc, 0xfc, 0xfc, 0xed, 0xb2, 0x9e, 0xa4,
+ 0xa1, 0x9f, 0xa6, 0xfb, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfc, 0xfd, 0xf8, 0xad, 0x9e, 0xa2, 0xa4,
+ 0x9f, 0xae, 0xf4, 0xfe, 0xfe, 0xfd, 0xd6, 0xa1,
+ 0xa3, 0xa4, 0xa1, 0x9b, 0xda, 0xf9, 0xfb, 0xfa,
+ 0xe8, 0xaa, 0x9f, 0xa3, 0xa1, 0x9d, 0xb0, 0xf9,
+ 0xf9, 0xfc, 0xf5, 0xb8, 0x9e, 0xa2, 0xa3, 0x9f,
+ 0x99, 0xe2, 0xfb, 0xf1, 0xaf, 0x9d, 0xa1, 0xa2,
+ 0x9f, 0xa5, 0xa4, 0xd5, 0xe6, 0xeb, 0xe4, 0xc3,
+ 0xa1, 0xa1, 0xa3, 0xa5, 0xa0, 0xa2, 0xab, 0xf4,
+ 0xfb, 0xfb, 0xfc, 0xf4, 0xac, 0x9d, 0xa1, 0xa3,
+ 0x9d, 0xb5, 0xef, 0xfc, 0xfb, 0xfc, 0xee, 0xb2,
+ 0x9f, 0xa3, 0xa1, 0x9f, 0xa6, 0xfb, 0xfe, 0xfd,
+ 0xfe, 0xfe, 0xfd, 0xfd, 0xfd, 0xf8, 0xad, 0x9e,
+ 0xa3, 0xa3, 0x9f, 0xae, 0xf5, 0xfe, 0xfd, 0xfd,
+ 0xd6, 0xa1, 0xa2, 0xa5, 0xa1, 0x9c, 0xdb, 0xfa,
+ 0xfc, 0xfa, 0xe8, 0xa9, 0x9f, 0xa4, 0xa2, 0x9d,
+ 0xb0, 0xf9, 0xfb, 0xfc, 0xf6, 0xb6, 0x9e, 0xa2,
+ 0xa2, 0x9f, 0x9b, 0xe3, 0xf9, 0xf6, 0xce, 0x99,
+ 0xa1, 0xa4, 0xa1, 0xa6, 0x9e, 0x9e, 0xac, 0xb1,
+ 0xa8, 0x9c, 0xa3, 0xa5, 0xa3, 0xa4, 0xa2, 0xa2,
+ 0xac, 0xf4, 0xf9, 0xfc, 0xfc, 0xf4, 0xad, 0x9d,
+ 0xa1, 0xa4, 0x9d, 0xb5, 0xf0, 0xfc, 0xfb, 0xfb,
+ 0xec, 0xb2, 0x9d, 0xa4, 0xa1, 0x9f, 0xa5, 0xfa,
+ 0xfe, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xf6,
+ 0xab, 0x9e, 0xa3, 0xa2, 0x9e, 0xae, 0xf4, 0xfe,
+ 0xfe, 0xfc, 0xd5, 0xa1, 0xa2, 0xa3, 0xa0, 0x9c,
+ 0xda, 0xfa, 0xfd, 0xf9, 0xe7, 0xaa, 0x9f, 0xa3,
+ 0xa2, 0x9c, 0xb1, 0xf8, 0xfa, 0xfb, 0xf7, 0xb6,
+ 0x9f, 0xa1, 0xa2, 0xa1, 0x9b, 0xe3, 0xfc, 0xfa,
+ 0xf4, 0xa4, 0xa0, 0xa2, 0xa0, 0xa1, 0xa3, 0x9d,
+ 0x9f, 0x9e, 0x9e, 0xa1, 0xa3, 0xa2, 0xa3, 0xa3,
+ 0xa1, 0xa1, 0xac, 0xf4, 0xf9, 0xfa, 0xfc, 0xf2,
+ 0xad, 0x9d, 0xa1, 0xa4, 0x9d, 0xb5, 0xee, 0xfc,
+ 0xfb, 0xfb, 0xed, 0xb2, 0x9f, 0xa5, 0xa2, 0x9e,
+ 0xa6, 0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc,
+ 0xf9, 0xf5, 0xab, 0x9e, 0xa1, 0xa0, 0x9f, 0xaf,
+ 0xf1, 0xfa, 0xfb, 0xfb, 0xd4, 0x9f, 0xa2, 0xa0,
+ 0xa0, 0x9a, 0xd9, 0xf4, 0xfd, 0xf9, 0xe4, 0xa8,
+ 0x9f, 0xa2, 0xa2, 0x9b, 0xb0, 0xf6, 0xfa, 0xfa,
+ 0xf3, 0xb7, 0x9c, 0xa1, 0xa1, 0x9f, 0x9d, 0xe0,
+ 0xfa, 0xfc, 0xfa, 0xdf, 0x9f, 0x9e, 0x9f, 0x9f,
+ 0xa0, 0xa4, 0xa3, 0xa2, 0xa0, 0xa0, 0xa2, 0xa3,
+ 0xa0, 0xa1, 0xa1, 0x9d, 0xad, 0xf4, 0xfa, 0xf9,
+ 0xf7, 0xef, 0xb0, 0x9b, 0xa0, 0xa0, 0x9a, 0xb6,
+ 0xed, 0xf9, 0xfd, 0xf7, 0xea, 0xb2, 0x9d, 0xa3,
+ 0xa1, 0x9f, 0xa7, 0xf7, 0xfb, 0xfd, 0xfe, 0xfe,
+ 0xff, 0xfe, 0xfa, 0xf4, 0xa8, 0x9f, 0xa2, 0xa3,
+ 0x9d, 0xad, 0xf1, 0xfb, 0xfb, 0xfc, 0xd3, 0x9e,
+ 0xa3, 0xa3, 0xa1, 0x98, 0xd8, 0xf8, 0xfc, 0xf9,
+ 0xe4, 0xaa, 0x9f, 0xa2, 0xa2, 0x9b, 0xac, 0xf7,
+ 0xfb, 0xf9, 0xf6, 0xb5, 0x9d, 0xa1, 0xa1, 0x9e,
+ 0x9a, 0xe0, 0xf9, 0xfb, 0xfb, 0xf7, 0xde, 0xae,
+ 0x98, 0xa4, 0xa2, 0xa4, 0xa2, 0xa1, 0x9c, 0xa4,
+ 0xc7, 0x9d, 0x9b, 0x9e, 0x9e, 0x9d, 0xa9, 0xf7,
+ 0xfb, 0xfa, 0xfa, 0xf3, 0xac, 0x9a, 0xa1, 0x9f,
+ 0x96, 0xb1, 0xee, 0xfc, 0xf8, 0xfc, 0xe9, 0xad,
+ 0x9b, 0xa0, 0x9d, 0x9b, 0x9e, 0xf9, 0xfc, 0xfd,
+ 0xfe, 0xfe, 0xfe, 0xfe, 0xfa, 0xf6, 0xb3, 0xa4,
+ 0xaa, 0xa8, 0xa4, 0xb5, 0xf3, 0xfd, 0xfc, 0xfa,
+ 0xd7, 0xa5, 0xa7, 0xa8, 0xa7, 0xa5, 0xdc, 0xfb,
+ 0xfc, 0xf9, 0xe6, 0xb2, 0xa6, 0xa9, 0xa6, 0xa3,
+ 0xb5, 0xf6, 0xfb, 0xfa, 0xf5, 0xbc, 0xa3, 0xa6,
+ 0xa7, 0xa5, 0xa3, 0xe4, 0xfb, 0xfe, 0xfa, 0xf9,
+ 0xfa, 0xee, 0xcf, 0xb1, 0xa0, 0x9c, 0x9c, 0x9f,
+ 0xbf, 0xe0, 0xef, 0xc5, 0xc9, 0xc7, 0xc6, 0xc8,
+ 0xcd, 0xfb, 0xfc, 0xfb, 0xfc, 0xfb, 0xcf, 0xc6,
+ 0xc4, 0xca, 0xc7, 0xd6, 0xf6, 0xfc, 0xfb, 0xfc,
+ 0xf3, 0xd2, 0xc6, 0xc7, 0xc5, 0xc7, 0xcd, 0xfb,
+ 0xfd, 0xfe, 0xfe, 0xfe, 0xfd, 0xfd, 0xfe, 0xfb,
+ 0xec, 0xe7, 0xe8, 0xe9, 0xeb, 0xeb, 0xfa, 0xfb,
+ 0xfc, 0xfc, 0xf7, 0xe9, 0xea, 0xe7, 0xe7, 0xe7,
+ 0xf5, 0xfd, 0xfb, 0xfb, 0xf8, 0xe8, 0xe6, 0xe6,
+ 0xe7, 0xe9, 0xea, 0xfc, 0xf8, 0xfa, 0xfb, 0xf1,
+ 0xe9, 0xea, 0xe9, 0xe7, 0xea, 0xf8, 0xfc, 0xfd,
+ 0xfc, 0xfb, 0xfb, 0xfa, 0xf6, 0xef, 0xe7, 0xe5,
+ 0xe3, 0xe6, 0xf5, 0xf7, 0xfb, 0xfb, 0xfa, 0xfb,
+ 0xfa, 0xfb, 0xfd, 0xfd, 0xfe, 0xfb, 0xfb, 0xfc,
+ 0xfa, 0xfb, 0xf9, 0xf9, 0xf7, 0xf9, 0xfc, 0xfc,
+ 0xfa, 0xfa, 0xfb, 0xfa, 0xf7, 0xf8, 0xf9, 0xfb,
+ 0xfc, 0xfc, 0xfc, 0xfd, 0xfb, 0xfb, 0xfd, 0xfb,
+ 0xfc, 0xfb, 0xfb, 0xfa, 0xf9, 0xfa, 0xfc, 0xfa,
+ 0xfb, 0xfd, 0xfb, 0xfb, 0xfc, 0xfa, 0xfb, 0xfc,
+ 0xf9, 0xfb, 0xfc, 0xfb, 0xfd, 0xfc, 0xfd, 0xfa,
+ 0xfa, 0xfd, 0xf9, 0xf9, 0xfa, 0xfb, 0xfb, 0xfd,
+ 0xfa, 0xfd, 0xfe, 0xfd, 0xfa, 0xfa, 0xfb, 0xfc,
+ 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfd, 0xfb, 0xfc,
+ 0xfb, 0xfc, 0xfb, 0xfc, 0xfb, 0xfe, 0xfc, 0xfc,
+ 0xfc, 0xff, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd,
+ 0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfe,
+ 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc,
+ 0xfd, 0xff, 0xfe, 0xfd, 0xfd, 0xfc, 0xfb, 0xfb,
+ 0xfc, 0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfd,
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd,
+ 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfb,
+ 0xfb, 0xfd, 0xfd, 0xfe, 0xfe, 0xfe, 0xfd, 0xfd,
+ 0xfe, 0xfc, 0xfd, 0xfd, 0xfd, 0xfb, 0xfa, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfc, 0xfb, 0xfb, 0xfc, 0xfc,
+ 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xff, 0xfe, 0xfc,
+ 0xfd, 0xfc, 0xfd, 0xfe, 0xfd, 0xfd, 0xfe, 0xfe,
+ 0xfe, 0xfd, 0xfd, 0xfb, 0xfd, 0xfb, 0xfc, 0xfc,
+ 0xfb, 0xfd, 0xfd, 0xff, 0xfe, 0xfe, 0xfd, 0xfd,
+ 0xfc, 0xfb, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfd,
+ 0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfc, 0xfd,
+ 0xfc, 0xfc, 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc,
+ 0xfc, 0xfc, 0xfb, 0xfd, 0xfd, 0xfe, 0xfd, 0xfe,
+ 0xfd, 0xfd, 0xfe, 0xfd, 0xfc, 0xfc, 0xfd, 0xfc,
+ 0xfb, 0xfc, 0xfd, 0xfd, 0xfd, 0xfb, 0xfc, 0xfb,
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfb, 0xfb, 0xfc,
+ 0xfc, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd,
+ 0xfd, 0xfd, 0xfd, 0xfc, 0xfc, 0xfb, 0xfb, 0xfc,
+ 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfd, 0xfd,
+ 0xfb, 0xfc, 0xfc, 0xfb, 0xfd, 0xfc, 0xfc, 0xfc,
+ 0xfd, 0xfd, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfc,
+ 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd,
+ 0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfb, 0xfc, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd,
+ 0xfb, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc,
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfd, 0xfd, 0xfc,
+ 0xfb, 0xfb, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc,
+ 0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc,
+ 0xfc, 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
+ 0xfc, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc, 0xfd, 0xfd,
+ 0xfc, 0xfc, 0xfb, 0xfc, 0xfd, 0xfc, 0xfc, 0xfc,
+ 0xfc, 0xfb, 0xfd, 0xfc, 0xfc, 0xfb, 0xfc, 0xfc,
+ 0xfc, 0xfc, 0xfd, 0xfd, 0xfc, 0xfd, 0xfc, 0xfc,
+ 0xfd, 0xfd, 0xfc, 0xfd, 0xfd, 0xfd, 0xfd, 0xfd,
+ 0xfd, 0xfc, 0xfd, 0xfb, 0xfb, 0xfd, 0xfc, 0xfc,
+ 0xfd, 0xfd, 0xfb, 0xfd, 0xfe, 0xfe, 0xfd, 0xfd,
+ 0xfd, 0xfc, 0xfc, 0xfc, 0xfc, 0xfd, 0xfc, 0xfb,
+ 0xfd, 0xfd, 0xfc, 0xfc, 0xfc, 0xfb, 0xfc, 0xfb,
+ 0xfd, 0xfe, 0xfe, 0xfd, 0xfb, 0xfb, 0xfc, 0xfd,
+ 0xfd, 0xfc, 0xfd, 0xfd, 0xfc, 0xfc, 0xfb, 0xfa,
+ 0xfc, 0xfb, 0xfb, 0xfb, 0xfd, 0xfd, 0xfc, 0xfb,
+ 0xfc, 0xfc, 0xfc, 0xfc, 0xfb, 0xfd, 0xfb, 0xfa,
+};
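+
+/* U (Cb) chroma plane of the decoder logo, 4:2:0 planar layout */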
+const UWORD8 gau1_ihevcd_logo_420p_u[] =
+{
+ 0x7F, 0x7D, 0x7F, 0x80, 0x7D, 0x7E, 0x7D, 0x82,
+ 0x80, 0x81, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7E,
+ 0x7B, 0x7D, 0x7D, 0x81, 0x7E, 0x7D, 0x80, 0x80,
+ 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80,
+ 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x7F,
+ 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x81, 0x81, 0x80,
+ 0x81, 0x81, 0x81, 0x7F, 0x7D, 0x7E, 0x81, 0x7D,
+ 0x7B, 0x7E, 0x7F, 0x7E, 0x80, 0x80, 0x80, 0x78,
+ 0x78, 0x7C, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x7F, 0x7D, 0x7C, 0x6E, 0x69, 0x70, 0x7B,
+ 0x7E, 0x7D, 0x82, 0x82, 0x7F, 0x80, 0x80, 0x82,
+ 0x7E, 0x80, 0x80, 0x71, 0x4B, 0x4A, 0x64, 0x7A,
+ 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80,
+ 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F,
+ 0x77, 0x54, 0x43, 0x53, 0x76, 0x7F, 0x75, 0x75,
+ 0x77, 0x7E, 0x80, 0x7A, 0x78, 0x74, 0x7A, 0x7E,
+ 0x66, 0x39, 0x34, 0x57, 0x79, 0x7F, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F,
+ 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x79, 0x49, 0x39,
+ 0x4A, 0x77, 0x7D, 0x5C, 0x4F, 0x55, 0x6F, 0x79,
+ 0x64, 0x52, 0x52, 0x6B, 0x80, 0x70, 0x4F, 0x47,
+ 0x6A, 0x7B, 0x7E, 0x7E, 0x80, 0x7F, 0x7D, 0x7F,
+ 0x7E, 0x7F, 0x7D, 0x7E, 0x7E, 0x7D, 0x7E, 0x80,
+ 0x80, 0x7F, 0x7E, 0x81, 0x80, 0x7D, 0x7D, 0x80,
+ 0x7F, 0x7E, 0x7A, 0x48, 0x3A, 0x45, 0x78, 0x7E,
+ 0x55, 0x3B, 0x42, 0x6D, 0x7F, 0x5C, 0x3D, 0x40,
+ 0x64, 0x80, 0x7E, 0x76, 0x6C, 0x7F, 0x7D, 0x80,
+ 0x7E, 0x7A, 0x7B, 0x7C, 0x80, 0x80, 0x80, 0x7E,
+ 0x80, 0x7F, 0x79, 0x7B, 0x80, 0x80, 0x80, 0x80,
+ 0x7A, 0x79, 0x7F, 0x81, 0x80, 0x7F, 0x7E, 0x7A,
+ 0x48, 0x3A, 0x45, 0x74, 0x6D, 0x4E, 0x3B, 0x3F,
+ 0x5F, 0x6B, 0x51, 0x3B, 0x3E, 0x58, 0x6B, 0x6A,
+ 0x6C, 0x68, 0x76, 0x81, 0x7F, 0x74, 0x60, 0x57,
+ 0x5C, 0x66, 0x67, 0x65, 0x63, 0x69, 0x65, 0x59,
+ 0x5A, 0x65, 0x75, 0x6F, 0x62, 0x57, 0x5B, 0x6D,
+ 0x7A, 0x81, 0x7D, 0x7F, 0x79, 0x49, 0x3A, 0x48,
+ 0x6D, 0x44, 0x3E, 0x3E, 0x3D, 0x3C, 0x3C, 0x3B,
+ 0x3A, 0x3B, 0x3F, 0x3F, 0x3E, 0x3A, 0x3D, 0x5B,
+ 0x7C, 0x6E, 0x54, 0x3E, 0x34, 0x39, 0x3D, 0x3D,
+ 0x3D, 0x3C, 0x43, 0x44, 0x3B, 0x38, 0x3D, 0x56,
+ 0x49, 0x3B, 0x3D, 0x3B, 0x49, 0x6E, 0x7F, 0x80,
+ 0x7E, 0x79, 0x49, 0x3A, 0x45, 0x6B, 0x47, 0x43,
+ 0x3A, 0x3C, 0x40, 0x42, 0x3D, 0x39, 0x3D, 0x40,
+ 0x40, 0x40, 0x3F, 0x3F, 0x54, 0x7A, 0x56, 0x3B,
+ 0x3B, 0x42, 0x48, 0x42, 0x3F, 0x3D, 0x3C, 0x45,
+ 0x51, 0x4D, 0x41, 0x3B, 0x3B, 0x4B, 0x52, 0x43,
+ 0x35, 0x45, 0x6D, 0x7E, 0x7E, 0x7E, 0x7A, 0x49,
+ 0x3A, 0x46, 0x77, 0x76, 0x4E, 0x3B, 0x42, 0x62,
+ 0x77, 0x56, 0x3A, 0x3E, 0x5F, 0x75, 0x5F, 0x3C,
+ 0x3B, 0x59, 0x6B, 0x46, 0x39, 0x46, 0x66, 0x75,
+ 0x6D, 0x51, 0x3D, 0x3C, 0x58, 0x7A, 0x6E, 0x44,
+ 0x39, 0x43, 0x6F, 0x77, 0x5B, 0x3B, 0x3C, 0x66,
+ 0x7F, 0x7F, 0x7E, 0x7A, 0x48, 0x3A, 0x45, 0x7A,
+ 0x7B, 0x4E, 0x3C, 0x44, 0x68, 0x7C, 0x5C, 0x3B,
+ 0x40, 0x63, 0x7D, 0x65, 0x3B, 0x3E, 0x5B, 0x65,
+ 0x3F, 0x3A, 0x4D, 0x72, 0x81, 0x7C, 0x5A, 0x3C,
+ 0x3D, 0x5C, 0x7E, 0x71, 0x46, 0x3B, 0x46, 0x75,
+ 0x7C, 0x61, 0x3C, 0x3C, 0x66, 0x7F, 0x7F, 0x7E,
+ 0x7A, 0x49, 0x3B, 0x47, 0x7B, 0x7B, 0x50, 0x3D,
+ 0x46, 0x69, 0x7F, 0x5D, 0x3E, 0x40, 0x64, 0x7F,
+ 0x67, 0x3D, 0x3D, 0x5B, 0x69, 0x43, 0x3D, 0x4B,
+ 0x6D, 0x7C, 0x78, 0x58, 0x3D, 0x3E, 0x5E, 0x7F,
+ 0x73, 0x48, 0x3E, 0x4B, 0x76, 0x7E, 0x63, 0x3D,
+ 0x3C, 0x67, 0x7F, 0x7F, 0x7E, 0x79, 0x48, 0x3A,
+ 0x46, 0x7B, 0x7C, 0x51, 0x3E, 0x45, 0x68, 0x7D,
+ 0x5D, 0x3B, 0x3F, 0x64, 0x7E, 0x68, 0x3E, 0x38,
+ 0x59, 0x74, 0x51, 0x3D, 0x3F, 0x50, 0x5A, 0x53,
+ 0x43, 0x3C, 0x3F, 0x5D, 0x7F, 0x72, 0x48, 0x3C,
+ 0x4A, 0x77, 0x7E, 0x62, 0x3E, 0x3C, 0x67, 0x80,
+ 0x80, 0x7F, 0x79, 0x4A, 0x38, 0x49, 0x79, 0x7C,
+ 0x51, 0x3A, 0x40, 0x67, 0x7B, 0x5A, 0x39, 0x3C,
+ 0x63, 0x7E, 0x66, 0x3D, 0x37, 0x5A, 0x7B, 0x65,
+ 0x47, 0x38, 0x39, 0x3F, 0x3B, 0x3B, 0x3B, 0x3D,
+ 0x5A, 0x7E, 0x71, 0x46, 0x3B, 0x47, 0x74, 0x7E,
+ 0x5E, 0x3A, 0x3E, 0x68, 0x7F, 0x7E, 0x80, 0x7A,
+ 0x58, 0x4A, 0x5A, 0x79, 0x7E, 0x5D, 0x49, 0x4F,
+ 0x6E, 0x7C, 0x64, 0x4D, 0x4F, 0x6B, 0x80, 0x6E,
+ 0x4E, 0x4A, 0x66, 0x7C, 0x7A, 0x6B, 0x54, 0x48,
+ 0x4A, 0x57, 0x5D, 0x57, 0x58, 0x6C, 0x80, 0x78,
+ 0x5A, 0x55, 0x5D, 0x79, 0x81, 0x6A, 0x53, 0x59,
+ 0x72, 0x7E, 0x7F, 0x80, 0x7E, 0x73, 0x6F, 0x76,
+ 0x80, 0x7F, 0x75, 0x72, 0x70, 0x7A, 0x7F, 0x78,
+ 0x72, 0x72, 0x77, 0x80, 0x7D, 0x75, 0x6F, 0x79,
+ 0x7F, 0x80, 0x80, 0x79, 0x72, 0x6D, 0x79, 0x7E,
+ 0x7C, 0x7A, 0x7D, 0x81, 0x80, 0x77, 0x78, 0x7D,
+ 0x7F, 0x7F, 0x7D, 0x7C, 0x7B, 0x7C, 0x7E, 0x81,
+ 0x7F, 0x80, 0x7F, 0x80, 0x81, 0x81, 0x81, 0x81,
+ 0x80, 0x80, 0x81, 0x81, 0x81, 0x81, 0x7E, 0x80,
+ 0x80, 0x81, 0x81, 0x80, 0x7F, 0x80, 0x7C, 0x7B,
+ 0x7F, 0x80, 0x81, 0x7E, 0x81, 0x7F, 0x7D, 0x7E,
+ 0x7C, 0x7F, 0x80, 0x81, 0x7F, 0x7E, 0x7C, 0x7F,
+ 0x81, 0x80, 0x7C, 0x7B, 0x7D, 0x7E, 0x7E, 0x7F,
+ 0x80, 0x7E, 0x7E, 0x7F, 0x81, 0x7E, 0x80, 0x7F,
+ 0x7F, 0x7D, 0x7C, 0x7F, 0x82, 0x7E, 0x7E, 0x7E,
+ 0x80, 0x7C, 0x7C, 0x7F, 0x80, 0x7D, 0x7D, 0x7F,
+ 0x7F, 0x82, 0x7F, 0x7E, 0x82, 0x7E, 0x7F, 0x80,
+ 0x7F, 0x7D, 0x80, 0x82, 0x80, 0x7C, 0x7E, 0x7F,
+ 0x7F, 0x81,
+};
+
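+/* V (Cr) chroma plane of the decoder logo, 4:2:0 planar layout */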
+const UWORD8 gau1_ihevcd_logo_420p_v[] =
+{
+ 0x7E, 0x80, 0x7D, 0x7E, 0x80, 0x81, 0x7E, 0x7C,
+ 0x80, 0x81, 0x7E, 0x7D, 0x80, 0x81, 0x7C, 0x81,
+ 0x81, 0x7F, 0x81, 0x7E, 0x80, 0x7F, 0x7C, 0x7F,
+ 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F,
+ 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x7C, 0x7B, 0x7F, 0x81, 0x7F, 0x7F, 0x81,
+ 0x82, 0x81, 0x80, 0x81, 0x7F, 0x7D, 0x7E, 0x86,
+ 0x87, 0x83, 0x81, 0x7F, 0x7F, 0x80, 0x80, 0x80,
+ 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80,
+ 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80,
+ 0x80, 0x7F, 0x81, 0x81, 0x8C, 0x8D, 0x89, 0x82,
+ 0x7F, 0x81, 0x7A, 0x7C, 0x7E, 0x7F, 0x7C, 0x7C,
+ 0x80, 0x7F, 0x7F, 0x8C, 0xAE, 0xAE, 0x94, 0x80,
+ 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80,
+ 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7E,
+ 0x82, 0xA8, 0xB5, 0xA5, 0x83, 0x7D, 0x84, 0x83,
+ 0x83, 0x81, 0x7F, 0x81, 0x83, 0x84, 0x80, 0x7C,
+ 0x94, 0xBC, 0xC0, 0x9E, 0x82, 0x80, 0x80, 0x7F,
+ 0x80, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80,
+ 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x82, 0xB3, 0xC2,
+ 0xAD, 0x83, 0x7F, 0x9C, 0xAD, 0xA5, 0x89, 0x81,
+ 0x96, 0xA8, 0xA3, 0x8C, 0x7D, 0x87, 0xAD, 0xB1,
+ 0x93, 0x82, 0x80, 0x7F, 0x7E, 0x7F, 0x80, 0x7F,
+ 0x80, 0x7F, 0x7F, 0x81, 0x80, 0x80, 0x81, 0x81,
+ 0x80, 0x81, 0x82, 0x7C, 0x7D, 0x82, 0x81, 0x80,
+ 0x7F, 0x7F, 0x83, 0xB2, 0xBF, 0xB0, 0x82, 0x7F,
+ 0xA1, 0xC0, 0xB7, 0x88, 0x7A, 0x9E, 0xC0, 0xB9,
+ 0x93, 0x80, 0x80, 0x86, 0x8B, 0x7E, 0x7E, 0x7F,
+ 0x7F, 0x81, 0x80, 0x7E, 0x7E, 0x7F, 0x81, 0x80,
+ 0x7F, 0x7D, 0x81, 0x81, 0x80, 0x7E, 0x7D, 0x7D,
+ 0x81, 0x7F, 0x7F, 0x7E, 0x7F, 0x80, 0x7F, 0x82,
+ 0xB2, 0xBE, 0xB1, 0x85, 0x8E, 0xAA, 0xC0, 0xBB,
+ 0x96, 0x8E, 0xA6, 0xBE, 0xB8, 0x9D, 0x8E, 0x8E,
+ 0x8C, 0x90, 0x84, 0x7C, 0x81, 0x87, 0x98, 0xA1,
+ 0x9A, 0x92, 0x91, 0x96, 0x97, 0x8D, 0x90, 0xA0,
+ 0x9E, 0x95, 0x82, 0x88, 0x96, 0xA1, 0x9C, 0x8B,
+ 0x81, 0x7E, 0x80, 0x7F, 0x83, 0xB2, 0xC0, 0xB0,
+ 0x8B, 0xB7, 0xBE, 0xBE, 0xBF, 0xBE, 0xBE, 0xBD,
+ 0xBF, 0xC2, 0xBF, 0xBA, 0xBE, 0xBE, 0xBD, 0x9B,
+ 0x82, 0x8A, 0xA7, 0xBB, 0xC3, 0xBF, 0xB9, 0xB8,
+ 0xB9, 0xBB, 0xB2, 0xB2, 0xBE, 0xC0, 0xBB, 0x9F,
+ 0xAD, 0xBA, 0xBD, 0xBD, 0xAF, 0x89, 0x7E, 0x7D,
+ 0x7F, 0x82, 0xB2, 0xC0, 0xB3, 0x8B, 0xB3, 0xBA,
+ 0xC0, 0xC0, 0xBB, 0xB9, 0xBB, 0xBF, 0xC0, 0xBC,
+ 0xB6, 0xBA, 0xBB, 0xBB, 0x9E, 0x84, 0xA2, 0xBE,
+ 0xBE, 0xB8, 0xB0, 0xB9, 0xBD, 0xBE, 0xBF, 0xB3,
+ 0xA4, 0xAC, 0xBA, 0xBF, 0xBC, 0xAA, 0xA4, 0xB8,
+ 0xC2, 0xB5, 0x8D, 0x80, 0x81, 0x7E, 0x83, 0xB1,
+ 0xBE, 0xB1, 0x82, 0x84, 0xAA, 0xC0, 0xBA, 0x8C,
+ 0x85, 0x9F, 0xBF, 0xBC, 0x95, 0x85, 0x97, 0xBD,
+ 0xBD, 0x9E, 0x8A, 0xB4, 0xC2, 0xB4, 0x8F, 0x83,
+ 0x89, 0xA8, 0xC1, 0xC1, 0xA1, 0x85, 0x8B, 0xB5,
+ 0xC1, 0xB5, 0x8A, 0x81, 0x9C, 0xBF, 0xBC, 0x8F,
+ 0x80, 0x7E, 0x7F, 0x82, 0xB1, 0xBF, 0xB1, 0x82,
+ 0x7F, 0xA9, 0xC0, 0xB8, 0x89, 0x83, 0x9D, 0xBF,
+ 0xBD, 0x93, 0x81, 0x94, 0xBE, 0xBA, 0x9C, 0x8F,
+ 0xB5, 0xBD, 0xAB, 0x85, 0x7C, 0x7E, 0x9F, 0xBF,
+ 0xBE, 0x9C, 0x81, 0x86, 0xB2, 0xBE, 0xB1, 0x86,
+ 0x80, 0x98, 0xBE, 0xBC, 0x91, 0x7F, 0x7E, 0x7F,
+ 0x82, 0xB1, 0xBF, 0xB1, 0x81, 0x81, 0xA9, 0xC0,
+ 0xB7, 0x89, 0x81, 0x9C, 0xBD, 0xBB, 0x91, 0x80,
+ 0x93, 0xBD, 0xBA, 0x9C, 0x8E, 0xB7, 0xC1, 0xB0,
+ 0x89, 0x7C, 0x82, 0xA3, 0xBE, 0xBC, 0x9A, 0x82,
+ 0x82, 0xAF, 0xBF, 0xAE, 0x86, 0x7E, 0x98, 0xBC,
+ 0xBD, 0x90, 0x7F, 0x7F, 0x7F, 0x82, 0xB1, 0xBE,
+ 0xB2, 0x81, 0x80, 0xA9, 0xBF, 0xB7, 0x89, 0x82,
+ 0x9C, 0xBC, 0xBA, 0x90, 0x80, 0x92, 0xBA, 0xC0,
+ 0x9F, 0x84, 0xA8, 0xC0, 0xBD, 0xA5, 0x9C, 0xA3,
+ 0xB5, 0xBF, 0xBB, 0x9C, 0x81, 0x84, 0xAE, 0xC0,
+ 0xB0, 0x85, 0x7F, 0x97, 0xBD, 0xBC, 0x91, 0x7F,
+ 0x7F, 0x7E, 0x82, 0xB3, 0xC1, 0xB1, 0x84, 0x7E,
+ 0xA7, 0xC0, 0xB7, 0x8B, 0x80, 0x9E, 0xBF, 0xBA,
+ 0x93, 0x7E, 0x93, 0xBB, 0xC2, 0x9F, 0x80, 0x92,
+ 0xB5, 0xC2, 0xC0, 0xBB, 0xBD, 0xBB, 0xBE, 0xBE,
+ 0x9D, 0x82, 0x85, 0xAC, 0xC1, 0xAF, 0x87, 0x80,
+ 0x9A, 0xBC, 0xB7, 0x90, 0x80, 0x80, 0x7E, 0x82,
+ 0xA6, 0xB1, 0xA4, 0x83, 0x7E, 0x9C, 0xAF, 0xA8,
+ 0x89, 0x7F, 0x98, 0xAF, 0xAB, 0x8F, 0x7D, 0x8E,
+ 0xAC, 0xB1, 0x95, 0x81, 0x80, 0x8C, 0xA6, 0xB5,
+ 0xB6, 0xA2, 0x9A, 0xA3, 0xA2, 0x8D, 0x7F, 0x84,
+ 0x96, 0xA8, 0x9B, 0x83, 0x7C, 0x90, 0xA5, 0x9F,
+ 0x8A, 0x80, 0x7E, 0x7F, 0x80, 0x88, 0x89, 0x85,
+ 0x7E, 0x7D, 0x85, 0x88, 0x88, 0x81, 0x7F, 0x86,
+ 0x89, 0x88, 0x84, 0x7D, 0x82, 0x86, 0x88, 0x84,
+ 0x7E, 0x7F, 0x80, 0x82, 0x88, 0x88, 0x83, 0x7E,
+ 0x80, 0x80, 0x7F, 0x7D, 0x7E, 0x81, 0x81, 0x82,
+ 0x7F, 0x7D, 0x81, 0x82, 0x81, 0x80, 0x7D, 0x7F,
+ 0x80, 0x80, 0x7F, 0x7D, 0x7C, 0x7E, 0x7E, 0x7C,
+ 0x7B, 0x7F, 0x7E, 0x7F, 0x80, 0x7D, 0x7C, 0x7F,
+ 0x80, 0x7F, 0x7E, 0x7E, 0x7F, 0x7E, 0x80, 0x80,
+ 0x7E, 0x7D, 0x81, 0x81, 0x7C, 0x81, 0x81, 0x7F,
+ 0x81, 0x81, 0x7F, 0x7D, 0x7F, 0x81, 0x81, 0x80,
+ 0x7F, 0x7F, 0x81, 0x82, 0x81, 0x7F, 0x81, 0x7F,
+ 0x7E, 0x81, 0x82, 0x7F, 0x7E, 0x81, 0x7F, 0x7F,
+ 0x80, 0x80, 0x81, 0x7D, 0x7C, 0x80, 0x81, 0x80,
+ 0x7D, 0x80, 0x80, 0x7C, 0x7F, 0x81, 0x80, 0x7F,
+ 0x7F, 0x7B, 0x7F, 0x7F, 0x7C, 0x7D, 0x81, 0x81,
+ 0x7D, 0x7D, 0x80, 0x81, 0x7E, 0x7E, 0x7F, 0x81,
+ 0x7F, 0x7B
+};
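+
+/* Interleaved UV (CbCr) chroma plane of the decoder logo, 4:2:0 semi-planar layout */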
+const UWORD8 gau1_ihevcd_logo_420sp_uv[] =
+{
+ 0x7F, 0x7E, 0x7D, 0x80, 0x7F, 0x7D, 0x80, 0x7E, 0x7D, 0x80, 0x7E, 0x81, 0x7D, 0x7E, 0x82, 0x7C,
+ 0x80, 0x80, 0x81, 0x81, 0x7F, 0x7E, 0x80, 0x7D, 0x80, 0x80, 0x7F, 0x81, 0x80, 0x7C, 0x7E, 0x81,
+ 0x7B, 0x81, 0x7D, 0x7F, 0x7D, 0x81, 0x81, 0x7E, 0x7E, 0x80, 0x7D, 0x7F, 0x80, 0x7C, 0x80, 0x7F,
+ 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F,
+ 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x81, 0x80, 0x81, 0x80, 0x80, 0x80,
+ 0x81, 0x80, 0x81, 0x7C, 0x81, 0x7B, 0x7F, 0x7F, 0x7D, 0x81, 0x7E, 0x7F, 0x81, 0x7F, 0x7D, 0x81,
+ 0x7B, 0x82, 0x7E, 0x81, 0x7F, 0x80, 0x7E, 0x81, 0x80, 0x7F, 0x80, 0x7D, 0x80, 0x7E, 0x78, 0x86,
+ 0x78, 0x87, 0x7C, 0x83, 0x80, 0x81, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x80,
+ 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x7F, 0x7F, 0x7D, 0x81, 0x7C, 0x81, 0x6E, 0x8C, 0x69, 0x8D, 0x70, 0x89, 0x7B, 0x82,
+ 0x7E, 0x7F, 0x7D, 0x81, 0x82, 0x7A, 0x82, 0x7C, 0x7F, 0x7E, 0x80, 0x7F, 0x80, 0x7C, 0x82, 0x7C,
+ 0x7E, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x71, 0x8C, 0x4B, 0xAE, 0x4A, 0xAE, 0x64, 0x94, 0x7A, 0x80,
+ 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80,
+ 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F,
+ 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7E,
+ 0x77, 0x82, 0x54, 0xA8, 0x43, 0xB5, 0x53, 0xA5, 0x76, 0x83, 0x7F, 0x7D, 0x75, 0x84, 0x75, 0x83,
+ 0x77, 0x83, 0x7E, 0x81, 0x80, 0x7F, 0x7A, 0x81, 0x78, 0x83, 0x74, 0x84, 0x7A, 0x80, 0x7E, 0x7C,
+ 0x66, 0x94, 0x39, 0xBC, 0x34, 0xC0, 0x57, 0x9E, 0x79, 0x82, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x7F,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80,
+ 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x79, 0x82, 0x49, 0xB3, 0x39, 0xC2,
+ 0x4A, 0xAD, 0x77, 0x83, 0x7D, 0x7F, 0x5C, 0x9C, 0x4F, 0xAD, 0x55, 0xA5, 0x6F, 0x89, 0x79, 0x81,
+ 0x64, 0x96, 0x52, 0xA8, 0x52, 0xA3, 0x6B, 0x8C, 0x80, 0x7D, 0x70, 0x87, 0x4F, 0xAD, 0x47, 0xB1,
+ 0x6A, 0x93, 0x7B, 0x82, 0x7E, 0x80, 0x7E, 0x7F, 0x80, 0x7E, 0x7F, 0x7F, 0x7D, 0x80, 0x7F, 0x7F,
+ 0x7E, 0x80, 0x7F, 0x7F, 0x7D, 0x7F, 0x7E, 0x81, 0x7E, 0x80, 0x7D, 0x80, 0x7E, 0x81, 0x80, 0x81,
+ 0x80, 0x80, 0x7F, 0x81, 0x7E, 0x82, 0x81, 0x7C, 0x80, 0x7D, 0x7D, 0x82, 0x7D, 0x81, 0x80, 0x80,
+ 0x7F, 0x7F, 0x7E, 0x7F, 0x7A, 0x83, 0x48, 0xB2, 0x3A, 0xBF, 0x45, 0xB0, 0x78, 0x82, 0x7E, 0x7F,
+ 0x55, 0xA1, 0x3B, 0xC0, 0x42, 0xB7, 0x6D, 0x88, 0x7F, 0x7A, 0x5C, 0x9E, 0x3D, 0xC0, 0x40, 0xB9,
+ 0x64, 0x93, 0x80, 0x80, 0x7E, 0x80, 0x76, 0x86, 0x6C, 0x8B, 0x7F, 0x7E, 0x7D, 0x7E, 0x80, 0x7F,
+ 0x7E, 0x7F, 0x7A, 0x81, 0x7B, 0x80, 0x7C, 0x7E, 0x80, 0x7E, 0x80, 0x7F, 0x80, 0x81, 0x7E, 0x80,
+ 0x80, 0x7F, 0x7F, 0x7D, 0x79, 0x81, 0x7B, 0x81, 0x80, 0x80, 0x80, 0x7E, 0x80, 0x7D, 0x80, 0x7D,
+ 0x7A, 0x81, 0x79, 0x7F, 0x7F, 0x7F, 0x81, 0x7E, 0x80, 0x7F, 0x7F, 0x80, 0x7E, 0x7F, 0x7A, 0x82,
+ 0x48, 0xB2, 0x3A, 0xBE, 0x45, 0xB1, 0x74, 0x85, 0x6D, 0x8E, 0x4E, 0xAA, 0x3B, 0xC0, 0x3F, 0xBB,
+ 0x5F, 0x96, 0x6B, 0x8E, 0x51, 0xA6, 0x3B, 0xBE, 0x3E, 0xB8, 0x58, 0x9D, 0x6B, 0x8E, 0x6A, 0x8E,
+ 0x6C, 0x8C, 0x68, 0x90, 0x76, 0x84, 0x81, 0x7C, 0x7F, 0x81, 0x74, 0x87, 0x60, 0x98, 0x57, 0xA1,
+ 0x5C, 0x9A, 0x66, 0x92, 0x67, 0x91, 0x65, 0x96, 0x63, 0x97, 0x69, 0x8D, 0x65, 0x90, 0x59, 0xA0,
+ 0x5A, 0x9E, 0x65, 0x95, 0x75, 0x82, 0x6F, 0x88, 0x62, 0x96, 0x57, 0xA1, 0x5B, 0x9C, 0x6D, 0x8B,
+ 0x7A, 0x81, 0x81, 0x7E, 0x7D, 0x80, 0x7F, 0x7F, 0x79, 0x83, 0x49, 0xB2, 0x3A, 0xC0, 0x48, 0xB0,
+ 0x6D, 0x8B, 0x44, 0xB7, 0x3E, 0xBE, 0x3E, 0xBE, 0x3D, 0xBF, 0x3C, 0xBE, 0x3C, 0xBE, 0x3B, 0xBD,
+ 0x3A, 0xBF, 0x3B, 0xC2, 0x3F, 0xBF, 0x3F, 0xBA, 0x3E, 0xBE, 0x3A, 0xBE, 0x3D, 0xBD, 0x5B, 0x9B,
+ 0x7C, 0x82, 0x6E, 0x8A, 0x54, 0xA7, 0x3E, 0xBB, 0x34, 0xC3, 0x39, 0xBF, 0x3D, 0xB9, 0x3D, 0xB8,
+ 0x3D, 0xB9, 0x3C, 0xBB, 0x43, 0xB2, 0x44, 0xB2, 0x3B, 0xBE, 0x38, 0xC0, 0x3D, 0xBB, 0x56, 0x9F,
+ 0x49, 0xAD, 0x3B, 0xBA, 0x3D, 0xBD, 0x3B, 0xBD, 0x49, 0xAF, 0x6E, 0x89, 0x7F, 0x7E, 0x80, 0x7D,
+ 0x7E, 0x7F, 0x79, 0x82, 0x49, 0xB2, 0x3A, 0xC0, 0x45, 0xB3, 0x6B, 0x8B, 0x47, 0xB3, 0x43, 0xBA,
+ 0x3A, 0xC0, 0x3C, 0xC0, 0x40, 0xBB, 0x42, 0xB9, 0x3D, 0xBB, 0x39, 0xBF, 0x3D, 0xC0, 0x40, 0xBC,
+ 0x40, 0xB6, 0x40, 0xBA, 0x3F, 0xBB, 0x3F, 0xBB, 0x54, 0x9E, 0x7A, 0x84, 0x56, 0xA2, 0x3B, 0xBE,
+ 0x3B, 0xBE, 0x42, 0xB8, 0x48, 0xB0, 0x42, 0xB9, 0x3F, 0xBD, 0x3D, 0xBE, 0x3C, 0xBF, 0x45, 0xB3,
+ 0x51, 0xA4, 0x4D, 0xAC, 0x41, 0xBA, 0x3B, 0xBF, 0x3B, 0xBC, 0x4B, 0xAA, 0x52, 0xA4, 0x43, 0xB8,
+ 0x35, 0xC2, 0x45, 0xB5, 0x6D, 0x8D, 0x7E, 0x80, 0x7E, 0x81, 0x7E, 0x7E, 0x7A, 0x83, 0x49, 0xB1,
+ 0x3A, 0xBE, 0x46, 0xB1, 0x77, 0x82, 0x76, 0x84, 0x4E, 0xAA, 0x3B, 0xC0, 0x42, 0xBA, 0x62, 0x8C,
+ 0x77, 0x85, 0x56, 0x9F, 0x3A, 0xBF, 0x3E, 0xBC, 0x5F, 0x95, 0x75, 0x85, 0x5F, 0x97, 0x3C, 0xBD,
+ 0x3B, 0xBD, 0x59, 0x9E, 0x6B, 0x8A, 0x46, 0xB4, 0x39, 0xC2, 0x46, 0xB4, 0x66, 0x8F, 0x75, 0x83,
+ 0x6D, 0x89, 0x51, 0xA8, 0x3D, 0xC1, 0x3C, 0xC1, 0x58, 0xA1, 0x7A, 0x85, 0x6E, 0x8B, 0x44, 0xB5,
+ 0x39, 0xC1, 0x43, 0xB5, 0x6F, 0x8A, 0x77, 0x81, 0x5B, 0x9C, 0x3B, 0xBF, 0x3C, 0xBC, 0x66, 0x8F,
+ 0x7F, 0x80, 0x7F, 0x7E, 0x7E, 0x7F, 0x7A, 0x82, 0x48, 0xB1, 0x3A, 0xBF, 0x45, 0xB1, 0x7A, 0x82,
+ 0x7B, 0x7F, 0x4E, 0xA9, 0x3C, 0xC0, 0x44, 0xB8, 0x68, 0x89, 0x7C, 0x83, 0x5C, 0x9D, 0x3B, 0xBF,
+ 0x40, 0xBD, 0x63, 0x93, 0x7D, 0x81, 0x65, 0x94, 0x3B, 0xBE, 0x3E, 0xBA, 0x5B, 0x9C, 0x65, 0x8F,
+ 0x3F, 0xB5, 0x3A, 0xBD, 0x4D, 0xAB, 0x72, 0x85, 0x81, 0x7C, 0x7C, 0x7E, 0x5A, 0x9F, 0x3C, 0xBF,
+ 0x3D, 0xBE, 0x5C, 0x9C, 0x7E, 0x81, 0x71, 0x86, 0x46, 0xB2, 0x3B, 0xBE, 0x46, 0xB1, 0x75, 0x86,
+ 0x7C, 0x80, 0x61, 0x98, 0x3C, 0xBE, 0x3C, 0xBC, 0x66, 0x91, 0x7F, 0x7F, 0x7F, 0x7E, 0x7E, 0x7F,
+ 0x7A, 0x82, 0x49, 0xB1, 0x3B, 0xBF, 0x47, 0xB1, 0x7B, 0x81, 0x7B, 0x81, 0x50, 0xA9, 0x3D, 0xC0,
+ 0x46, 0xB7, 0x69, 0x89, 0x7F, 0x81, 0x5D, 0x9C, 0x3E, 0xBD, 0x40, 0xBB, 0x64, 0x91, 0x7F, 0x80,
+ 0x67, 0x93, 0x3D, 0xBD, 0x3D, 0xBA, 0x5B, 0x9C, 0x69, 0x8E, 0x43, 0xB7, 0x3D, 0xC1, 0x4B, 0xB0,
+ 0x6D, 0x89, 0x7C, 0x7C, 0x78, 0x82, 0x58, 0xA3, 0x3D, 0xBE, 0x3E, 0xBC, 0x5E, 0x9A, 0x7F, 0x82,
+ 0x73, 0x82, 0x48, 0xAF, 0x3E, 0xBF, 0x4B, 0xAE, 0x76, 0x86, 0x7E, 0x7E, 0x63, 0x98, 0x3D, 0xBC,
+ 0x3C, 0xBD, 0x67, 0x90, 0x7F, 0x7F, 0x7F, 0x7F, 0x7E, 0x7F, 0x79, 0x82, 0x48, 0xB1, 0x3A, 0xBE,
+ 0x46, 0xB2, 0x7B, 0x81, 0x7C, 0x80, 0x51, 0xA9, 0x3E, 0xBF, 0x45, 0xB7, 0x68, 0x89, 0x7D, 0x82,
+ 0x5D, 0x9C, 0x3B, 0xBC, 0x3F, 0xBA, 0x64, 0x90, 0x7E, 0x80, 0x68, 0x92, 0x3E, 0xBA, 0x38, 0xC0,
+ 0x59, 0x9F, 0x74, 0x84, 0x51, 0xA8, 0x3D, 0xC0, 0x3F, 0xBD, 0x50, 0xA5, 0x5A, 0x9C, 0x53, 0xA3,
+ 0x43, 0xB5, 0x3C, 0xBF, 0x3F, 0xBB, 0x5D, 0x9C, 0x7F, 0x81, 0x72, 0x84, 0x48, 0xAE, 0x3C, 0xC0,
+ 0x4A, 0xB0, 0x77, 0x85, 0x7E, 0x7F, 0x62, 0x97, 0x3E, 0xBD, 0x3C, 0xBC, 0x67, 0x91, 0x80, 0x7F,
+ 0x80, 0x7F, 0x7F, 0x7E, 0x79, 0x82, 0x4A, 0xB3, 0x38, 0xC1, 0x49, 0xB1, 0x79, 0x84, 0x7C, 0x7E,
+ 0x51, 0xA7, 0x3A, 0xC0, 0x40, 0xB7, 0x67, 0x8B, 0x7B, 0x80, 0x5A, 0x9E, 0x39, 0xBF, 0x3C, 0xBA,
+ 0x63, 0x93, 0x7E, 0x7E, 0x66, 0x93, 0x3D, 0xBB, 0x37, 0xC2, 0x5A, 0x9F, 0x7B, 0x80, 0x65, 0x92,
+ 0x47, 0xB5, 0x38, 0xC2, 0x39, 0xC0, 0x3F, 0xBB, 0x3B, 0xBD, 0x3B, 0xBB, 0x3B, 0xBE, 0x3D, 0xBE,
+ 0x5A, 0x9D, 0x7E, 0x82, 0x71, 0x85, 0x46, 0xAC, 0x3B, 0xC1, 0x47, 0xAF, 0x74, 0x87, 0x7E, 0x80,
+ 0x5E, 0x9A, 0x3A, 0xBC, 0x3E, 0xB7, 0x68, 0x90, 0x7F, 0x80, 0x7E, 0x80, 0x80, 0x7E, 0x7A, 0x82,
+ 0x58, 0xA6, 0x4A, 0xB1, 0x5A, 0xA4, 0x79, 0x83, 0x7E, 0x7E, 0x5D, 0x9C, 0x49, 0xAF, 0x4F, 0xA8,
+ 0x6E, 0x89, 0x7C, 0x7F, 0x64, 0x98, 0x4D, 0xAF, 0x4F, 0xAB, 0x6B, 0x8F, 0x80, 0x7D, 0x6E, 0x8E,
+ 0x4E, 0xAC, 0x4A, 0xB1, 0x66, 0x95, 0x7C, 0x81, 0x7A, 0x80, 0x6B, 0x8C, 0x54, 0xA6, 0x48, 0xB5,
+ 0x4A, 0xB6, 0x57, 0xA2, 0x5D, 0x9A, 0x57, 0xA3, 0x58, 0xA2, 0x6C, 0x8D, 0x80, 0x7F, 0x78, 0x84,
+ 0x5A, 0x96, 0x55, 0xA8, 0x5D, 0x9B, 0x79, 0x83, 0x81, 0x7C, 0x6A, 0x90, 0x53, 0xA5, 0x59, 0x9F,
+ 0x72, 0x8A, 0x7E, 0x80, 0x7F, 0x7E, 0x80, 0x7F, 0x7E, 0x80, 0x73, 0x88, 0x6F, 0x89, 0x76, 0x85,
+ 0x80, 0x7E, 0x7F, 0x7D, 0x75, 0x85, 0x72, 0x88, 0x70, 0x88, 0x7A, 0x81, 0x7F, 0x7F, 0x78, 0x86,
+ 0x72, 0x89, 0x72, 0x88, 0x77, 0x84, 0x80, 0x7D, 0x7D, 0x82, 0x75, 0x86, 0x6F, 0x88, 0x79, 0x84,
+ 0x7F, 0x7E, 0x80, 0x7F, 0x80, 0x80, 0x79, 0x82, 0x72, 0x88, 0x6D, 0x88, 0x79, 0x83, 0x7E, 0x7E,
+ 0x7C, 0x80, 0x7A, 0x80, 0x7D, 0x7F, 0x81, 0x7D, 0x80, 0x7E, 0x77, 0x81, 0x78, 0x81, 0x7D, 0x82,
+ 0x7F, 0x7F, 0x7F, 0x7D, 0x7D, 0x81, 0x7C, 0x82, 0x7B, 0x81, 0x7C, 0x80, 0x7E, 0x7D, 0x81, 0x7F,
+ 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7D, 0x81, 0x7C, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7C,
+ 0x80, 0x7B, 0x80, 0x7F, 0x81, 0x7E, 0x81, 0x7F, 0x81, 0x80, 0x81, 0x7D, 0x7E, 0x7C, 0x80, 0x7F,
+ 0x80, 0x80, 0x81, 0x7F, 0x81, 0x7E, 0x80, 0x7E, 0x7F, 0x7F, 0x80, 0x7E, 0x7C, 0x80, 0x7B, 0x80,
+ 0x7F, 0x7E, 0x80, 0x7D, 0x81, 0x81, 0x7E, 0x81, 0x81, 0x7C, 0x7F, 0x81, 0x7D, 0x81, 0x7E, 0x7F,
+ 0x7C, 0x81, 0x7F, 0x81, 0x80, 0x7F, 0x81, 0x7D, 0x7F, 0x7F, 0x7E, 0x81, 0x7C, 0x81, 0x7F, 0x80,
+ 0x81, 0x7F, 0x80, 0x7F, 0x7C, 0x81, 0x7B, 0x82, 0x7D, 0x81, 0x7E, 0x7F, 0x7E, 0x81, 0x7F, 0x7F,
+ 0x80, 0x7E, 0x7E, 0x81, 0x7E, 0x82, 0x7F, 0x7F, 0x81, 0x7E, 0x7E, 0x81, 0x80, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x80, 0x7D, 0x80, 0x7C, 0x81, 0x7F, 0x7D, 0x82, 0x7C, 0x7E, 0x80, 0x7E, 0x81, 0x7E, 0x80,
+ 0x80, 0x7D, 0x7C, 0x80, 0x7C, 0x80, 0x7F, 0x7C, 0x80, 0x7F, 0x7D, 0x81, 0x7D, 0x80, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x82, 0x7B, 0x7F, 0x7F, 0x7E, 0x7F, 0x82, 0x7C, 0x7E, 0x7D, 0x7F, 0x81, 0x80, 0x81,
+ 0x7F, 0x7D, 0x7D, 0x7D, 0x80, 0x80, 0x82, 0x81, 0x80, 0x7E, 0x7C, 0x7E, 0x7E, 0x7F, 0x7F, 0x81,
+ 0x7F, 0x7F, 0x81, 0x7B
+};
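+
+/* Interleaved VU (CrCb) chroma plane of the decoder logo, 4:2:0 semi-planar layout */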
+const UWORD8 gau1_ihevcd_logo_420sp_vu[] =
+{
+ 0x7E, 0x7F, 0x80, 0x7D, 0x7D, 0x7F, 0x7E, 0x80, 0x80, 0x7D, 0x81, 0x7E, 0x7E, 0x7D, 0x7C, 0x82,
+ 0x80, 0x80, 0x81, 0x81, 0x7E, 0x7F, 0x7D, 0x80, 0x80, 0x80, 0x81, 0x7F, 0x7C, 0x80, 0x81, 0x7E,
+ 0x81, 0x7B, 0x7F, 0x7D, 0x81, 0x7D, 0x7E, 0x81, 0x80, 0x7E, 0x7F, 0x7D, 0x7C, 0x80, 0x7F, 0x80,
+ 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80,
+ 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x81, 0x80, 0x81, 0x80, 0x80,
+ 0x80, 0x81, 0x7C, 0x81, 0x7B, 0x81, 0x7F, 0x7F, 0x81, 0x7D, 0x7F, 0x7E, 0x7F, 0x81, 0x81, 0x7D,
+ 0x82, 0x7B, 0x81, 0x7E, 0x80, 0x7F, 0x81, 0x7E, 0x7F, 0x80, 0x7D, 0x80, 0x7E, 0x80, 0x86, 0x78,
+ 0x87, 0x78, 0x83, 0x7C, 0x81, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7F,
+ 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x7F, 0x7F, 0x81, 0x7D, 0x81, 0x7C, 0x8C, 0x6E, 0x8D, 0x69, 0x89, 0x70, 0x82, 0x7B,
+ 0x7F, 0x7E, 0x81, 0x7D, 0x7A, 0x82, 0x7C, 0x82, 0x7E, 0x7F, 0x7F, 0x80, 0x7C, 0x80, 0x7C, 0x82,
+ 0x80, 0x7E, 0x7F, 0x80, 0x7F, 0x80, 0x8C, 0x71, 0xAE, 0x4B, 0xAE, 0x4A, 0x94, 0x64, 0x80, 0x7A,
+ 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x80, 0x80, 0x80,
+ 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x7F, 0x80,
+ 0x7F, 0x80, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x7E, 0x7F,
+ 0x82, 0x77, 0xA8, 0x54, 0xB5, 0x43, 0xA5, 0x53, 0x83, 0x76, 0x7D, 0x7F, 0x84, 0x75, 0x83, 0x75,
+ 0x83, 0x77, 0x81, 0x7E, 0x7F, 0x80, 0x81, 0x7A, 0x83, 0x78, 0x84, 0x74, 0x80, 0x7A, 0x7C, 0x7E,
+ 0x94, 0x66, 0xBC, 0x39, 0xC0, 0x34, 0x9E, 0x57, 0x82, 0x79, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7F, 0x7F, 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F,
+ 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x7F,
+ 0x7F, 0x80, 0x7F, 0x7F, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7F, 0x82, 0x79, 0xB3, 0x49, 0xC2, 0x39,
+ 0xAD, 0x4A, 0x83, 0x77, 0x7F, 0x7D, 0x9C, 0x5C, 0xAD, 0x4F, 0xA5, 0x55, 0x89, 0x6F, 0x81, 0x79,
+ 0x96, 0x64, 0xA8, 0x52, 0xA3, 0x52, 0x8C, 0x6B, 0x7D, 0x80, 0x87, 0x70, 0xAD, 0x4F, 0xB1, 0x47,
+ 0x93, 0x6A, 0x82, 0x7B, 0x80, 0x7E, 0x7F, 0x7E, 0x7E, 0x80, 0x7F, 0x7F, 0x80, 0x7D, 0x7F, 0x7F,
+ 0x80, 0x7E, 0x7F, 0x7F, 0x7F, 0x7D, 0x81, 0x7E, 0x80, 0x7E, 0x80, 0x7D, 0x81, 0x7E, 0x81, 0x80,
+ 0x80, 0x80, 0x81, 0x7F, 0x82, 0x7E, 0x7C, 0x81, 0x7D, 0x80, 0x82, 0x7D, 0x81, 0x7D, 0x80, 0x80,
+ 0x7F, 0x7F, 0x7F, 0x7E, 0x83, 0x7A, 0xB2, 0x48, 0xBF, 0x3A, 0xB0, 0x45, 0x82, 0x78, 0x7F, 0x7E,
+ 0xA1, 0x55, 0xC0, 0x3B, 0xB7, 0x42, 0x88, 0x6D, 0x7A, 0x7F, 0x9E, 0x5C, 0xC0, 0x3D, 0xB9, 0x40,
+ 0x93, 0x64, 0x80, 0x80, 0x80, 0x7E, 0x86, 0x76, 0x8B, 0x6C, 0x7E, 0x7F, 0x7E, 0x7D, 0x7F, 0x80,
+ 0x7F, 0x7E, 0x81, 0x7A, 0x80, 0x7B, 0x7E, 0x7C, 0x7E, 0x80, 0x7F, 0x80, 0x81, 0x80, 0x80, 0x7E,
+ 0x7F, 0x80, 0x7D, 0x7F, 0x81, 0x79, 0x81, 0x7B, 0x80, 0x80, 0x7E, 0x80, 0x7D, 0x80, 0x7D, 0x80,
+ 0x81, 0x7A, 0x7F, 0x79, 0x7F, 0x7F, 0x7E, 0x81, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7E, 0x82, 0x7A,
+ 0xB2, 0x48, 0xBE, 0x3A, 0xB1, 0x45, 0x85, 0x74, 0x8E, 0x6D, 0xAA, 0x4E, 0xC0, 0x3B, 0xBB, 0x3F,
+ 0x96, 0x5F, 0x8E, 0x6B, 0xA6, 0x51, 0xBE, 0x3B, 0xB8, 0x3E, 0x9D, 0x58, 0x8E, 0x6B, 0x8E, 0x6A,
+ 0x8C, 0x6C, 0x90, 0x68, 0x84, 0x76, 0x7C, 0x81, 0x81, 0x7F, 0x87, 0x74, 0x98, 0x60, 0xA1, 0x57,
+ 0x9A, 0x5C, 0x92, 0x66, 0x91, 0x67, 0x96, 0x65, 0x97, 0x63, 0x8D, 0x69, 0x90, 0x65, 0xA0, 0x59,
+ 0x9E, 0x5A, 0x95, 0x65, 0x82, 0x75, 0x88, 0x6F, 0x96, 0x62, 0xA1, 0x57, 0x9C, 0x5B, 0x8B, 0x6D,
+ 0x81, 0x7A, 0x7E, 0x81, 0x80, 0x7D, 0x7F, 0x7F, 0x83, 0x79, 0xB2, 0x49, 0xC0, 0x3A, 0xB0, 0x48,
+ 0x8B, 0x6D, 0xB7, 0x44, 0xBE, 0x3E, 0xBE, 0x3E, 0xBF, 0x3D, 0xBE, 0x3C, 0xBE, 0x3C, 0xBD, 0x3B,
+ 0xBF, 0x3A, 0xC2, 0x3B, 0xBF, 0x3F, 0xBA, 0x3F, 0xBE, 0x3E, 0xBE, 0x3A, 0xBD, 0x3D, 0x9B, 0x5B,
+ 0x82, 0x7C, 0x8A, 0x6E, 0xA7, 0x54, 0xBB, 0x3E, 0xC3, 0x34, 0xBF, 0x39, 0xB9, 0x3D, 0xB8, 0x3D,
+ 0xB9, 0x3D, 0xBB, 0x3C, 0xB2, 0x43, 0xB2, 0x44, 0xBE, 0x3B, 0xC0, 0x38, 0xBB, 0x3D, 0x9F, 0x56,
+ 0xAD, 0x49, 0xBA, 0x3B, 0xBD, 0x3D, 0xBD, 0x3B, 0xAF, 0x49, 0x89, 0x6E, 0x7E, 0x7F, 0x7D, 0x80,
+ 0x7F, 0x7E, 0x82, 0x79, 0xB2, 0x49, 0xC0, 0x3A, 0xB3, 0x45, 0x8B, 0x6B, 0xB3, 0x47, 0xBA, 0x43,
+ 0xC0, 0x3A, 0xC0, 0x3C, 0xBB, 0x40, 0xB9, 0x42, 0xBB, 0x3D, 0xBF, 0x39, 0xC0, 0x3D, 0xBC, 0x40,
+ 0xB6, 0x40, 0xBA, 0x40, 0xBB, 0x3F, 0xBB, 0x3F, 0x9E, 0x54, 0x84, 0x7A, 0xA2, 0x56, 0xBE, 0x3B,
+ 0xBE, 0x3B, 0xB8, 0x42, 0xB0, 0x48, 0xB9, 0x42, 0xBD, 0x3F, 0xBE, 0x3D, 0xBF, 0x3C, 0xB3, 0x45,
+ 0xA4, 0x51, 0xAC, 0x4D, 0xBA, 0x41, 0xBF, 0x3B, 0xBC, 0x3B, 0xAA, 0x4B, 0xA4, 0x52, 0xB8, 0x43,
+ 0xC2, 0x35, 0xB5, 0x45, 0x8D, 0x6D, 0x80, 0x7E, 0x81, 0x7E, 0x7E, 0x7E, 0x83, 0x7A, 0xB1, 0x49,
+ 0xBE, 0x3A, 0xB1, 0x46, 0x82, 0x77, 0x84, 0x76, 0xAA, 0x4E, 0xC0, 0x3B, 0xBA, 0x42, 0x8C, 0x62,
+ 0x85, 0x77, 0x9F, 0x56, 0xBF, 0x3A, 0xBC, 0x3E, 0x95, 0x5F, 0x85, 0x75, 0x97, 0x5F, 0xBD, 0x3C,
+ 0xBD, 0x3B, 0x9E, 0x59, 0x8A, 0x6B, 0xB4, 0x46, 0xC2, 0x39, 0xB4, 0x46, 0x8F, 0x66, 0x83, 0x75,
+ 0x89, 0x6D, 0xA8, 0x51, 0xC1, 0x3D, 0xC1, 0x3C, 0xA1, 0x58, 0x85, 0x7A, 0x8B, 0x6E, 0xB5, 0x44,
+ 0xC1, 0x39, 0xB5, 0x43, 0x8A, 0x6F, 0x81, 0x77, 0x9C, 0x5B, 0xBF, 0x3B, 0xBC, 0x3C, 0x8F, 0x66,
+ 0x80, 0x7F, 0x7E, 0x7F, 0x7F, 0x7E, 0x82, 0x7A, 0xB1, 0x48, 0xBF, 0x3A, 0xB1, 0x45, 0x82, 0x7A,
+ 0x7F, 0x7B, 0xA9, 0x4E, 0xC0, 0x3C, 0xB8, 0x44, 0x89, 0x68, 0x83, 0x7C, 0x9D, 0x5C, 0xBF, 0x3B,
+ 0xBD, 0x40, 0x93, 0x63, 0x81, 0x7D, 0x94, 0x65, 0xBE, 0x3B, 0xBA, 0x3E, 0x9C, 0x5B, 0x8F, 0x65,
+ 0xB5, 0x3F, 0xBD, 0x3A, 0xAB, 0x4D, 0x85, 0x72, 0x7C, 0x81, 0x7E, 0x7C, 0x9F, 0x5A, 0xBF, 0x3C,
+ 0xBE, 0x3D, 0x9C, 0x5C, 0x81, 0x7E, 0x86, 0x71, 0xB2, 0x46, 0xBE, 0x3B, 0xB1, 0x46, 0x86, 0x75,
+ 0x80, 0x7C, 0x98, 0x61, 0xBE, 0x3C, 0xBC, 0x3C, 0x91, 0x66, 0x7F, 0x7F, 0x7E, 0x7F, 0x7F, 0x7E,
+ 0x82, 0x7A, 0xB1, 0x49, 0xBF, 0x3B, 0xB1, 0x47, 0x81, 0x7B, 0x81, 0x7B, 0xA9, 0x50, 0xC0, 0x3D,
+ 0xB7, 0x46, 0x89, 0x69, 0x81, 0x7F, 0x9C, 0x5D, 0xBD, 0x3E, 0xBB, 0x40, 0x91, 0x64, 0x80, 0x7F,
+ 0x93, 0x67, 0xBD, 0x3D, 0xBA, 0x3D, 0x9C, 0x5B, 0x8E, 0x69, 0xB7, 0x43, 0xC1, 0x3D, 0xB0, 0x4B,
+ 0x89, 0x6D, 0x7C, 0x7C, 0x82, 0x78, 0xA3, 0x58, 0xBE, 0x3D, 0xBC, 0x3E, 0x9A, 0x5E, 0x82, 0x7F,
+ 0x82, 0x73, 0xAF, 0x48, 0xBF, 0x3E, 0xAE, 0x4B, 0x86, 0x76, 0x7E, 0x7E, 0x98, 0x63, 0xBC, 0x3D,
+ 0xBD, 0x3C, 0x90, 0x67, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7E, 0x82, 0x79, 0xB1, 0x48, 0xBE, 0x3A,
+ 0xB2, 0x46, 0x81, 0x7B, 0x80, 0x7C, 0xA9, 0x51, 0xBF, 0x3E, 0xB7, 0x45, 0x89, 0x68, 0x82, 0x7D,
+ 0x9C, 0x5D, 0xBC, 0x3B, 0xBA, 0x3F, 0x90, 0x64, 0x80, 0x7E, 0x92, 0x68, 0xBA, 0x3E, 0xC0, 0x38,
+ 0x9F, 0x59, 0x84, 0x74, 0xA8, 0x51, 0xC0, 0x3D, 0xBD, 0x3F, 0xA5, 0x50, 0x9C, 0x5A, 0xA3, 0x53,
+ 0xB5, 0x43, 0xBF, 0x3C, 0xBB, 0x3F, 0x9C, 0x5D, 0x81, 0x7F, 0x84, 0x72, 0xAE, 0x48, 0xC0, 0x3C,
+ 0xB0, 0x4A, 0x85, 0x77, 0x7F, 0x7E, 0x97, 0x62, 0xBD, 0x3E, 0xBC, 0x3C, 0x91, 0x67, 0x7F, 0x80,
+ 0x7F, 0x80, 0x7E, 0x7F, 0x82, 0x79, 0xB3, 0x4A, 0xC1, 0x38, 0xB1, 0x49, 0x84, 0x79, 0x7E, 0x7C,
+ 0xA7, 0x51, 0xC0, 0x3A, 0xB7, 0x40, 0x8B, 0x67, 0x80, 0x7B, 0x9E, 0x5A, 0xBF, 0x39, 0xBA, 0x3C,
+ 0x93, 0x63, 0x7E, 0x7E, 0x93, 0x66, 0xBB, 0x3D, 0xC2, 0x37, 0x9F, 0x5A, 0x80, 0x7B, 0x92, 0x65,
+ 0xB5, 0x47, 0xC2, 0x38, 0xC0, 0x39, 0xBB, 0x3F, 0xBD, 0x3B, 0xBB, 0x3B, 0xBE, 0x3B, 0xBE, 0x3D,
+ 0x9D, 0x5A, 0x82, 0x7E, 0x85, 0x71, 0xAC, 0x46, 0xC1, 0x3B, 0xAF, 0x47, 0x87, 0x74, 0x80, 0x7E,
+ 0x9A, 0x5E, 0xBC, 0x3A, 0xB7, 0x3E, 0x90, 0x68, 0x80, 0x7F, 0x80, 0x7E, 0x7E, 0x80, 0x82, 0x7A,
+ 0xA6, 0x58, 0xB1, 0x4A, 0xA4, 0x5A, 0x83, 0x79, 0x7E, 0x7E, 0x9C, 0x5D, 0xAF, 0x49, 0xA8, 0x4F,
+ 0x89, 0x6E, 0x7F, 0x7C, 0x98, 0x64, 0xAF, 0x4D, 0xAB, 0x4F, 0x8F, 0x6B, 0x7D, 0x80, 0x8E, 0x6E,
+ 0xAC, 0x4E, 0xB1, 0x4A, 0x95, 0x66, 0x81, 0x7C, 0x80, 0x7A, 0x8C, 0x6B, 0xA6, 0x54, 0xB5, 0x48,
+ 0xB6, 0x4A, 0xA2, 0x57, 0x9A, 0x5D, 0xA3, 0x57, 0xA2, 0x58, 0x8D, 0x6C, 0x7F, 0x80, 0x84, 0x78,
+ 0x96, 0x5A, 0xA8, 0x55, 0x9B, 0x5D, 0x83, 0x79, 0x7C, 0x81, 0x90, 0x6A, 0xA5, 0x53, 0x9F, 0x59,
+ 0x8A, 0x72, 0x80, 0x7E, 0x7E, 0x7F, 0x7F, 0x80, 0x80, 0x7E, 0x88, 0x73, 0x89, 0x6F, 0x85, 0x76,
+ 0x7E, 0x80, 0x7D, 0x7F, 0x85, 0x75, 0x88, 0x72, 0x88, 0x70, 0x81, 0x7A, 0x7F, 0x7F, 0x86, 0x78,
+ 0x89, 0x72, 0x88, 0x72, 0x84, 0x77, 0x7D, 0x80, 0x82, 0x7D, 0x86, 0x75, 0x88, 0x6F, 0x84, 0x79,
+ 0x7E, 0x7F, 0x7F, 0x80, 0x80, 0x80, 0x82, 0x79, 0x88, 0x72, 0x88, 0x6D, 0x83, 0x79, 0x7E, 0x7E,
+ 0x80, 0x7C, 0x80, 0x7A, 0x7F, 0x7D, 0x7D, 0x81, 0x7E, 0x80, 0x81, 0x77, 0x81, 0x78, 0x82, 0x7D,
+ 0x7F, 0x7F, 0x7D, 0x7F, 0x81, 0x7D, 0x82, 0x7C, 0x81, 0x7B, 0x80, 0x7C, 0x7D, 0x7E, 0x7F, 0x81,
+ 0x80, 0x7F, 0x80, 0x80, 0x7F, 0x7F, 0x7D, 0x80, 0x7C, 0x81, 0x7E, 0x81, 0x7E, 0x81, 0x7C, 0x81,
+ 0x7B, 0x80, 0x7F, 0x80, 0x7E, 0x81, 0x7F, 0x81, 0x80, 0x81, 0x7D, 0x81, 0x7C, 0x7E, 0x7F, 0x80,
+ 0x80, 0x80, 0x7F, 0x81, 0x7E, 0x81, 0x7E, 0x80, 0x7F, 0x7F, 0x7E, 0x80, 0x80, 0x7C, 0x80, 0x7B,
+ 0x7E, 0x7F, 0x7D, 0x80, 0x81, 0x81, 0x81, 0x7E, 0x7C, 0x81, 0x81, 0x7F, 0x81, 0x7D, 0x7F, 0x7E,
+ 0x81, 0x7C, 0x81, 0x7F, 0x7F, 0x80, 0x7D, 0x81, 0x7F, 0x7F, 0x81, 0x7E, 0x81, 0x7C, 0x80, 0x7F,
+ 0x7F, 0x81, 0x7F, 0x80, 0x81, 0x7C, 0x82, 0x7B, 0x81, 0x7D, 0x7F, 0x7E, 0x81, 0x7E, 0x7F, 0x7F,
+ 0x7E, 0x80, 0x81, 0x7E, 0x82, 0x7E, 0x7F, 0x7F, 0x7E, 0x81, 0x81, 0x7E, 0x7F, 0x80, 0x7F, 0x7F,
+ 0x80, 0x7F, 0x80, 0x7D, 0x81, 0x7C, 0x7D, 0x7F, 0x7C, 0x82, 0x80, 0x7E, 0x81, 0x7E, 0x80, 0x7E,
+ 0x7D, 0x80, 0x80, 0x7C, 0x80, 0x7C, 0x7C, 0x7F, 0x7F, 0x80, 0x81, 0x7D, 0x80, 0x7D, 0x7F, 0x7F,
+ 0x7F, 0x7F, 0x7B, 0x82, 0x7F, 0x7F, 0x7F, 0x7E, 0x7C, 0x82, 0x7D, 0x7E, 0x81, 0x7F, 0x81, 0x80,
+ 0x7D, 0x7F, 0x7D, 0x7D, 0x80, 0x80, 0x81, 0x82, 0x7E, 0x80, 0x7E, 0x7C, 0x7F, 0x7E, 0x81, 0x7F,
+ 0x7F, 0x7F, 0x7B, 0x81,
+};
+
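+/* U (Cb) logo plane for 4:2:0 output; a second table, distinct from gau1_ihevcd_logo_420p_u above */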
+const UWORD8 gau1_ihevcd_logo_420_u[] =
+{
+ 0x80, 0x7c, 0x7a, 0x7c, 0x80, 0x81, 0x7d, 0x78,
+ 0x7c, 0x79, 0x79, 0x7d, 0x83, 0x86, 0x84, 0x80,
+ 0x7f, 0x81, 0x82, 0x84, 0x84, 0x82, 0x81, 0x7f,
+ 0x81, 0x83, 0x83, 0x7f, 0x78, 0x76, 0x79, 0x7d,
+ 0x76, 0x7b, 0x80, 0x81, 0x7d, 0x7b, 0x7d, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x82, 0x80, 0x7f, 0x80, 0x84, 0x85, 0x83, 0x80,
+ 0x84, 0x81, 0x7e, 0x7e, 0x80, 0x81, 0x80, 0x7e,
+ 0x7f, 0x7f, 0x7f, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f,
+ 0x7c, 0x7f, 0x80, 0x7f, 0x7c, 0x7c, 0x80, 0x83,
+ 0x7e, 0x82, 0x84, 0x83, 0x80, 0x7e, 0x7f, 0x81,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x83, 0x83, 0x83, 0x83, 0x84, 0x85, 0x86, 0x86,
+ 0x8a, 0x87, 0x82, 0x7e, 0x7d, 0x7d, 0x7e, 0x7f,
+ 0x81, 0x7f, 0x7c, 0x7a, 0x7a, 0x7c, 0x7f, 0x81,
+ 0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x83, 0x84, 0x85,
+ 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x80, 0x81,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7f, 0x81, 0x81, 0x7f, 0x7b, 0x7a, 0x7d, 0x80,
+ 0x82, 0x83, 0x81, 0x7e, 0x7b, 0x7c, 0x82, 0x86,
+ 0x85, 0x82, 0x7e, 0x7b, 0x7b, 0x7e, 0x82, 0x85,
+ 0x80, 0x7f, 0x7f, 0x81, 0x84, 0x84, 0x80, 0x7b,
+ 0x66, 0x64, 0x64, 0x69, 0x73, 0x7b, 0x80, 0x81,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7b, 0x7d, 0x7d, 0x74, 0x69, 0x64, 0x68, 0x6d,
+ 0x70, 0x77, 0x7e, 0x7f, 0x7c, 0x7d, 0x84, 0x8c,
+ 0x85, 0x84, 0x81, 0x80, 0x80, 0x81, 0x84, 0x85,
+ 0x86, 0x82, 0x7f, 0x81, 0x84, 0x80, 0x75, 0x6b,
+ 0x45, 0x41, 0x42, 0x4e, 0x62, 0x74, 0x7d, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7b, 0x7e, 0x7a, 0x6b, 0x57, 0x4c, 0x4f, 0x56,
+ 0x60, 0x6e, 0x7d, 0x81, 0x7c, 0x78, 0x7d, 0x84,
+ 0x7e, 0x7f, 0x82, 0x83, 0x83, 0x82, 0x7f, 0x7e,
+ 0x84, 0x7d, 0x7a, 0x7e, 0x83, 0x7f, 0x6f, 0x60,
+ 0x34, 0x2d, 0x2d, 0x3d, 0x58, 0x71, 0x7c, 0x7e,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x83, 0x7d, 0x67, 0x4b, 0x3a, 0x3c, 0x44,
+ 0x5a, 0x6e, 0x82, 0x86, 0x7b, 0x6f, 0x6c, 0x70,
+ 0x6f, 0x75, 0x7c, 0x82, 0x82, 0x7c, 0x75, 0x6f,
+ 0x76, 0x70, 0x6e, 0x77, 0x83, 0x82, 0x71, 0x60,
+ 0x3b, 0x31, 0x2c, 0x3b, 0x59, 0x72, 0x7d, 0x7d,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x86, 0x89, 0x81, 0x67, 0x46, 0x32, 0x32, 0x3a,
+ 0x5a, 0x71, 0x88, 0x8a, 0x7a, 0x66, 0x5e, 0x5f,
+ 0x63, 0x6b, 0x77, 0x7f, 0x7f, 0x77, 0x6b, 0x63,
+ 0x69, 0x63, 0x64, 0x72, 0x83, 0x86, 0x76, 0x65,
+ 0x49, 0x3c, 0x35, 0x41, 0x5d, 0x75, 0x7e, 0x7d,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x52, 0x77, 0x84, 0x7b, 0x69, 0x41, 0x31, 0x4a,
+ 0x38, 0x4a, 0x62, 0x75, 0x76, 0x67, 0x51, 0x41,
+ 0x39, 0x39, 0x43, 0x5d, 0x7a, 0x86, 0x7d, 0x6f,
+ 0x5f, 0x49, 0x49, 0x67, 0x7b, 0x77, 0x79, 0x86,
+ 0x7a, 0x7c, 0x7e, 0x80, 0x80, 0x7e, 0x7c, 0x7a,
+ 0x7f, 0x7f, 0x7e, 0x7e, 0x7d, 0x7d, 0x7c, 0x7c,
+ 0x7e, 0x7e, 0x7d, 0x7d, 0x7d, 0x7e, 0x7f, 0x7f,
+ 0x80, 0x7e, 0x7b, 0x7a, 0x7c, 0x80, 0x85, 0x88,
+ 0x7c, 0x7c, 0x7c, 0x7c, 0x7e, 0x7f, 0x81, 0x82,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4b, 0x76, 0x8a, 0x85, 0x74, 0x4a, 0x32, 0x42,
+ 0x35, 0x4c, 0x6c, 0x83, 0x85, 0x70, 0x52, 0x3d,
+ 0x39, 0x3a, 0x46, 0x61, 0x7e, 0x8c, 0x86, 0x7b,
+ 0x7c, 0x66, 0x61, 0x76, 0x83, 0x7d, 0x7a, 0x83,
+ 0x80, 0x7f, 0x7f, 0x7e, 0x7e, 0x7e, 0x7f, 0x80,
+ 0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80,
+ 0x85, 0x83, 0x7f, 0x7d, 0x7d, 0x7f, 0x82, 0x85,
+ 0x7f, 0x81, 0x84, 0x85, 0x84, 0x81, 0x7c, 0x79,
+ 0x81, 0x81, 0x81, 0x81, 0x80, 0x7f, 0x7f, 0x7e,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x45, 0x76, 0x8c, 0x85, 0x77, 0x51, 0x33, 0x3a,
+ 0x34, 0x4c, 0x6f, 0x88, 0x89, 0x72, 0x51, 0x3a,
+ 0x39, 0x3c, 0x48, 0x61, 0x7b, 0x89, 0x86, 0x7e,
+ 0x90, 0x7c, 0x73, 0x7e, 0x87, 0x82, 0x7d, 0x7f,
+ 0x86, 0x81, 0x7a, 0x74, 0x71, 0x73, 0x77, 0x7a,
+ 0x7f, 0x7f, 0x7e, 0x7e, 0x7d, 0x7c, 0x7c, 0x7c,
+ 0x84, 0x7f, 0x78, 0x72, 0x72, 0x76, 0x7c, 0x81,
+ 0x7d, 0x81, 0x85, 0x86, 0x81, 0x77, 0x6b, 0x64,
+ 0x7d, 0x7f, 0x81, 0x82, 0x82, 0x80, 0x7d, 0x7b,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x48, 0x76, 0x80, 0x6e, 0x63, 0x4c, 0x36, 0x3a,
+ 0x36, 0x47, 0x5f, 0x70, 0x71, 0x61, 0x4b, 0x3a,
+ 0x39, 0x3b, 0x45, 0x56, 0x69, 0x72, 0x70, 0x6a,
+ 0x7b, 0x6e, 0x67, 0x6f, 0x7c, 0x82, 0x80, 0x7e,
+ 0x84, 0x7b, 0x6c, 0x5f, 0x58, 0x58, 0x5c, 0x60,
+ 0x68, 0x68, 0x68, 0x67, 0x66, 0x66, 0x65, 0x65,
+ 0x71, 0x6a, 0x61, 0x59, 0x58, 0x5d, 0x64, 0x6a,
+ 0x79, 0x77, 0x73, 0x6d, 0x65, 0x5c, 0x55, 0x51,
+ 0x66, 0x6c, 0x74, 0x7b, 0x80, 0x80, 0x7e, 0x7c,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x51, 0x76, 0x6e, 0x4b, 0x44, 0x42, 0x3a, 0x3f,
+ 0x3b, 0x3f, 0x45, 0x4a, 0x4a, 0x47, 0x41, 0x3d,
+ 0x38, 0x3a, 0x3f, 0x47, 0x4e, 0x51, 0x4e, 0x4a,
+ 0x4c, 0x49, 0x47, 0x51, 0x67, 0x7c, 0x82, 0x7e,
+ 0x76, 0x6b, 0x59, 0x47, 0x3d, 0x3a, 0x3d, 0x40,
+ 0x4a, 0x4a, 0x49, 0x49, 0x48, 0x48, 0x47, 0x47,
+ 0x56, 0x4f, 0x46, 0x3e, 0x3c, 0x3f, 0x46, 0x4a,
+ 0x6d, 0x64, 0x56, 0x49, 0x41, 0x40, 0x43, 0x45,
+ 0x45, 0x50, 0x60, 0x70, 0x7c, 0x81, 0x82, 0x80,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x54, 0x76, 0x63, 0x34, 0x31, 0x3d, 0x3d, 0x40,
+ 0x3d, 0x3a, 0x36, 0x33, 0x33, 0x36, 0x3a, 0x3d,
+ 0x38, 0x3a, 0x3c, 0x3c, 0x3c, 0x3a, 0x37, 0x36,
+ 0x2c, 0x31, 0x33, 0x3a, 0x55, 0x77, 0x82, 0x79,
+ 0x5e, 0x55, 0x47, 0x39, 0x30, 0x2e, 0x30, 0x32,
+ 0x38, 0x38, 0x37, 0x37, 0x36, 0x36, 0x35, 0x35,
+ 0x45, 0x41, 0x3b, 0x35, 0x32, 0x32, 0x34, 0x36,
+ 0x57, 0x50, 0x44, 0x39, 0x34, 0x35, 0x3a, 0x3e,
+ 0x2e, 0x3c, 0x53, 0x6a, 0x7a, 0x82, 0x83, 0x82,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4f, 0x76, 0x64, 0x34, 0x34, 0x44, 0x3e, 0x38,
+ 0x3c, 0x3a, 0x38, 0x37, 0x37, 0x38, 0x39, 0x3a,
+ 0x38, 0x3b, 0x3d, 0x3c, 0x39, 0x37, 0x38, 0x39,
+ 0x31, 0x39, 0x38, 0x38, 0x51, 0x75, 0x7f, 0x71,
+ 0x45, 0x41, 0x3b, 0x37, 0x35, 0x38, 0x3c, 0x3f,
+ 0x3a, 0x3a, 0x39, 0x39, 0x38, 0x37, 0x37, 0x37,
+ 0x47, 0x46, 0x45, 0x42, 0x3e, 0x3a, 0x37, 0x35,
+ 0x3f, 0x41, 0x44, 0x46, 0x45, 0x41, 0x3c, 0x39,
+ 0x2b, 0x3a, 0x52, 0x6b, 0x7c, 0x82, 0x81, 0x7f,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x47, 0x75, 0x6a, 0x3e, 0x3f, 0x4d, 0x3f, 0x2f,
+ 0x39, 0x3d, 0x42, 0x45, 0x45, 0x41, 0x3b, 0x37,
+ 0x38, 0x3d, 0x41, 0x40, 0x3d, 0x3d, 0x41, 0x45,
+ 0x44, 0x4c, 0x47, 0x3f, 0x53, 0x76, 0x7c, 0x6a,
+ 0x35, 0x36, 0x37, 0x3b, 0x40, 0x47, 0x4d, 0x51,
+ 0x43, 0x43, 0x42, 0x42, 0x41, 0x41, 0x40, 0x40,
+ 0x50, 0x52, 0x53, 0x52, 0x4e, 0x48, 0x41, 0x3c,
+ 0x2e, 0x3a, 0x4c, 0x5a, 0x5b, 0x51, 0x41, 0x35,
+ 0x30, 0x3f, 0x58, 0x6f, 0x7e, 0x82, 0x7f, 0x7b,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4c, 0x7f, 0x7c, 0x7e, 0x6f, 0x40, 0x3a, 0x3d,
+ 0x38, 0x48, 0x61, 0x77, 0x7d, 0x6d, 0x4f, 0x37,
+ 0x37, 0x38, 0x45, 0x5e, 0x77, 0x7c, 0x6b, 0x58,
+ 0x38, 0x3a, 0x37, 0x3d, 0x5a, 0x75, 0x6c, 0x4f,
+ 0x3a, 0x38, 0x39, 0x44, 0x56, 0x67, 0x72, 0x77,
+ 0x6e, 0x63, 0x51, 0x41, 0x39, 0x39, 0x3e, 0x42,
+ 0x6e, 0x7a, 0x80, 0x72, 0x55, 0x3d, 0x36, 0x39,
+ 0x33, 0x54, 0x71, 0x79, 0x79, 0x71, 0x54, 0x33,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4c, 0x7f, 0x7c, 0x7e, 0x6f, 0x40, 0x3a, 0x3d,
+ 0x3a, 0x49, 0x62, 0x79, 0x7f, 0x6e, 0x50, 0x39,
+ 0x38, 0x39, 0x46, 0x60, 0x79, 0x7e, 0x6d, 0x5a,
+ 0x38, 0x3a, 0x39, 0x3f, 0x5b, 0x74, 0x68, 0x49,
+ 0x3a, 0x38, 0x3a, 0x47, 0x5b, 0x6e, 0x7a, 0x7f,
+ 0x79, 0x6b, 0x56, 0x43, 0x38, 0x38, 0x3e, 0x44,
+ 0x70, 0x7c, 0x82, 0x73, 0x56, 0x3e, 0x37, 0x3a,
+ 0x34, 0x55, 0x73, 0x7b, 0x7b, 0x73, 0x55, 0x34,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4d, 0x80, 0x7d, 0x7e, 0x70, 0x41, 0x3b, 0x3e,
+ 0x3c, 0x4b, 0x64, 0x7b, 0x81, 0x70, 0x52, 0x3b,
+ 0x39, 0x3b, 0x47, 0x62, 0x7b, 0x80, 0x70, 0x5c,
+ 0x37, 0x3b, 0x3b, 0x43, 0x5d, 0x73, 0x63, 0x42,
+ 0x3a, 0x38, 0x3c, 0x4b, 0x61, 0x76, 0x84, 0x89,
+ 0x86, 0x76, 0x5d, 0x45, 0x38, 0x38, 0x3f, 0x45,
+ 0x72, 0x7e, 0x84, 0x75, 0x58, 0x40, 0x39, 0x3c,
+ 0x37, 0x58, 0x75, 0x7d, 0x7d, 0x75, 0x58, 0x37,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4d, 0x80, 0x7d, 0x7f, 0x70, 0x41, 0x3b, 0x3e,
+ 0x3d, 0x4d, 0x66, 0x7d, 0x83, 0x72, 0x54, 0x3d,
+ 0x3a, 0x3c, 0x49, 0x63, 0x7d, 0x83, 0x72, 0x5f,
+ 0x37, 0x3c, 0x3d, 0x44, 0x5e, 0x72, 0x61, 0x3f,
+ 0x3a, 0x39, 0x3d, 0x4c, 0x63, 0x78, 0x86, 0x8c,
+ 0x8c, 0x7a, 0x5f, 0x46, 0x38, 0x38, 0x3f, 0x46,
+ 0x74, 0x7f, 0x85, 0x77, 0x5a, 0x42, 0x3b, 0x3e,
+ 0x39, 0x5a, 0x77, 0x7f, 0x7f, 0x77, 0x5a, 0x39,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4e, 0x81, 0x7e, 0x7f, 0x71, 0x42, 0x3c, 0x3f,
+ 0x3e, 0x4e, 0x67, 0x7d, 0x83, 0x73, 0x55, 0x3d,
+ 0x3a, 0x3c, 0x49, 0x63, 0x7e, 0x84, 0x74, 0x61,
+ 0x39, 0x3c, 0x3c, 0x43, 0x5e, 0x74, 0x65, 0x45,
+ 0x3c, 0x3a, 0x3d, 0x49, 0x5d, 0x70, 0x7d, 0x82,
+ 0x81, 0x72, 0x5a, 0x44, 0x39, 0x39, 0x40, 0x46,
+ 0x74, 0x80, 0x86, 0x78, 0x5a, 0x43, 0x3c, 0x3e,
+ 0x3a, 0x5b, 0x79, 0x81, 0x81, 0x79, 0x5b, 0x3a,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4e, 0x81, 0x7e, 0x80, 0x71, 0x43, 0x3d, 0x3f,
+ 0x3d, 0x4d, 0x66, 0x7d, 0x83, 0x72, 0x54, 0x3d,
+ 0x38, 0x3a, 0x48, 0x63, 0x7d, 0x84, 0x74, 0x61,
+ 0x3b, 0x3c, 0x39, 0x3e, 0x5b, 0x77, 0x6e, 0x52,
+ 0x3f, 0x3b, 0x3a, 0x42, 0x51, 0x5f, 0x68, 0x6b,
+ 0x68, 0x5e, 0x4e, 0x40, 0x39, 0x3a, 0x40, 0x45,
+ 0x74, 0x80, 0x86, 0x77, 0x5a, 0x42, 0x3b, 0x3e,
+ 0x3a, 0x5b, 0x79, 0x81, 0x81, 0x79, 0x5b, 0x3a,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4f, 0x82, 0x7f, 0x80, 0x72, 0x43, 0x3d, 0x40,
+ 0x3c, 0x4c, 0x65, 0x7b, 0x81, 0x71, 0x53, 0x3b,
+ 0x36, 0x38, 0x46, 0x61, 0x7c, 0x83, 0x73, 0x61,
+ 0x3e, 0x3c, 0x35, 0x39, 0x59, 0x7a, 0x79, 0x62,
+ 0x42, 0x3c, 0x37, 0x3a, 0x43, 0x4c, 0x51, 0x52,
+ 0x4b, 0x47, 0x41, 0x3b, 0x3a, 0x3c, 0x40, 0x43,
+ 0x72, 0x7e, 0x84, 0x76, 0x59, 0x41, 0x3a, 0x3d,
+ 0x39, 0x5b, 0x78, 0x80, 0x80, 0x78, 0x5b, 0x39,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x83, 0x7a, 0x7f, 0x6d, 0x40, 0x33, 0x3e, 0x3a,
+ 0x4f, 0x82, 0x7f, 0x81, 0x72, 0x43, 0x3d, 0x40,
+ 0x3b, 0x4b, 0x64, 0x7b, 0x81, 0x70, 0x52, 0x3a,
+ 0x35, 0x37, 0x45, 0x60, 0x7b, 0x82, 0x73, 0x60,
+ 0x40, 0x3c, 0x32, 0x35, 0x57, 0x7c, 0x80, 0x6c,
+ 0x44, 0x3d, 0x35, 0x34, 0x3a, 0x40, 0x42, 0x41,
+ 0x38, 0x38, 0x38, 0x38, 0x3a, 0x3d, 0x40, 0x42,
+ 0x72, 0x7d, 0x83, 0x75, 0x58, 0x40, 0x39, 0x3c,
+ 0x39, 0x5a, 0x77, 0x7f, 0x7f, 0x77, 0x5a, 0x39,
+ 0x3e, 0x35, 0x42, 0x67, 0x80, 0x80, 0x7d, 0x81,
+ 0x7f, 0x82, 0x7c, 0x63, 0x44, 0x31, 0x33, 0x3c,
+ 0x58, 0x6f, 0x85, 0x83, 0x69, 0x49, 0x35, 0x2e,
+ 0x31, 0x46, 0x65, 0x7b, 0x7d, 0x6a, 0x4d, 0x39,
+ 0x30, 0x35, 0x45, 0x63, 0x7d, 0x82, 0x71, 0x5d,
+ 0x39, 0x30, 0x2e, 0x40, 0x5e, 0x76, 0x7c, 0x79,
+ 0x63, 0x57, 0x44, 0x34, 0x2d, 0x2f, 0x37, 0x3d,
+ 0x35, 0x39, 0x3c, 0x3b, 0x36, 0x34, 0x37, 0x3c,
+ 0x74, 0x7e, 0x81, 0x70, 0x54, 0x3d, 0x38, 0x3c,
+ 0x3a, 0x4f, 0x6d, 0x83, 0x83, 0x6d, 0x4f, 0x3a,
+ 0x29, 0x37, 0x4e, 0x65, 0x76, 0x7d, 0x7d, 0x7b,
+ 0x7f, 0x82, 0x7d, 0x69, 0x4e, 0x3f, 0x41, 0x4a,
+ 0x5f, 0x73, 0x85, 0x83, 0x6d, 0x53, 0x44, 0x3f,
+ 0x3f, 0x51, 0x6a, 0x7c, 0x7e, 0x6e, 0x57, 0x47,
+ 0x41, 0x43, 0x4f, 0x67, 0x7e, 0x83, 0x75, 0x64,
+ 0x48, 0x40, 0x3d, 0x4b, 0x65, 0x79, 0x7e, 0x7a,
+ 0x74, 0x69, 0x59, 0x4a, 0x41, 0x3f, 0x42, 0x45,
+ 0x4c, 0x51, 0x54, 0x52, 0x4d, 0x4c, 0x4f, 0x53,
+ 0x79, 0x82, 0x86, 0x79, 0x60, 0x4e, 0x4b, 0x4f,
+ 0x4e, 0x5e, 0x75, 0x85, 0x85, 0x75, 0x5e, 0x4e,
+ 0x42, 0x4c, 0x5d, 0x6e, 0x7a, 0x80, 0x81, 0x81,
+ 0x7e, 0x82, 0x80, 0x71, 0x5f, 0x55, 0x59, 0x60,
+ 0x6b, 0x79, 0x86, 0x83, 0x73, 0x62, 0x5b, 0x5a,
+ 0x55, 0x61, 0x72, 0x7e, 0x80, 0x75, 0x66, 0x5c,
+ 0x5b, 0x5a, 0x5f, 0x6e, 0x7f, 0x84, 0x7b, 0x6f,
+ 0x61, 0x59, 0x55, 0x5e, 0x70, 0x7e, 0x7f, 0x7c,
+ 0x84, 0x7d, 0x73, 0x67, 0x5e, 0x58, 0x55, 0x55,
+ 0x6a, 0x6e, 0x72, 0x70, 0x6b, 0x69, 0x6d, 0x71,
+ 0x7d, 0x85, 0x8a, 0x82, 0x71, 0x65, 0x64, 0x68,
+ 0x6a, 0x72, 0x7d, 0x84, 0x84, 0x7d, 0x72, 0x6a,
+ 0x65, 0x69, 0x70, 0x77, 0x7e, 0x82, 0x85, 0x86,
+ 0x7e, 0x81, 0x81, 0x7a, 0x70, 0x6b, 0x6f, 0x75,
+ 0x77, 0x7f, 0x85, 0x83, 0x79, 0x72, 0x71, 0x74,
+ 0x6c, 0x72, 0x7a, 0x80, 0x81, 0x7c, 0x75, 0x70,
+ 0x74, 0x70, 0x6f, 0x75, 0x7f, 0x84, 0x80, 0x7a,
+ 0x77, 0x71, 0x6d, 0x70, 0x7a, 0x81, 0x81, 0x7d,
+ 0x85, 0x83, 0x80, 0x7c, 0x76, 0x71, 0x6c, 0x6a,
+ 0x7c, 0x80, 0x83, 0x81, 0x7d, 0x7b, 0x7e, 0x83,
+ 0x7c, 0x83, 0x88, 0x85, 0x7c, 0x76, 0x77, 0x7a,
+ 0x7e, 0x7f, 0x7f, 0x80, 0x80, 0x7f, 0x7f, 0x7e,
+ 0x7f, 0x7e, 0x7d, 0x7c, 0x7d, 0x80, 0x83, 0x85,
+ 0x7f, 0x81, 0x81, 0x7f, 0x7c, 0x7b, 0x7e, 0x81,
+ 0x7f, 0x82, 0x83, 0x82, 0x7f, 0x7e, 0x80, 0x83,
+ 0x7b, 0x7d, 0x7f, 0x81, 0x81, 0x80, 0x7f, 0x7d,
+ 0x82, 0x7e, 0x7b, 0x7c, 0x7f, 0x82, 0x82, 0x81,
+ 0x84, 0x80, 0x7d, 0x7d, 0x80, 0x82, 0x80, 0x7e,
+ 0x7b, 0x7d, 0x7f, 0x82, 0x82, 0x80, 0x7e, 0x7c,
+ 0x7e, 0x82, 0x86, 0x84, 0x7f, 0x7d, 0x81, 0x85,
+ 0x7a, 0x7d, 0x81, 0x81, 0x7f, 0x7e, 0x7f, 0x81,
+ 0x85, 0x82, 0x7e, 0x7b, 0x7b, 0x7e, 0x82, 0x85,
+ 0x89, 0x86, 0x80, 0x7b, 0x7a, 0x7b, 0x7e, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x83,
+ 0x82, 0x81, 0x80, 0x80, 0x82, 0x83, 0x85, 0x85,
+ 0x82, 0x81, 0x81, 0x80, 0x80, 0x81, 0x81, 0x82,
+ 0x83, 0x82, 0x81, 0x80, 0x7f, 0x80, 0x81, 0x83,
+ 0x85, 0x84, 0x84, 0x82, 0x80, 0x7f, 0x7f, 0x7f,
+ 0x76, 0x78, 0x7c, 0x7f, 0x82, 0x84, 0x84, 0x84,
+ 0x7b, 0x7f, 0x83, 0x81, 0x7c, 0x7a, 0x7e, 0x82,
+ 0x7a, 0x7b, 0x7c, 0x7d, 0x7f, 0x80, 0x80, 0x80,
+ 0x82, 0x80, 0x7d, 0x7b, 0x7b, 0x7d, 0x80, 0x82,
+ 0x86, 0x83, 0x7f, 0x7b, 0x79, 0x7a, 0x7c, 0x7d,
+ 0x81, 0x7f, 0x7d, 0x7f, 0x82, 0x83, 0x80, 0x7d,
+ 0x81, 0x7f, 0x7d, 0x7f, 0x83, 0x84, 0x82, 0x80,
+ 0x81, 0x81, 0x80, 0x7f, 0x7f, 0x7f, 0x7f, 0x80,
+ 0x7d, 0x80, 0x83, 0x82, 0x7f, 0x7d, 0x7f, 0x81,
+ 0x7e, 0x81, 0x83, 0x81, 0x7e, 0x7c, 0x7d, 0x7f,
+ 0x7e, 0x7e, 0x7d, 0x7d, 0x7e, 0x7f, 0x81, 0x82,
+ 0x7b, 0x80, 0x83, 0x81, 0x7c, 0x7b, 0x7e, 0x82,
+ 0x80, 0x7d, 0x7b, 0x7c, 0x7f, 0x81, 0x80, 0x7e,
+ 0x7d, 0x7f, 0x80, 0x82, 0x82, 0x80, 0x7f, 0x7d,
+ 0x7f, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f,
+ 0x82, 0x7e, 0x7c, 0x7d, 0x81, 0x81, 0x7d, 0x78,
+ 0x7f, 0x7d, 0x7b, 0x7e, 0x83, 0x84, 0x7f, 0x7a,
+ 0x7f, 0x7f, 0x7f, 0x7e, 0x7e, 0x7d, 0x7d, 0x7d,
+ 0x76, 0x7c, 0x82, 0x83, 0x7f, 0x7c, 0x7c, 0x7f,
+ 0x78, 0x7d, 0x81, 0x80, 0x7b, 0x79, 0x7b, 0x7f,
+ 0x89, 0x86, 0x81, 0x7d, 0x7a, 0x7a, 0x7c, 0x7e,
+ 0x7e, 0x83, 0x86, 0x84, 0x7f, 0x7e, 0x81, 0x85,
+ 0x86, 0x81, 0x7d, 0x7d, 0x81, 0x83, 0x80, 0x7d,
+ 0x7a, 0x7e, 0x84, 0x88, 0x88, 0x84, 0x7e, 0x7a,
+ 0x7a, 0x7b, 0x7e, 0x81, 0x83, 0x83, 0x82, 0x82,
+};
+
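+/* V (Cr) plane of the logo for 4:2:0 planar output (inferred from the name); 0x80 is the neutral chroma value. */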
+const UWORD8 gau1_ihevcd_logo_420_v[] =
+{
+ 0x7b, 0x7f, 0x81, 0x80, 0x7c, 0x7c, 0x80, 0x85,
+ 0x87, 0x87, 0x86, 0x80, 0x79, 0x78, 0x7d, 0x82,
+ 0x84, 0x81, 0x7d, 0x7a, 0x7a, 0x7d, 0x81, 0x84,
+ 0x7c, 0x7a, 0x7a, 0x7f, 0x85, 0x88, 0x85, 0x80,
+ 0x83, 0x80, 0x7d, 0x7e, 0x81, 0x81, 0x7c, 0x77,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7d, 0x7f, 0x81, 0x7f, 0x7c, 0x7c, 0x7e, 0x81,
+ 0x7d, 0x7f, 0x80, 0x7f, 0x7d, 0x7d, 0x80, 0x84,
+ 0x83, 0x82, 0x81, 0x80, 0x80, 0x81, 0x82, 0x83,
+ 0x82, 0x80, 0x7e, 0x80, 0x82, 0x82, 0x7f, 0x7b,
+ 0x7f, 0x7d, 0x7c, 0x7e, 0x81, 0x82, 0x7f, 0x7b,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x7f, 0x7e, 0x7d, 0x7d,
+ 0x76, 0x79, 0x7c, 0x7f, 0x80, 0x81, 0x81, 0x81,
+ 0x80, 0x82, 0x83, 0x85, 0x85, 0x83, 0x82, 0x80,
+ 0x85, 0x84, 0x82, 0x80, 0x7e, 0x7d, 0x7b, 0x7b,
+ 0x83, 0x83, 0x84, 0x85, 0x85, 0x83, 0x81, 0x7f,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x82, 0x80, 0x80, 0x82, 0x85, 0x86, 0x83, 0x80,
+ 0x7d, 0x7c, 0x7c, 0x7f, 0x82, 0x82, 0x7d, 0x78,
+ 0x7b, 0x7e, 0x81, 0x84, 0x84, 0x81, 0x7e, 0x7b,
+ 0x81, 0x82, 0x81, 0x7f, 0x7c, 0x7c, 0x81, 0x85,
+ 0x97, 0x99, 0x9a, 0x96, 0x8d, 0x84, 0x80, 0x7e,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x83, 0x80, 0x80, 0x87, 0x91, 0x96, 0x92, 0x8c,
+ 0x8d, 0x86, 0x80, 0x80, 0x83, 0x81, 0x79, 0x71,
+ 0x79, 0x7b, 0x7e, 0x80, 0x80, 0x7e, 0x7b, 0x79,
+ 0x79, 0x7d, 0x80, 0x7e, 0x7b, 0x7f, 0x8a, 0x95,
+ 0xb2, 0xb6, 0xb5, 0xa9, 0x96, 0x85, 0x7d, 0x7c,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x81, 0x7e, 0x80, 0x8e, 0xa1, 0xab, 0xa7, 0x9f,
+ 0x9c, 0x8e, 0x80, 0x7d, 0x83, 0x85, 0x7f, 0x78,
+ 0x7f, 0x7e, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, 0x7f,
+ 0x7a, 0x80, 0x84, 0x80, 0x7b, 0x7f, 0x8f, 0x9e,
+ 0xc0, 0xc6, 0xc6, 0xb7, 0x9c, 0x86, 0x7d, 0x7c,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7f, 0x7c, 0x81, 0x95, 0xb0, 0xbf, 0xbd, 0xb4,
+ 0xa0, 0x8e, 0x7b, 0x79, 0x84, 0x8f, 0x8f, 0x8a,
+ 0x8c, 0x88, 0x82, 0x7e, 0x7e, 0x82, 0x88, 0x8c,
+ 0x85, 0x8c, 0x8e, 0x85, 0x79, 0x7a, 0x8b, 0x9c,
+ 0xbc, 0xc5, 0xc9, 0xb9, 0x9e, 0x87, 0x80, 0x81,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7e, 0x7b, 0x81, 0x9a, 0xb9, 0xcc, 0xca, 0xc1,
+ 0x9f, 0x8a, 0x76, 0x75, 0x86, 0x97, 0x9d, 0x9b,
+ 0x97, 0x91, 0x88, 0x81, 0x81, 0x88, 0x91, 0x97,
+ 0x91, 0x97, 0x96, 0x88, 0x77, 0x75, 0x84, 0x95,
+ 0xb2, 0xbd, 0xc4, 0xb7, 0x9d, 0x88, 0x83, 0x87,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0x9b, 0x84, 0x82, 0x88, 0x8d, 0xaf, 0xc8, 0xba,
+ 0xbd, 0xac, 0x94, 0x82, 0x82, 0x92, 0xa9, 0xba,
+ 0xc0, 0xb8, 0xa8, 0x93, 0x81, 0x7a, 0x7e, 0x84,
+ 0xa7, 0xab, 0xaa, 0x9d, 0x89, 0x7d, 0x7e, 0x83,
+ 0x81, 0x7f, 0x7d, 0x7c, 0x7c, 0x7e, 0x81, 0x83,
+ 0x7e, 0x7e, 0x7f, 0x7f, 0x80, 0x81, 0x81, 0x81,
+ 0x83, 0x82, 0x80, 0x80, 0x80, 0x81, 0x83, 0x85,
+ 0x7f, 0x82, 0x87, 0x89, 0x86, 0x7f, 0x77, 0x72,
+ 0x84, 0x84, 0x84, 0x84, 0x83, 0x81, 0x7f, 0x7e,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xa3, 0x84, 0x7b, 0x7d, 0x83, 0xa7, 0xc7, 0xc2,
+ 0xc0, 0xaa, 0x8b, 0x74, 0x74, 0x89, 0xa8, 0xbd,
+ 0xcc, 0xc4, 0xb4, 0x9e, 0x8a, 0x80, 0x82, 0x87,
+ 0x88, 0x90, 0x93, 0x8c, 0x7f, 0x79, 0x7d, 0x85,
+ 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7d, 0x7c, 0x7c,
+ 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f,
+ 0x7a, 0x7b, 0x7d, 0x7e, 0x7f, 0x7f, 0x7e, 0x7d,
+ 0x7e, 0x7d, 0x7c, 0x7c, 0x7c, 0x7d, 0x7f, 0x80,
+ 0x7d, 0x7d, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x84, 0x78, 0x7c, 0x81, 0xa3, 0xc6, 0xc8,
+ 0xc3, 0xab, 0x89, 0x71, 0x70, 0x87, 0xa8, 0xc0,
+ 0xc0, 0xb9, 0xaa, 0x95, 0x81, 0x76, 0x76, 0x7a,
+ 0x72, 0x7b, 0x83, 0x81, 0x78, 0x76, 0x7d, 0x86,
+ 0x78, 0x7d, 0x83, 0x88, 0x89, 0x87, 0x82, 0x7f,
+ 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, 0x85, 0x85,
+ 0x78, 0x7c, 0x82, 0x87, 0x88, 0x86, 0x81, 0x7e,
+ 0x7e, 0x7b, 0x78, 0x78, 0x7c, 0x85, 0x8e, 0x94,
+ 0x7e, 0x7d, 0x7b, 0x7a, 0x7c, 0x7e, 0x81, 0x83,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xaa, 0x84, 0x80, 0x91, 0x96, 0xaa, 0xc4, 0xc5,
+ 0xc2, 0xb2, 0x9a, 0x89, 0x88, 0x98, 0xaf, 0xbf,
+ 0xba, 0xb6, 0xab, 0x9b, 0x8c, 0x84, 0x85, 0x8a,
+ 0x83, 0x8c, 0x92, 0x8c, 0x80, 0x79, 0x7c, 0x83,
+ 0x7b, 0x84, 0x91, 0x9d, 0xa2, 0xa0, 0x9b, 0x96,
+ 0x94, 0x95, 0x95, 0x96, 0x96, 0x97, 0x97, 0x97,
+ 0x87, 0x8e, 0x97, 0x9f, 0xa1, 0x9d, 0x96, 0x91,
+ 0x81, 0x83, 0x87, 0x8d, 0x95, 0x9d, 0xa3, 0xa7,
+ 0x92, 0x8d, 0x86, 0x7f, 0x7c, 0x7c, 0x7f, 0x81,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xa4, 0x83, 0x8f, 0xb2, 0xb7, 0xb8, 0xc1, 0xbd,
+ 0xbf, 0xbb, 0xb5, 0xb0, 0xaf, 0xb3, 0xb8, 0xbc,
+ 0xc7, 0xc7, 0xc4, 0xbc, 0xb3, 0xb0, 0xb5, 0xba,
+ 0xae, 0xb4, 0xb4, 0xa7, 0x91, 0x81, 0x7d, 0x7f,
+ 0x89, 0x93, 0xa4, 0xb4, 0xbd, 0xbe, 0xba, 0xb6,
+ 0xad, 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0,
+ 0xa0, 0xa7, 0xb1, 0xba, 0xbd, 0xba, 0xb3, 0xaf,
+ 0x8b, 0x94, 0xa1, 0xae, 0xb6, 0xb8, 0xb6, 0xb4,
+ 0xb1, 0xa8, 0x99, 0x89, 0x7f, 0x7b, 0x7c, 0x7e,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xa3, 0x82, 0x98, 0xc8, 0xcc, 0xbf, 0xbe, 0xba,
+ 0xbf, 0xc1, 0xc5, 0xc8, 0xc7, 0xc4, 0xbf, 0xbb,
+ 0xc0, 0xc4, 0xc5, 0xc2, 0xbe, 0xbe, 0xc4, 0xca,
+ 0xca, 0xce, 0xcb, 0xba, 0x9f, 0x89, 0x80, 0x80,
+ 0x9e, 0xa7, 0xb5, 0xc2, 0xc9, 0xca, 0xc7, 0xc4,
+ 0xbd, 0xbd, 0xbe, 0xbe, 0xbf, 0xbf, 0xc0, 0xc0,
+ 0xb0, 0xb5, 0xbd, 0xc4, 0xc8, 0xc7, 0xc5, 0xc2,
+ 0xa1, 0xa8, 0xb3, 0xbd, 0xc3, 0xc3, 0xc0, 0xbd,
+ 0xc8, 0xbb, 0xa6, 0x91, 0x82, 0x7c, 0x7c, 0x7e,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x82, 0x94, 0xc6, 0xca, 0xbb, 0xbd, 0xc0,
+ 0xc1, 0xc2, 0xc4, 0xc4, 0xc4, 0xc2, 0xc0, 0xbe,
+ 0xb5, 0xb9, 0xbc, 0xba, 0xb5, 0xb4, 0xb8, 0xbd,
+ 0xc2, 0xc8, 0xc8, 0xb9, 0xa1, 0x8d, 0x85, 0x86,
+ 0xb4, 0xb8, 0xbe, 0xc2, 0xc4, 0xc1, 0xbd, 0xba,
+ 0xbf, 0xbf, 0xc0, 0xc0, 0xc1, 0xc2, 0xc2, 0xc2,
+ 0xae, 0xb0, 0xb4, 0xb9, 0xbd, 0xc0, 0xc2, 0xc3,
+ 0xba, 0xb7, 0xb2, 0xb0, 0xb2, 0xb8, 0xc0, 0xc5,
+ 0xcc, 0xbe, 0xa7, 0x90, 0x82, 0x7d, 0x80, 0x83,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xb3, 0x82, 0x8d, 0xbc, 0xc0, 0xb3, 0xbd, 0xc8,
+ 0xc5, 0xc0, 0xbb, 0xb6, 0xb5, 0xb9, 0xbe, 0xc1,
+ 0xc1, 0xc5, 0xc7, 0xc4, 0xbd, 0xba, 0xbc, 0xc0,
+ 0xae, 0xb6, 0xba, 0xb0, 0x9d, 0x8d, 0x89, 0x8b,
+ 0xc2, 0xc2, 0xc1, 0xbe, 0xb9, 0xb3, 0xad, 0xaa,
+ 0xbb, 0xbb, 0xbc, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe,
+ 0xa5, 0xa5, 0xa7, 0xaa, 0xae, 0xb4, 0xb9, 0xbc,
+ 0xcc, 0xbe, 0xaa, 0x9c, 0x9c, 0xa9, 0xbc, 0xc9,
+ 0xc7, 0xb9, 0xa2, 0x8d, 0x80, 0x7e, 0x83, 0x88,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc4, 0xaf, 0x83, 0x87, 0x86, 0x81, 0xad, 0xc1,
+ 0xc0, 0xc0, 0xba, 0x8f, 0x84, 0x85, 0x7a, 0xa7,
+ 0xc0, 0xbf, 0xc2, 0xbb, 0x9e, 0x83, 0x8d, 0xa9,
+ 0xc4, 0xca, 0xca, 0xbb, 0xa1, 0x8b, 0x82, 0x81,
+ 0x8a, 0x96, 0xa8, 0xba, 0xc4, 0xc6, 0xc2, 0xbe,
+ 0x83, 0x88, 0x86, 0x86, 0x9e, 0xbf, 0xc8, 0xbd,
+ 0xc4, 0xa4, 0x89, 0x82, 0x82, 0x89, 0xa4, 0xc4,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc4, 0xaf, 0x83, 0x87, 0x86, 0x81, 0xac, 0xc1,
+ 0xc0, 0xc0, 0xba, 0x8e, 0x84, 0x84, 0x7a, 0xa6,
+ 0xc0, 0xbe, 0xc0, 0xb9, 0x9d, 0x84, 0x91, 0xaf,
+ 0xbc, 0xc2, 0xc3, 0xb6, 0x9e, 0x89, 0x81, 0x80,
+ 0x80, 0x8e, 0xa4, 0xb8, 0xc4, 0xc5, 0xc0, 0xbb,
+ 0x82, 0x87, 0x84, 0x84, 0x9c, 0xbe, 0xc7, 0xbc,
+ 0xc3, 0xa3, 0x87, 0x80, 0x80, 0x87, 0xa3, 0xc3,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc3, 0xaf, 0x82, 0x86, 0x86, 0x81, 0xac, 0xc0,
+ 0xbf, 0xc0, 0xb9, 0x8e, 0x83, 0x84, 0x7a, 0xa6,
+ 0xc1, 0xbd, 0xbd, 0xb6, 0x9b, 0x86, 0x96, 0xb6,
+ 0xb6, 0xbc, 0xbe, 0xb1, 0x99, 0x85, 0x7d, 0x7c,
+ 0x75, 0x85, 0x9e, 0xb6, 0xc3, 0xc4, 0xbd, 0xb7,
+ 0x81, 0x85, 0x82, 0x81, 0x99, 0xbb, 0xc5, 0xba,
+ 0xc1, 0xa1, 0x85, 0x7e, 0x7e, 0x85, 0xa1, 0xc1,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc3, 0xae, 0x82, 0x86, 0x85, 0x80, 0xab, 0xc0,
+ 0xbf, 0xbf, 0xb9, 0x8d, 0x83, 0x83, 0x79, 0xa5,
+ 0xc1, 0xbd, 0xbc, 0xb4, 0x9a, 0x86, 0x97, 0xb9,
+ 0xb8, 0xbe, 0xbf, 0xb0, 0x96, 0x81, 0x77, 0x76,
+ 0x71, 0x82, 0x9d, 0xb5, 0xc2, 0xc2, 0xba, 0xb3,
+ 0x80, 0x84, 0x80, 0x7f, 0x96, 0xb9, 0xc4, 0xba,
+ 0xbf, 0x9f, 0x84, 0x7d, 0x7d, 0x84, 0x9f, 0xbf,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc2, 0xad, 0x81, 0x85, 0x84, 0x7f, 0xab, 0xbf,
+ 0xbe, 0xbf, 0xb8, 0x8d, 0x82, 0x83, 0x79, 0xa5,
+ 0xbf, 0xbc, 0xbd, 0xb5, 0x9b, 0x85, 0x93, 0xb3,
+ 0xbf, 0xc5, 0xc4, 0xb5, 0x9a, 0x83, 0x79, 0x78,
+ 0x7b, 0x8a, 0xa1, 0xb6, 0xc1, 0xc1, 0xb9, 0xb3,
+ 0x81, 0x84, 0x7f, 0x7d, 0x95, 0xb8, 0xc4, 0xba,
+ 0xbf, 0x9f, 0x83, 0x7c, 0x7c, 0x83, 0x9f, 0xbf,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc2, 0xad, 0x81, 0x84, 0x84, 0x7f, 0xaa, 0xbe,
+ 0xbd, 0xbe, 0xb7, 0x8c, 0x82, 0x82, 0x78, 0xa4,
+ 0xbd, 0xbc, 0xc0, 0xba, 0x9d, 0x82, 0x8a, 0xa6,
+ 0xc0, 0xc7, 0xc9, 0xbd, 0xa6, 0x92, 0x8b, 0x8b,
+ 0x91, 0x9c, 0xab, 0xb9, 0xc1, 0xc0, 0xba, 0xb5,
+ 0x82, 0x85, 0x7f, 0x7c, 0x94, 0xb8, 0xc5, 0xbc,
+ 0xbf, 0x9f, 0x84, 0x7d, 0x7d, 0x84, 0x9f, 0xbf,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc1, 0xac, 0x80, 0x84, 0x83, 0x7e, 0xaa, 0xbe,
+ 0xbd, 0xbe, 0xb7, 0x8c, 0x81, 0x82, 0x78, 0xa4,
+ 0xba, 0xbc, 0xc4, 0xbf, 0xa0, 0x7e, 0x80, 0x97,
+ 0xb6, 0xc1, 0xc9, 0xc4, 0xb5, 0xa9, 0xa7, 0xab,
+ 0xab, 0xb0, 0xb7, 0xbd, 0xc0, 0xbf, 0xbc, 0xb9,
+ 0x84, 0x86, 0x80, 0x7d, 0x94, 0xb9, 0xc7, 0xbe,
+ 0xc0, 0xa0, 0x85, 0x7e, 0x7e, 0x85, 0xa0, 0xc0,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7c, 0x85, 0x7f, 0x8f, 0xbb, 0xc7, 0xbb, 0xbf,
+ 0xab, 0x7c, 0x82, 0x7f, 0x89, 0xb6, 0xbf, 0xc1,
+ 0xc1, 0xac, 0x80, 0x84, 0x83, 0x7e, 0xaa, 0xbe,
+ 0xbd, 0xbd, 0xb7, 0x8b, 0x81, 0x81, 0x77, 0xa4,
+ 0xb8, 0xbc, 0xc6, 0xc3, 0xa2, 0x7c, 0x79, 0x8c,
+ 0xac, 0xb9, 0xc6, 0xc7, 0xc0, 0xbb, 0xbe, 0xc4,
+ 0xbc, 0xbd, 0xbf, 0xc0, 0xc0, 0xbf, 0xbd, 0xbc,
+ 0x86, 0x87, 0x80, 0x7d, 0x95, 0xba, 0xc8, 0xbf,
+ 0xc1, 0xa1, 0x86, 0x7f, 0x7f, 0x86, 0xa1, 0xc1,
+ 0xc0, 0xc4, 0xb3, 0x90, 0x7c, 0x80, 0x83, 0x7c,
+ 0x7f, 0x7b, 0x82, 0x99, 0xb8, 0xca, 0xc8, 0xbf,
+ 0xa9, 0x91, 0x7a, 0x7a, 0x93, 0xb1, 0xc4, 0xca,
+ 0xc3, 0xae, 0x90, 0x7c, 0x7c, 0x90, 0xae, 0xc3,
+ 0xc8, 0xc3, 0xb3, 0x96, 0x7c, 0x78, 0x8a, 0x9e,
+ 0xbe, 0xc7, 0xca, 0xb9, 0x9b, 0x85, 0x7f, 0x83,
+ 0xa1, 0xac, 0xbc, 0xc9, 0xcd, 0xc7, 0xbd, 0xb5,
+ 0xc3, 0xbf, 0xbb, 0xbd, 0xc2, 0xc4, 0xc0, 0xbc,
+ 0x84, 0x84, 0x88, 0x8f, 0x9d, 0xae, 0xbe, 0xc8,
+ 0xb7, 0xa6, 0x8f, 0x7e, 0x7e, 0x8f, 0xa6, 0xb7,
+ 0xc4, 0xbb, 0xab, 0x9a, 0x8c, 0x84, 0x81, 0x80,
+ 0x7f, 0x7c, 0x80, 0x94, 0xae, 0xbd, 0xba, 0xb1,
+ 0xa1, 0x8d, 0x7a, 0x7a, 0x8f, 0xa8, 0xb6, 0xb9,
+ 0xb6, 0xa5, 0x8d, 0x7c, 0x7c, 0x8d, 0xa5, 0xb6,
+ 0xb8, 0xb6, 0xab, 0x94, 0x7d, 0x78, 0x87, 0x98,
+ 0xb0, 0xb9, 0xbd, 0xaf, 0x96, 0x83, 0x7f, 0x83,
+ 0x7f, 0x8b, 0x9d, 0xb0, 0xbc, 0xc1, 0xc1, 0xbf,
+ 0xad, 0xa9, 0xa5, 0xa7, 0xac, 0xae, 0xaa, 0xa6,
+ 0x80, 0x80, 0x81, 0x87, 0x92, 0x9f, 0xad, 0xb5,
+ 0xa8, 0x9b, 0x88, 0x7a, 0x7a, 0x88, 0x9b, 0xa8,
+ 0xb3, 0xaa, 0x9d, 0x8f, 0x85, 0x7f, 0x7e, 0x7e,
+ 0x80, 0x7c, 0x7e, 0x8c, 0x9e, 0xa7, 0xa3, 0x9b,
+ 0x95, 0x86, 0x79, 0x7b, 0x8a, 0x99, 0xa0, 0xa0,
+ 0xa3, 0x98, 0x88, 0x7d, 0x7d, 0x88, 0x98, 0xa3,
+ 0xa0, 0xa2, 0x9d, 0x8f, 0x7e, 0x7a, 0x83, 0x8f,
+ 0x9a, 0xa3, 0xa7, 0x9e, 0x8d, 0x80, 0x7f, 0x83,
+ 0x6f, 0x77, 0x84, 0x93, 0xa0, 0xa9, 0xae, 0xb0,
+ 0x92, 0x8e, 0x8a, 0x8c, 0x91, 0x92, 0x8f, 0x8b,
+ 0x7d, 0x7c, 0x7b, 0x7e, 0x84, 0x8c, 0x95, 0x9a,
+ 0x94, 0x8c, 0x80, 0x78, 0x78, 0x80, 0x8c, 0x94,
+ 0x9a, 0x94, 0x8c, 0x83, 0x7e, 0x7b, 0x7c, 0x7c,
+ 0x81, 0x7d, 0x7d, 0x84, 0x8e, 0x92, 0x8d, 0x87,
+ 0x88, 0x81, 0x7a, 0x7b, 0x84, 0x8a, 0x8a, 0x88,
+ 0x90, 0x8b, 0x83, 0x7e, 0x7e, 0x83, 0x8b, 0x90,
+ 0x89, 0x8d, 0x8f, 0x89, 0x7f, 0x7c, 0x80, 0x86,
+ 0x87, 0x8d, 0x92, 0x8e, 0x84, 0x7e, 0x7e, 0x82,
+ 0x80, 0x81, 0x81, 0x83, 0x85, 0x87, 0x89, 0x8a,
+ 0x83, 0x7e, 0x7b, 0x7d, 0x81, 0x83, 0x80, 0x7b,
+ 0x7e, 0x7d, 0x7b, 0x7b, 0x7c, 0x7f, 0x83, 0x85,
+ 0x85, 0x82, 0x7e, 0x7b, 0x7b, 0x7e, 0x82, 0x85,
+ 0x87, 0x85, 0x81, 0x7e, 0x7c, 0x7d, 0x7e, 0x80,
+ 0x81, 0x7f, 0x7e, 0x80, 0x82, 0x82, 0x7f, 0x7b,
+ 0x80, 0x7d, 0x7c, 0x7d, 0x7f, 0x80, 0x7d, 0x7a,
+ 0x83, 0x82, 0x80, 0x7e, 0x7e, 0x80, 0x82, 0x83,
+ 0x7c, 0x80, 0x83, 0x83, 0x80, 0x7e, 0x7e, 0x80,
+ 0x7c, 0x7f, 0x82, 0x82, 0x7f, 0x7d, 0x7e, 0x80,
+ 0x8b, 0x88, 0x83, 0x7d, 0x7a, 0x78, 0x78, 0x78,
+ 0x81, 0x7d, 0x7a, 0x7c, 0x80, 0x82, 0x7f, 0x7a,
+ 0x82, 0x81, 0x7f, 0x7e, 0x7d, 0x7c, 0x7c, 0x7c,
+ 0x7f, 0x80, 0x81, 0x82, 0x82, 0x81, 0x80, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x80, 0x82, 0x83, 0x84,
+ 0x80, 0x80, 0x80, 0x7f, 0x7d, 0x7c, 0x7b, 0x7b,
+ 0x7d, 0x7e, 0x7e, 0x7e, 0x7d, 0x7b, 0x7a, 0x79,
+ 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7e, 0x7e,
+ 0x7a, 0x7b, 0x7c, 0x7e, 0x7f, 0x7f, 0x7e, 0x7d,
+ 0x7b, 0x7b, 0x7c, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e,
+ 0x7f, 0x7e, 0x7d, 0x7d, 0x7d, 0x7f, 0x81, 0x82,
+ 0x84, 0x80, 0x7d, 0x7f, 0x83, 0x85, 0x82, 0x7d,
+ 0x82, 0x82, 0x83, 0x82, 0x81, 0x7f, 0x7d, 0x7b,
+ 0x7f, 0x81, 0x83, 0x85, 0x85, 0x83, 0x81, 0x7f,
+ 0x7d, 0x7e, 0x81, 0x83, 0x84, 0x84, 0x84, 0x84,
+ 0x80, 0x82, 0x83, 0x81, 0x7d, 0x7c, 0x7e, 0x80,
+ 0x7e, 0x80, 0x82, 0x80, 0x7d, 0x7b, 0x7d, 0x80,
+ 0x7e, 0x7f, 0x7f, 0x80, 0x80, 0x7f, 0x7f, 0x7e,
+ 0x7f, 0x7c, 0x7a, 0x7b, 0x7f, 0x81, 0x80, 0x7e,
+ 0x81, 0x7e, 0x7c, 0x7c, 0x7f, 0x80, 0x7e, 0x7c,
+ 0x77, 0x79, 0x7c, 0x7f, 0x82, 0x84, 0x84, 0x84,
+ 0x83, 0x7f, 0x7c, 0x7d, 0x82, 0x84, 0x81, 0x7c,
+ 0x7d, 0x7f, 0x81, 0x83, 0x83, 0x81, 0x7e, 0x7c,
+ 0x7e, 0x7f, 0x81, 0x82, 0x82, 0x81, 0x7f, 0x7e,
+ 0x7d, 0x7f, 0x81, 0x83, 0x83, 0x82, 0x7f, 0x7e,
+ 0x7f, 0x82, 0x85, 0x83, 0x7e, 0x7d, 0x81, 0x86,
+ 0x7f, 0x82, 0x83, 0x81, 0x7d, 0x7c, 0x81, 0x86,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x84, 0x7f, 0x79, 0x79, 0x7e, 0x82, 0x81, 0x7f,
+ 0x87, 0x82, 0x7d, 0x7d, 0x80, 0x82, 0x7f, 0x7b,
+ 0x7e, 0x80, 0x82, 0x83, 0x82, 0x7f, 0x7b, 0x78,
+ 0x80, 0x7b, 0x78, 0x7a, 0x7f, 0x80, 0x7d, 0x79,
+ 0x78, 0x7a, 0x7e, 0x82, 0x82, 0x81, 0x7e, 0x7c,
+ 0x7d, 0x7d, 0x7e, 0x7f, 0x7f, 0x7e, 0x7d, 0x7d,
+ 0x7c, 0x7e, 0x81, 0x82, 0x81, 0x7e, 0x7a, 0x77,
+};
+
+
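+/* Logo chroma table, presumably interleaved UV (semi-planar) given the name; 0x80 is the neutral chroma value. */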
+const UWORD8 gau1_ihevcd_logo_uv[10240] =
+{
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+};
+
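+/* Decoder logo bitmap in RGB565 format: each UWORD16 packs one pixel as
+ * 5 bits of red (bits 15:11), 6 bits of green (bits 10:5) and 5 bits of
+ * blue (bits 4:0); the table below holds 10240 such pixels. */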
+const UWORD16 gau2_ihevcd_logo_rgb565[10240] =
+{
+
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffd, 0xfffd, 0xffde, 0xffde, 0xffdd, 0xffdd,
+ 0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffd, 0xfffd, 0xffdd, 0xffdd,
+ 0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xefff, 0xefff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffde, 0xffde, 0xffdd, 0xffdd,
+ 0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffd, 0xfffd, 0xffdd, 0xffdd,
+ 0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xffdc, 0xffdc, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xf7df, 0xf7df, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xefff, 0xefff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffdf, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffdf, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xefff, 0xefff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffde, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffde,
+ 0xffde, 0xffbe, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf,
+ 0xf7df, 0xf7df, 0xf7df, 0xf7df, 0xf7df, 0xf7df, 0xefff, 0xefff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffde, 0xffde, 0xffbe, 0xffbe, 0xffbe, 0xffbe,
+ 0xffbe, 0xffdf, 0xffbe, 0xffbe, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffdf, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffbd, 0xffbd,
+ 0xffbd, 0xffbd, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffde, 0xffde, 0xff78, 0xff16, 0xfe73, 0xf612, 0xf5f2, 0xfe73,
+ 0xff17, 0xff79, 0xffbb, 0xffbc, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffdf, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffde, 0xffde, 0xfef6, 0xee12, 0xcccd, 0xb3ea, 0xb3ea, 0xcccd,
+ 0xee13, 0xfef6, 0xffbb, 0xffbb, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xf7bd, 0xffff, 0xffff, 0xfffe, 0xfffc, 0xfffc, 0xffb9, 0xffb9,
+ 0xff98, 0xffb8, 0xffb9, 0xffb9, 0xffda, 0xffda, 0xffdb, 0xffbb, 0xfffd, 0xffbc,
+ 0xffde, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xfffe, 0xfffe, 0xffde, 0xefdf, 0xefdf,
+ 0xe7ff, 0xe7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xffdf, 0xffff,
+ 0xffdf, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xefff, 0xf7ff,
+ 0xffff, 0xf7df, 0xffdf, 0xffff, 0xffff, 0xffff, 0xefdf, 0xf7ff, 0xffff, 0xffdf,
+ 0xff7b, 0xffdc, 0xff9a, 0xfef7, 0xdbc2, 0xe403, 0xf423, 0xf464, 0xf464, 0xec43,
+ 0xd425, 0xcbe5, 0xf652, 0xff56, 0xfffc, 0xffdb, 0xfffe, 0xffff, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xf7de, 0xffff, 0xffff, 0xfffe, 0xfffc, 0xf75a, 0xff17, 0xff17,
+ 0xff16, 0xff36, 0xff37, 0xff37, 0xff38, 0xff38, 0xff18, 0xff7a, 0xfffd, 0xfffd,
+ 0xfffe, 0xffff, 0xffde, 0xffff, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xe7ff, 0xe7ff, 0xefff, 0xefff, 0xf7df, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7df, 0xefff, 0xefff, 0xefdf, 0xefff,
+ 0xffff, 0xffff, 0xffff, 0xffdf, 0xf7df, 0xf7df, 0xf7ff, 0xf7ff, 0xffdf, 0xffdf,
+ 0xffdc, 0xffdd, 0xfeb6, 0xcd10, 0xe403, 0xe424, 0xf443, 0xfc64, 0xf484, 0xf444,
+ 0xdc46, 0xd425, 0xc4ac, 0xfe93, 0xfffc, 0xfffc, 0xf7fe, 0xf7de, 0xf7ff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xf7fe, 0xffff, 0xfffe, 0xfffe, 0xff99, 0xe674, 0xed4c, 0xed4c,
+ 0xfd49, 0xfd6a, 0xfd6b, 0xfd6a, 0xed6c, 0xe56c, 0xdd2e, 0xfe32, 0xffdb, 0xffdb,
+ 0xffde, 0xffff, 0xf7ff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xfffd, 0xffff, 0xffff,
+ 0xf7ff, 0xefff, 0xffff, 0xffde, 0xffff, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xf7df,
+ 0xf7ff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7fe, 0xfffe, 0xffff, 0xf7ff, 0xf7ff,
+ 0xfffe, 0xffff, 0xfffe, 0xffdd, 0xffde, 0xffff, 0xf7ff, 0xf7ff, 0xf7de, 0xffff,
+ 0xffdb, 0xffdb, 0xfdf1, 0xb3c8, 0xfc20, 0xfc20, 0xfc40, 0xfc40, 0xfc40, 0xfc40,
+ 0xf442, 0xf442, 0xabc6, 0xf5ee, 0xfffb, 0xfffc, 0xf7fe, 0xf7dd, 0xf7fe, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xff99, 0xd5d2, 0xc407, 0xc427,
+ 0xdc25, 0xdc45, 0xd446, 0xd446, 0xc447, 0xc427, 0xbc2a, 0xed8f, 0xff9a, 0xffdb,
+ 0xffde, 0xfffe, 0xf7ff, 0xffff, 0xfffe, 0xffde, 0xffdd, 0xffbc, 0xf7de, 0xffde,
+ 0xefff, 0xefff, 0xffff, 0xf7de, 0xffff, 0xffff, 0xf7df, 0xf7ff, 0xffff, 0xf7df,
+ 0xffff, 0xf7df, 0xf7df, 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xffde, 0xffde, 0xffdd, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffdb, 0xff9a, 0xfdf1, 0xbc09, 0xfc40, 0xfc41, 0xfc40, 0xfc40, 0xfc40, 0xfc40,
+ 0xfc63, 0xfc63, 0xbc48, 0xf60f, 0xff9a, 0xffdb, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xf7ff, 0xffde, 0xffff, 0xff99, 0xdd90, 0xe405, 0xe425,
+ 0xfc21, 0xfc22, 0xfc42, 0xfc22, 0xec23, 0xec03, 0xcc69, 0xfdcf, 0xffdb, 0xffdb,
+ 0xf7ff, 0xf7ff, 0xefff, 0xf7ff, 0xffbd, 0xffde, 0xffbb, 0xffbb, 0xffba, 0xffba,
+ 0xffdb, 0xfffb, 0xffba, 0xffba, 0xffdc, 0xfffd, 0xffde, 0xffde, 0xffff, 0xffff,
+ 0xffff, 0xf7df, 0xffbd, 0xffde, 0xfffd, 0xffdc, 0xffbb, 0xffbb, 0xffdc, 0xfffd,
+ 0xffdb, 0xff9a, 0xff9a, 0xffdb, 0xfffd, 0xffbc, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xffdc, 0xff9a, 0xfe11, 0xc44a, 0xfc42, 0xfc42, 0xfc40, 0xfc40, 0xfc40, 0xfc20,
+ 0xfc62, 0xfc62, 0xcc89, 0xfe0f, 0xff7a, 0xffbb, 0xfffe, 0xffff, 0xffde, 0xffde,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffff, 0xffb9, 0xdd91, 0xec46, 0xec66,
+ 0xfc62, 0xfc83, 0xfc83, 0xfc63, 0xf465, 0xf464, 0xd48a, 0xfdef, 0xffdb, 0xffdb,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffde, 0xffde, 0xffbb, 0xffbb, 0xffba, 0xffba,
+ 0xffbb, 0xffba, 0xffba, 0xffdb, 0xff9b, 0xff9b, 0xfffe, 0xfffe, 0xf7df, 0xffff,
+ 0xffff, 0xf7df, 0xffde, 0xfffe, 0xfffd, 0xffbc, 0xffba, 0xffdb, 0xffbc, 0xfffd,
+ 0xffdb, 0xffdb, 0xffba, 0xffba, 0xffdd, 0xffdc, 0xf7ff, 0xefff, 0xf7ff, 0xf7ff,
+ 0xffdc, 0xffbb, 0xf611, 0xb409, 0xfc22, 0xfc42, 0xfc20, 0xfc20, 0xfc20, 0xfc20,
+ 0xfc42, 0xf442, 0xbc27, 0xf5ef, 0xff9a, 0xffdb, 0xffde, 0xffde, 0xffde, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffdf, 0xf7ff, 0xf7ff, 0xffbf, 0xffff, 0xff99, 0xe571, 0xf404, 0xf424,
+ 0xfc00, 0xfc00, 0xfc20, 0xfc00, 0xfc22, 0xfc01, 0xcc69, 0xf5cf, 0xffdb, 0xfffc,
+ 0xefff, 0xefff, 0xefff, 0xefff, 0xfffe, 0xff7c, 0xfeb5, 0xfe53, 0xfe52, 0xfe72,
+ 0xfe52, 0xfe31, 0xfe73, 0xfed4, 0xe634, 0xee54, 0xffbc, 0xfffd, 0xf79d, 0xffff,
+ 0xffbe, 0xffde, 0xffdd, 0xffbc, 0xff17, 0xf675, 0xfe73, 0xfeb4, 0xf695, 0xf694,
+ 0xfe93, 0xfeb4, 0xfe73, 0xf632, 0xeef8, 0xfffc, 0xf7ff, 0xefff, 0xefff, 0xefff,
+ 0xfffd, 0xfffd, 0xf673, 0xbc8c, 0xe405, 0xec25, 0xfc22, 0xfc42, 0xfc21, 0xfc20,
+ 0xf423, 0xec02, 0xc469, 0xfe31, 0xffbc, 0xffdc, 0xffde, 0xffbe, 0xffbe, 0xffdf,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7df, 0xf7ff, 0xf7ff, 0xffbf, 0xffff, 0xff99, 0xe570, 0xf424, 0xfc25,
+ 0xfc00, 0xfc00, 0xfc20, 0xfc20, 0xfc22, 0xfc22, 0xcc8a, 0xfe10, 0xfffc, 0xfffc,
+ 0xefff, 0xefff, 0xefff, 0xefff, 0xfffe, 0xe6da, 0xd54f, 0xbc8c, 0xcc8a, 0xd4cb,
+ 0xd4cc, 0xccab, 0xbc8c, 0xcced, 0xac4c, 0xbcce, 0xff5b, 0xfffd, 0xf79d, 0xffff,
+ 0xffde, 0xffff, 0xffdd, 0xff5b, 0xddf3, 0xbcce, 0xbc6b, 0xcced, 0xc52f, 0xb4ad,
+ 0xc4ac, 0xcd0d, 0xbcac, 0xb44a, 0xc5d3, 0xffdb, 0xf7ff, 0xf7ff, 0xefff, 0xefff,
+ 0xfffd, 0xfffd, 0xfed5, 0xd56f, 0xe404, 0xe425, 0xfc22, 0xfc43, 0xfc41, 0xfc20,
+ 0xec03, 0xebe2, 0xe54d, 0xfeb3, 0xffdc, 0xffdc, 0xffdf, 0xffff, 0xffbe, 0xffbe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xccc8, 0xee0c, 0xfffd, 0xfffd,
+ 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffd9, 0xf716, 0xfd26, 0xdc22, 0xfbe0, 0xfc20,
+ 0xfc06, 0xfc06, 0xfc21, 0xfc21, 0xdc24, 0xe486, 0xff35, 0xffb7, 0xfffc, 0xfffd,
+ 0xfffd, 0xfffd, 0xffd9, 0xff77, 0xfd8c, 0xdc67, 0xf402, 0xfc23, 0xfc22, 0xfc22,
+ 0xf441, 0xf441, 0xdca4, 0xc3e1, 0xddef, 0xffb5, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xffff, 0xfffe, 0xfffa, 0xffda, 0xdc2a, 0xdc4b, 0xe465, 0xe465, 0xe485, 0xdc65,
+ 0xcc6d, 0xcc6c, 0xff9d, 0xff9d, 0xf7fc, 0xf7fd, 0xfffd, 0xfffd, 0xffdf, 0xffdf,
+ 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xf7fe, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xf7fe, 0xffff, 0xf7fe, 0xf7de,
+ 0xffff, 0xffff, 0xf7de, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffde,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffde, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffde, 0xffde, 0xffde, 0xffbd, 0xffbd, 0xff9d, 0xffde, 0xffff, 0xffff,
+ 0xefff, 0xefff, 0xe7ff, 0xefff, 0xfffe, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe,
+ 0xffde, 0xffbe, 0xffde, 0xffde, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xc4c7, 0xedec, 0xfffd, 0xffdd,
+ 0xffdf, 0xffdf, 0xffde, 0xffde, 0xfffa, 0xff16, 0xfd26, 0xe442, 0xfbe0, 0xfc20,
+ 0xfc06, 0xfc06, 0xfc21, 0xfc21, 0xdc24, 0xe466, 0xff15, 0xff97, 0xffdb, 0xffdc,
+ 0xfffc, 0xffdc, 0xffb9, 0xff57, 0xfd8c, 0xdc47, 0xf3e2, 0xfc23, 0xfc42, 0xfc42,
+ 0xfc61, 0xf441, 0xdca4, 0xcc01, 0xddef, 0xffb5, 0xfffd, 0xfffd, 0xf7ff, 0xf7ff,
+ 0xfffe, 0xf7fe, 0xffda, 0xffda, 0xfe32, 0xfdb0, 0xf528, 0xe4a6, 0xe4a6, 0xf528,
+ 0xf5d2, 0xfe54, 0xff9d, 0xffbd, 0xf7fd, 0xf7fd, 0xfffd, 0xfffd, 0xffdf, 0xffbf,
+ 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7df, 0xf7de, 0xf7ff, 0xf7ff,
+ 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffff, 0xffff, 0xf7ff, 0xf7de,
+ 0xffff, 0xfffe, 0xf7de, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xffde, 0xffde, 0xffbe, 0xffde, 0xffff, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffdf, 0xffbe, 0xffbe, 0xffbe, 0xffff, 0xffff,
+ 0xffde, 0xffde, 0xffde, 0xffde, 0xffbd, 0xffbd, 0xffbe, 0xffbe, 0xffdf, 0xffdf,
+ 0xefff, 0xefff, 0xe7ff, 0xe7ff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xd4c6, 0xfdeb, 0xfffd, 0xfffd,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffc, 0xef59, 0xf549, 0xd465, 0xfc00, 0xfc40,
+ 0xfc04, 0xfc04, 0xfc41, 0xfc41, 0xdc45, 0xe487, 0xff58, 0xffb9, 0xe7ff, 0xe7ff,
+ 0xe7ff, 0xe7ff, 0xffdb, 0xff79, 0xfd8c, 0xdc68, 0xfc02, 0xfc43, 0xfbe2, 0xfbe2,
+ 0xfc22, 0xfc01, 0xf465, 0xdba2, 0xf5b0, 0xff56, 0xffbf, 0xffbf, 0xffdf, 0xffdf,
+ 0xffdf, 0xffdf, 0xffde, 0xffbd, 0xffde, 0xff7d, 0xf6d5, 0xee73, 0xee72, 0xfed3,
+ 0xff7b, 0xffdd, 0xffdf, 0xffdf, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd, 0xffbf, 0xffbf,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xf7ff,
+ 0xffff, 0xf7ff, 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7de, 0xf7fe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7df, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7df,
+ 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xdcc6, 0xfdeb, 0xfffd, 0xfffd,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffc, 0xef39, 0xf569, 0xdc65, 0xfc00, 0xfc61,
+ 0xfc24, 0xfc24, 0xfc61, 0xfc61, 0xdc66, 0xeca7, 0xff58, 0xffda, 0xe7ff, 0xefff,
+ 0xefff, 0xe7ff, 0xfffb, 0xff9a, 0xfdac, 0xdc88, 0xfc22, 0xfc63, 0xfc02, 0xfc02,
+ 0xfc22, 0xfc02, 0xf465, 0xdba2, 0xf5b0, 0xff56, 0xffbf, 0xffbf, 0xffdf, 0xffdf,
+ 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffbe, 0xffbe, 0xffb8, 0xffb8, 0xffb7, 0xffb7,
+ 0xffbc, 0xffbc, 0xffdf, 0xffdf, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd, 0xffbf, 0xffbf,
+ 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xf7fe,
+ 0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffff, 0xf7ff,
+ 0xffff, 0xf7ff, 0xf7bf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7de, 0xf7de, 0xf7fe,
+ 0xffff, 0xfffe, 0xf7de, 0xf7de, 0xf7df, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff,
+ 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xf7df, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xf7ff,
+ 0xf7df, 0xf7df, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe484, 0xfda9, 0xfffd, 0xffdc,
+ 0xf7ff, 0xefff, 0xf7ff, 0xf7ff, 0xfffd, 0xe75a, 0xed6b, 0xd487, 0xfc20, 0xfc61,
+ 0xfc22, 0xfc02, 0xfc41, 0xfc20, 0xdc45, 0xe487, 0xff59, 0xffda, 0xe7ff, 0xe7ff,
+ 0xe7ff, 0xdfff, 0xfffc, 0xff7a, 0xfd8c, 0xdc67, 0xfc01, 0xfc42, 0xfc42, 0xfc42,
+ 0xfc63, 0xfc62, 0xe4a6, 0xcbe3, 0xe5f0, 0xff96, 0xfffe, 0xfffe, 0xefff, 0xefff,
+ 0xefff, 0xefff, 0xf7fe, 0xf7fe, 0xe7ff, 0xe7ff, 0xf7fe, 0xf7fe, 0xfffb, 0xfffb,
+ 0xffde, 0xffde, 0xefff, 0xefff, 0xefff, 0xefff, 0xf7fe, 0xf7fe, 0xffbf, 0xffbe,
+ 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xffbd, 0xffbd, 0xffdc, 0xffdc, 0xffdb, 0xffdb,
+ 0xffdc, 0xffdb, 0xffdc, 0xfffd, 0xfffd, 0xfffd, 0xffdf, 0xffff, 0xffbe, 0xffbe,
+ 0xffdf, 0xffde, 0xffbe, 0xffdf, 0xffbe, 0xffbe, 0xffbe, 0xffde, 0xffbe, 0xffde,
+ 0xffde, 0xffde, 0xf7ff, 0xefff, 0xefde, 0xf7fe, 0xfffd, 0xfffd, 0xffdb, 0xfffc,
+ 0xffdc, 0xffdb, 0xffdc, 0xffdc, 0xffde, 0xffde, 0xf7df, 0xf7df, 0xf7fe, 0xf7fe,
+ 0xf7ff, 0xf7ff, 0xefff, 0xefff, 0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xfffd, 0xfffd,
+ 0xffda, 0xffb9, 0xff98, 0xffd8, 0xf7de, 0xf7de, 0xf7de, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe484, 0xfda9, 0xfffd, 0xfffd,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffd, 0xef5a, 0xf56b, 0xd487, 0xfc20, 0xfc41,
+ 0xfc02, 0xfbe2, 0xfc41, 0xfc20, 0xdc46, 0xe4a7, 0xff59, 0xffdb, 0xe7ff, 0xe7ff,
+ 0xe7ff, 0xe7ff, 0xfffc, 0xff9a, 0xfdac, 0xdc67, 0xfc01, 0xfc42, 0xfc42, 0xfc42,
+ 0xfc63, 0xfc62, 0xe4a6, 0xcbe3, 0xe5d0, 0xff76, 0xfffe, 0xfffe, 0xefff, 0xefff,
+ 0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xe7ff, 0xe7ff, 0xf7fe, 0xf7fe, 0xfffb, 0xfffc,
+ 0xffff, 0xffff, 0xefff, 0xefff, 0xefff, 0xefff, 0xf7fe, 0xf7fe, 0xffbf, 0xffbf,
+ 0xefff, 0xf7ff, 0xffff, 0xf7ff, 0xffdd, 0xfffe, 0xfffc, 0xffbc, 0xffdb, 0xffdb,
+ 0xffdb, 0xffbb, 0xffdc, 0xffdc, 0xfffd, 0xfffd, 0xffdf, 0xffff, 0xffdf, 0xffbe,
+ 0xffdf, 0xffde, 0xffde, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xfffe,
+ 0xfffe, 0xfffe, 0xf7ff, 0xefff, 0xf7de, 0xf7ff, 0xfffd, 0xffdd, 0xffbb, 0xffbb,
+ 0xff9a, 0xffbb, 0xffdc, 0xffdc, 0xffde, 0xffde, 0xffff, 0xffff, 0xf7fe, 0xf7fe,
+ 0xf7ff, 0xf7ff, 0xefff, 0xefff, 0xefff, 0xefff, 0xffff, 0xf7ff, 0xffbc, 0xffdd,
+ 0xffb9, 0xff99, 0xff77, 0xffb8, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe485, 0xfdaa, 0xfffd, 0xfffc,
+ 0xffff, 0xffff, 0xffbb, 0xffbb, 0xffb8, 0xfed4, 0xfd29, 0xdc46, 0xfc21, 0xfc42,
+ 0xfc22, 0xfc02, 0xfc41, 0xfc41, 0xec24, 0xf485, 0xff15, 0xff96, 0xffba, 0xffbb,
+ 0xffdb, 0xffbb, 0xffb7, 0xff56, 0xfd8b, 0xec66, 0xfc21, 0xfc63, 0xfc62, 0xfc82,
+ 0xfc82, 0xf462, 0xe4a5, 0xcbe2, 0xedcd, 0xff73, 0xffd9, 0xffd9, 0xfffb, 0xfffb,
+ 0xfffb, 0xfffb, 0xfff9, 0xfff9, 0xffbd, 0xffdd, 0xffda, 0xffda, 0xffb9, 0xffb9,
+ 0xffba, 0xffba, 0xffde, 0xffde, 0xefff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdf, 0xffde,
+ 0xefdf, 0xf7ff, 0xfffe, 0xffde, 0xffda, 0xffda, 0xff56, 0xfed4, 0xfe30, 0xfe2f,
+ 0xfe2f, 0xfe0f, 0xfe30, 0xfe30, 0xf651, 0xf672, 0xfed5, 0xff58, 0xffb9, 0xffb9,
+ 0xff99, 0xff37, 0xfef6, 0xff37, 0xff36, 0xff37, 0xff37, 0xff37, 0xff37, 0xff37,
+ 0xff57, 0xff57, 0xfffb, 0xffdb, 0xffd9, 0xffd9, 0xff35, 0xfe72, 0xfdef, 0xfe0f,
+ 0xfdef, 0xfe0f, 0xfe30, 0xfe31, 0xf673, 0xfed4, 0xff78, 0xffda, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xffdc, 0xffdb, 0xff9a, 0xff99, 0xff16, 0xfe94, 0xfe10, 0xfe10,
+ 0xfe0f, 0xfe0e, 0xfded, 0xfe0d, 0xf6b4, 0xff36, 0xffb9, 0xffda, 0xffdc, 0xffbb,
+ 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe485, 0xfdaa, 0xfffc, 0xffbc,
+ 0xffdf, 0xf79e, 0xff59, 0xff59, 0xff56, 0xfeb4, 0xf508, 0xdc45, 0xfc21, 0xfc62,
+ 0xfc43, 0xfc43, 0xfc41, 0xfc21, 0xe404, 0xec45, 0xfed4, 0xff55, 0xff79, 0xff7a,
+ 0xff9a, 0xff9a, 0xff77, 0xff15, 0xfd6a, 0xec46, 0xfc01, 0xfc42, 0xfc62, 0xfc62,
+ 0xfc82, 0xf462, 0xe4a5, 0xcbe2, 0xedcd, 0xff73, 0xff98, 0xff98, 0xf79a, 0xf79a,
+ 0xf799, 0xff9a, 0xff98, 0xff98, 0xf79d, 0xf79c, 0xff79, 0xff59, 0xff57, 0xff57,
+ 0xff79, 0xff7a, 0xffdd, 0xffde, 0xefff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdf, 0xffdf,
+ 0xf7ff, 0xf7ff, 0xffde, 0xffbd, 0xff99, 0xff79, 0xfe72, 0xed8f, 0xcc28, 0xc428,
+ 0xc427, 0xc407, 0xbc28, 0xbc28, 0xb44a, 0xb46a, 0xd570, 0xf674, 0xff58, 0xff99,
+ 0xff57, 0xfe95, 0xee12, 0xf633, 0xee12, 0xee12, 0xf612, 0xf633, 0xf632, 0xf633,
+ 0xf633, 0xf633, 0xffbb, 0xffdb, 0xffb9, 0xff58, 0xf631, 0xc4ec, 0xc428, 0xc428,
+ 0xcc48, 0xc428, 0xbc28, 0xbc08, 0xbc6b, 0xd54f, 0xf695, 0xff99, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xffdc, 0xffdb, 0xff99, 0xff99, 0xe5d1, 0xc4cd, 0xb3e8, 0xbc08,
+ 0xcc27, 0xcc27, 0xcc06, 0xcc06, 0xb48c, 0xd5b0, 0xf717, 0xffba, 0xffdc, 0xffbb,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xd487, 0xfdcc, 0xf79b, 0xffdc,
+ 0xff9a, 0xbd10, 0xd383, 0xec46, 0xf403, 0xf424, 0xf444, 0xfc64, 0xfc42, 0xfc42,
+ 0xfc44, 0xfc43, 0xfc43, 0xfc63, 0xfc64, 0xfc43, 0xf444, 0xf424, 0xe445, 0xec45,
+ 0xe445, 0xe445, 0xec45, 0xec45, 0xfc64, 0xfc64, 0xfc63, 0xfc63, 0xfc22, 0xfc01,
+ 0xfc02, 0xfc22, 0xfc23, 0xfc03, 0xfc05, 0xfbe4, 0xec26, 0xec26, 0xe427, 0xe427,
+ 0xec06, 0xec06, 0xf3e5, 0xf3e5, 0xe426, 0xe446, 0xec25, 0xec25, 0xec24, 0xec24,
+ 0xd446, 0xd426, 0xa42b, 0xf6b4, 0xfffe, 0xfffe, 0xf7df, 0xf7ff, 0xffff, 0xfffe,
+ 0xffdd, 0xff9c, 0xff79, 0xff58, 0xfdcf, 0xcc28, 0xdba2, 0xf445, 0xfc43, 0xfc22,
+ 0xfc21, 0xfc42, 0xfc42, 0xf422, 0xf423, 0xf463, 0xec86, 0xdc04, 0xdc25, 0xfe0c,
+ 0xff10, 0xfd49, 0xd3c3, 0xec86, 0xec65, 0xe424, 0xe424, 0xe445, 0xe424, 0xe424,
+ 0xec85, 0xfd07, 0xff94, 0xfef2, 0xfd8b, 0xd446, 0xdbe3, 0xec65, 0xfc63, 0xfc43,
+ 0xfc22, 0xfc22, 0xfc43, 0xfc43, 0xec44, 0xec24, 0xdc04, 0xdc04, 0xf717, 0xfffb,
+ 0xffb8, 0xff97, 0xffb5, 0xfeb1, 0xf4c7, 0xe465, 0xec23, 0xf443, 0xf423, 0xf402,
+ 0xec23, 0xf444, 0xf465, 0xec44, 0xf4c6, 0xdbe2, 0xcc05, 0xfd4b, 0xfef4, 0xff97,
+ 0xffdb, 0xfffb, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xd467, 0xfdac, 0xef5a, 0xffbb,
+ 0xff59, 0xbcef, 0xd383, 0xec46, 0xf424, 0xf444, 0xfc44, 0xfc64, 0xfc42, 0xfc42,
+ 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xfc43, 0xfc23, 0xec24, 0xec04, 0xe425, 0xe425,
+ 0xe425, 0xe425, 0xec04, 0xec24, 0xf423, 0xf443, 0xfc22, 0xfc22, 0xfc22, 0xfc22,
+ 0xfc02, 0xfc02, 0xfc23, 0xfc23, 0xfc25, 0xfc05, 0xec06, 0xec06, 0xe407, 0xe407,
+ 0xebe6, 0xebe6, 0xf3c5, 0xf3c5, 0xdc05, 0xdc25, 0xec05, 0xec05, 0xec04, 0xe404,
+ 0xd426, 0xcc06, 0xac4b, 0xf694, 0xffde, 0xfffe, 0xf7df, 0xf7ff, 0xffff, 0xf7de,
+ 0xff7b, 0xffdd, 0xff38, 0xd591, 0xcc28, 0xd469, 0xf465, 0xec45, 0xfc22, 0xfc22,
+ 0xfc21, 0xfc42, 0xfc63, 0xfc42, 0xf463, 0xfc84, 0xdc25, 0xdc04, 0xd404, 0xf4e8,
+ 0xfdcb, 0xf4c7, 0xd3e3, 0xe465, 0xec65, 0xe445, 0xe445, 0xec65, 0xec65, 0xe444,
+ 0xf4a6, 0xfd28, 0xfef2, 0xf5ad, 0xdc67, 0xd446, 0xec65, 0xec65, 0xfc43, 0xfc43,
+ 0xfc22, 0xfc22, 0xfc43, 0xfc43, 0xec45, 0xec44, 0xe445, 0xe425, 0xbd30, 0xde74,
+ 0xff77, 0xffb8, 0xfeb1, 0xed6c, 0xec86, 0xec66, 0xf423, 0xf443, 0xfc43, 0xfc43,
+ 0xf444, 0xf464, 0xf444, 0xec24, 0xec64, 0xe403, 0xd426, 0xe4a8, 0xe5cf, 0xfef4,
+ 0xffdb, 0xffdb, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xdca9, 0xfded, 0xf7bb, 0xfffc,
+ 0xff97, 0xd52e, 0xfba0, 0xfc62, 0xfc00, 0xfc00, 0xfc43, 0xfc43, 0xfc43, 0xfc43,
+ 0xfc64, 0xfc64, 0xfc43, 0xfc63, 0xfc63, 0xfc63, 0xfc41, 0xfc41, 0xfc41, 0xfc41,
+ 0xfc41, 0xfc41, 0xfc41, 0xfc41, 0xfc63, 0xfc63, 0xfc83, 0xfc63, 0xfc42, 0xfc42,
+ 0xfc22, 0xfc22, 0xfc23, 0xfc23, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc62, 0xfc62,
+ 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc40, 0xfc40, 0xfc20, 0xfc21, 0xfc41, 0xfc21,
+ 0xfc62, 0xfc62, 0xcc68, 0xfe90, 0xff9c, 0xffdd, 0xffdf, 0xf7bf, 0xfffd, 0xf7dd,
+ 0xff97, 0xfe93, 0xed0b, 0xd427, 0xebe4, 0xf465, 0xfc42, 0xfc22, 0xfc20, 0xfc00,
+ 0xfc00, 0xfc00, 0xfc20, 0xfc00, 0xfc20, 0xfc40, 0xfc41, 0xfc82, 0xfc41, 0xfc41,
+ 0xfcc3, 0xfca3, 0xfc41, 0xfc82, 0xfc41, 0xfc20, 0xfc21, 0xfc61, 0xfc41, 0xfc40,
+ 0xfc61, 0xfcc3, 0xfd48, 0xec64, 0xe3e1, 0xf443, 0xfc83, 0xfc42, 0xfc00, 0xfc41,
+ 0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc21, 0xfc21, 0xbbe6, 0xd4a9,
+ 0xfe6f, 0xfed0, 0xfd07, 0xe403, 0xfc21, 0xfc41, 0xfc20, 0xfc20, 0xfc20, 0xfc41,
+ 0xfc62, 0xfc42, 0xfc43, 0xfc43, 0xfc00, 0xfc00, 0xfc42, 0xf401, 0xd468, 0xfe0e,
+ 0xffda, 0xffba, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xd488, 0xfdad, 0xef7a, 0xffdc,
+ 0xff77, 0xd50e, 0xfb80, 0xfc41, 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc43, 0xfc43,
+ 0xfc44, 0xfc44, 0xfc22, 0xfc23, 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc20, 0xfc20,
+ 0xfc41, 0xfc41, 0xfc41, 0xfc41, 0xfc62, 0xfc62, 0xfc63, 0xfc43, 0xfc62, 0xfc42,
+ 0xfc22, 0xfc22, 0xfc23, 0xfc23, 0xfc43, 0xfc23, 0xfc63, 0xfc63, 0xfc62, 0xfc62,
+ 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc40, 0xfc40, 0xfc20, 0xfc20, 0xfc21, 0xfc20,
+ 0xfc62, 0xfc62, 0xcca8, 0xfe90, 0xffbc, 0xffdd, 0xffdf, 0xf79f, 0xfffd, 0xffdd,
+ 0xff35, 0xe52d, 0xc3c6, 0xd448, 0xfca6, 0xf465, 0xfc01, 0xfc62, 0xfc20, 0xfc00,
+ 0xfc20, 0xfc20, 0xfc40, 0xfc40, 0xfc40, 0xfc40, 0xfc41, 0xfca3, 0xfc62, 0xfc00,
+ 0xfc41, 0xfc61, 0xfc41, 0xfc62, 0xfc62, 0xfc61, 0xfc61, 0xfc82, 0xfc61, 0xfc41,
+ 0xfc41, 0xfc82, 0xe444, 0xec64, 0xf484, 0xf484, 0xfc42, 0xfc42, 0xfc41, 0xfc41,
+ 0xfc20, 0xfc20, 0xfc20, 0xfc20, 0xfc21, 0xfc21, 0xfc41, 0xfc41, 0xcc89, 0xcc68,
+ 0xf50a, 0xfd6b, 0xf485, 0xe403, 0xfc42, 0xfc42, 0xfc61, 0xfc41, 0xfc21, 0xfc41,
+ 0xfc62, 0xfc42, 0xfc43, 0xfc64, 0xfc40, 0xfc40, 0xfca4, 0xf401, 0xcbe6, 0xfd6c,
+ 0xff99, 0xffda, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xe467, 0xfd8b, 0xef7a, 0xffdc,
+ 0xff77, 0xcd2e, 0xfb80, 0xfc61, 0xfc21, 0xfc21, 0xfc45, 0xfc45, 0xfc43, 0xfc43,
+ 0xfc42, 0xfc42, 0xfc22, 0xfc22, 0xfc22, 0xfc22, 0xfc21, 0xfc21, 0xfc21, 0xfc41,
+ 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfca2, 0xf482,
+ 0xfc62, 0xfc62, 0xfc63, 0xfc63, 0xfc63, 0xfc63, 0xf482, 0xf482, 0xf4a1, 0xf4a1,
+ 0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc40, 0xfc40, 0xfc02, 0xfc02, 0xfc01, 0xfbe1,
+ 0xf441, 0xf441, 0xcc87, 0xfe8f, 0xffbc, 0xffdd, 0xffbf, 0xff9e, 0xfffc, 0xffdb,
+ 0xfca6, 0xfc85, 0xfc64, 0xf443, 0xfc22, 0xfc42, 0xfc42, 0xfc41, 0xfc20, 0xfc21,
+ 0xfc41, 0xfc42, 0xfc63, 0xfc83, 0xfc84, 0xfc63, 0xfc22, 0xfc62, 0xfc62, 0xfc42,
+ 0xfc42, 0xfc42, 0xfc22, 0xfc42, 0xfc62, 0xfc42, 0xfc42, 0xfc62, 0xfc42, 0xfc21,
+ 0xfc21, 0xfc41, 0xe465, 0xec85, 0xec85, 0xec44, 0xec03, 0xec24, 0xfc44, 0xfc64,
+ 0xfc63, 0xfc63, 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc41, 0xfc41, 0xfca4, 0xfc43,
+ 0xf423, 0xfc64, 0xfcc6, 0xf485, 0xec64, 0xec64, 0xf485, 0xf465, 0xfc64, 0xfc84,
+ 0xfc63, 0xfc43, 0xfc22, 0xfc22, 0xfc40, 0xfc20, 0xfc62, 0xfc22, 0xcc06, 0xe4c9,
+ 0xf6d6, 0xffba, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xe487, 0xfdcc, 0xf7bb, 0xfffc,
+ 0xffb8, 0xcd4e, 0xfba0, 0xfc62, 0xfc00, 0xfc00, 0xfc45, 0xfc45, 0xfc43, 0xfc43,
+ 0xfc42, 0xfc42, 0xfc43, 0xfc43, 0xfc63, 0xfc43, 0xfc42, 0xfc42, 0xfc42, 0xfc42,
+ 0xfc42, 0xfc42, 0xfc42, 0xfc42, 0xfc62, 0xfc62, 0xfc63, 0xfc62, 0xf4a2, 0xf482,
+ 0xfc62, 0xfc62, 0xfc63, 0xfc63, 0xfc63, 0xfc42, 0xf482, 0xf482, 0xf4a1, 0xf4a1,
+ 0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc60, 0xfc60, 0xfc22, 0xfc42, 0xfc22, 0xfc22,
+ 0xfc62, 0xfc61, 0xcc46, 0xfe8f, 0xff9c, 0xffbc, 0xffdf, 0xffdf, 0xffdb, 0xef38,
+ 0xe3e3, 0xec44, 0xfc64, 0xf443, 0xfc22, 0xfc63, 0xfc42, 0xfc21, 0xfc41, 0xfc41,
+ 0xfc21, 0xfc21, 0xfc22, 0xfc42, 0xfc23, 0xf402, 0xfc42, 0xfc21, 0xfc22, 0xfc83,
+ 0xfc83, 0xfc42, 0xfc42, 0xfc63, 0xfc21, 0xfc21, 0xfc41, 0xfc42, 0xfc42, 0xfc41,
+ 0xfc41, 0xfc21, 0xf4c6, 0xe444, 0xdc03, 0xe424, 0xec44, 0xec24, 0xf403, 0xf423,
+ 0xfc43, 0xfc43, 0xfc42, 0xfc42, 0xfc41, 0xfc21, 0xfc21, 0xfc21, 0xfc84, 0xfc84,
+ 0xf443, 0xf443, 0xf485, 0xec64, 0xdc03, 0xdbe3, 0xe3e3, 0xe423, 0xf423, 0xf443,
+ 0xfc22, 0xfc43, 0xfc22, 0xfc22, 0xfc40, 0xfc00, 0xfc42, 0xfc62, 0xdc68, 0xdc67,
+ 0xddf2, 0xffb9, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xec24, 0xfd69, 0xef7a, 0xf7bb,
+ 0xff78, 0xb50e, 0xe360, 0xfc22, 0xfc03, 0xfc03, 0xf447, 0xf467, 0xfc43, 0xfc44,
+ 0xfc40, 0xfc40, 0xfc01, 0xfc01, 0xfc22, 0xfc02, 0xfc03, 0xf403, 0xec04, 0xec24,
+ 0xec04, 0xec03, 0xf402, 0xf403, 0xfc01, 0xfc22, 0xfc21, 0xfc01, 0xfc42, 0xfc42,
+ 0xfc03, 0xfc23, 0xfc04, 0xfc04, 0xfc03, 0xfbe3, 0xf3e1, 0xfbe1, 0xf401, 0xf401,
+ 0xf3e3, 0xf3e3, 0xfbe4, 0xfbe4, 0xec84, 0xec84, 0xf447, 0xf447, 0xfc25, 0xfc25,
+ 0xec63, 0xe462, 0xc487, 0xfeb0, 0xff9c, 0xff9c, 0xffde, 0xffde, 0xff37, 0xcdd1,
+ 0xfc21, 0xfbc0, 0xfbc0, 0xfc41, 0xfc82, 0xfc41, 0xfc22, 0xfc42, 0xfc84, 0xfc64,
+ 0xec45, 0xec24, 0xe466, 0xeca8, 0xecc9, 0xeca9, 0xfc65, 0xf3c3, 0xf3c2, 0xfc24,
+ 0xfc24, 0xfc03, 0xfc03, 0xfc23, 0xfc23, 0xfc24, 0xfc44, 0xfc44, 0xfc44, 0xfc44,
+ 0xfc44, 0xfc23, 0xdcc8, 0xe4c8, 0xed0a, 0xfd8c, 0xfdcd, 0xfdad, 0xf50a, 0xeca9,
+ 0xec67, 0xec67, 0xf466, 0xf466, 0xfc64, 0xfc64, 0xfc43, 0xfc42, 0xfc20, 0xfc00,
+ 0xfc83, 0xfc42, 0xd425, 0xe4a7, 0xe56d, 0xdd4c, 0xe54d, 0xe56d, 0xece9, 0xdc67,
+ 0xf3e3, 0xfc44, 0xfc42, 0xfc21, 0xfc40, 0xfc40, 0xfc43, 0xfc84, 0xdcca, 0xc428,
+ 0xc571, 0xffdb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc03, 0xfc23,
+ 0xfc20, 0xfc20, 0xfc43, 0xfc43, 0xfc22, 0xfc22, 0xec45, 0xfd8a, 0xf79b, 0xfffc,
+ 0xffb9, 0xbd2f, 0xe380, 0xfc22, 0xfbe2, 0xfc02, 0xec46, 0xf467, 0xfc44, 0xfc44,
+ 0xfc40, 0xfc40, 0xfc22, 0xfc42, 0xfc43, 0xfc43, 0xfc24, 0xfc23, 0xf424, 0xf424,
+ 0xf424, 0xec24, 0xf423, 0xfc43, 0xfc42, 0xfc63, 0xfc62, 0xfc42, 0xfc42, 0xfc41,
+ 0xfc23, 0xfc23, 0xfc04, 0xfbe4, 0xfbe3, 0xfbc3, 0xfc22, 0xfc22, 0xfc42, 0xfc42,
+ 0xfc23, 0xfc23, 0xfc04, 0xfc04, 0xeca5, 0xeca5, 0xfc67, 0xfc67, 0xfc46, 0xfc45,
+ 0xec83, 0xec83, 0xcca8, 0xfed0, 0xff9c, 0xff9c, 0xffde, 0xffbe, 0xe694, 0xa48c,
+ 0xfc00, 0xfc20, 0xfc41, 0xfc62, 0xfc41, 0xfc01, 0xfc22, 0xfc63, 0xfc43, 0xf423,
+ 0xec24, 0xec45, 0xf4c8, 0xfd8b, 0xfe0e, 0xfe0e, 0xfd28, 0xfc65, 0xfc24, 0xfc44,
+ 0xfc44, 0xfc44, 0xfc65, 0xfc44, 0xfc44, 0xfc44, 0xfc44, 0xfc44, 0xfc44, 0xfc44,
+ 0xfc23, 0xfc03, 0xd446, 0xfdcc, 0xff12, 0xff73, 0xff74, 0xff74, 0xfe90, 0xfd6c,
+ 0xe446, 0xe467, 0xf466, 0xf466, 0xfc84, 0xfc64, 0xfc63, 0xfc63, 0xfc40, 0xfba0,
+ 0xfc83, 0xfc62, 0xcbc4, 0xfd6a, 0xffb6, 0xff75, 0xffb6, 0xff75, 0xfe2e, 0xe4a8,
+ 0xf3c2, 0xfc23, 0xfc41, 0xfc01, 0xfc00, 0xfc60, 0xfc43, 0xfc43, 0xd4aa, 0xbbe7,
+ 0xbd10, 0xffdb, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xdc46, 0xfd6a, 0xf7fe, 0xefbe,
+ 0xffde, 0xf79d, 0xf7bd, 0xf79d, 0xff99, 0xeed7, 0xfce5, 0xec22, 0xfc22, 0xfc63,
+ 0xfc23, 0xfc22, 0xfc22, 0xfc21, 0xe424, 0xec65, 0xe754, 0xf7b5, 0xff5b, 0xff7b,
+ 0xff9d, 0xff7d, 0xf7f9, 0xef98, 0xfd6b, 0xe447, 0xfc00, 0xfc41, 0xfc41, 0xfc62,
+ 0xfc62, 0xfc42, 0xfc45, 0xe382, 0xd5ef, 0xff95, 0xffbc, 0xf79b, 0xff9d, 0xff9d,
+ 0xe7f8, 0xeff9, 0xfe30, 0xd449, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc41, 0xfc21,
+ 0xfc42, 0xfc22, 0xc449, 0xfe91, 0xfffd, 0xffdc, 0xffda, 0xff79, 0xfdac, 0xdc67,
+ 0xfc02, 0xfc02, 0xfbe1, 0xfc02, 0xfc02, 0xfc02, 0xfc44, 0xfc24, 0xdcea, 0xc427,
+ 0xc58f, 0xff77, 0xfffc, 0xffdb, 0xffdc, 0xfffd, 0xff99, 0xff79, 0xfed4, 0xd52e,
+ 0xd426, 0xe4a8, 0xfc64, 0xfc85, 0xfc22, 0xfc42, 0xfc02, 0xfc01, 0xfc43, 0xfc23,
+ 0xfbe3, 0xfbe3, 0xd6b5, 0xf799, 0xffbd, 0xffde, 0xffbf, 0xff9e, 0xfffc, 0xffba,
+ 0xdd2a, 0xcca8, 0xfc02, 0xfc63, 0xfc21, 0xfc01, 0xfc62, 0xfc62, 0xfc41, 0xfc20,
+ 0xd447, 0xd447, 0xeef7, 0xffbb, 0xfffe, 0xffdd, 0xfffe, 0xffbc, 0xfffb, 0xf739,
+ 0xe4ea, 0xcc27, 0xfc20, 0xfc40, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe487, 0xfdab, 0xffff, 0xf7ff,
+ 0xfffe, 0xffde, 0xfffe, 0xfffe, 0xffda, 0xeef7, 0xfd06, 0xec22, 0xfc22, 0xfc63,
+ 0xfc23, 0xfc22, 0xfc42, 0xfc22, 0xe445, 0xec86, 0xef95, 0xfff6, 0xff9c, 0xffdd,
+ 0xffbe, 0xff9e, 0xfffa, 0xf7b9, 0xfd8c, 0xe467, 0xfc21, 0xfc62, 0xfc41, 0xfc62,
+ 0xfc62, 0xfc42, 0xfc45, 0xe382, 0xd60f, 0xff95, 0xfffd, 0xffdc, 0xffde, 0xffde,
+ 0xeff9, 0xf7fa, 0xfe71, 0xdc8a, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc41, 0xfc21,
+ 0xfc42, 0xfc22, 0xc449, 0xfe71, 0xfffc, 0xffdc, 0xffba, 0xff17, 0xfd4a, 0xdc67,
+ 0xfc02, 0xfc02, 0xfbe1, 0xfc02, 0xfc02, 0xfc02, 0xfc24, 0xfc24, 0xcc48, 0xed6c,
+ 0xf736, 0xffb8, 0xffdb, 0xfffb, 0xfffd, 0xf7bc, 0xfffb, 0xffba, 0xff56, 0xf632,
+ 0xed0a, 0xdc88, 0xebe2, 0xfc43, 0xfc22, 0xfc42, 0xfc22, 0xfc01, 0xfc44, 0xfc23,
+ 0xfbe3, 0xfc03, 0xe717, 0xffda, 0xffbd, 0xffde, 0xffbf, 0xff9e, 0xfffc, 0xffbb,
+ 0xedac, 0xd4e9, 0xfc02, 0xfc43, 0xfc21, 0xfc01, 0xfc62, 0xfc62, 0xfc40, 0xfc00,
+ 0xd468, 0xdcc9, 0xf718, 0xffbb, 0xfffd, 0xfffd, 0xfffd, 0xffbc, 0xfffc, 0xff59,
+ 0xed0a, 0xcc27, 0xfc20, 0xfc40, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe486, 0xfdab, 0xffff, 0xf7ff,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd26, 0xf423, 0xfc22, 0xfc62,
+ 0xfc23, 0xfc23, 0xfc22, 0xfc02, 0xe425, 0xec86, 0xef95, 0xfff7, 0xffbd, 0xffdd,
+ 0xffbf, 0xff9e, 0xfffb, 0xf7b9, 0xfd8c, 0xe467, 0xfc01, 0xfc42, 0xfc42, 0xfc62,
+ 0xfc42, 0xfc42, 0xfc45, 0xe382, 0xd60f, 0xff96, 0xfffd, 0xffbd, 0xffdf, 0xffde,
+ 0xeffa, 0xf7fa, 0xfe51, 0xdc6a, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc42, 0xfc42,
+ 0xfc43, 0xf423, 0xc469, 0xfe71, 0xffdc, 0xffdc, 0xff98, 0xf694, 0xf4c7, 0xec65,
+ 0xfc42, 0xfc42, 0xfc21, 0xfc42, 0xfc22, 0xfc22, 0xf445, 0xf424, 0xd4cb, 0xfeb2,
+ 0xfffb, 0xffda, 0xf7bd, 0xfffd, 0xffff, 0xf7be, 0xfffd, 0xf7bc, 0xffda, 0xff78,
+ 0xfe50, 0xe4ea, 0xe3c2, 0xfc65, 0xfc22, 0xfc42, 0xfc22, 0xfc21, 0xfc64, 0xfc23,
+ 0xf403, 0xfc04, 0xef99, 0xfffb, 0xffbe, 0xffbe, 0xffbf, 0xff9f, 0xfffc, 0xffdb,
+ 0xfe4f, 0xdd4b, 0xfc02, 0xfc23, 0xfc01, 0xfc01, 0xfc62, 0xfc83, 0xfc41, 0xfc00,
+ 0xdca9, 0xf56c, 0xff7a, 0xffbb, 0xfffe, 0xfffe, 0xfffe, 0xf7bd, 0xfffc, 0xff9a,
+ 0xf56c, 0xd468, 0xfc20, 0xfc20, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe466, 0xfd8b, 0xffff, 0xf7ff,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd26, 0xec22, 0xfc21, 0xfc62,
+ 0xfc23, 0xfc23, 0xfc02, 0xfc01, 0xe424, 0xec66, 0xef95, 0xfff7, 0xffbd, 0xffdd,
+ 0xffbf, 0xff9e, 0xfffb, 0xf7b9, 0xfd8c, 0xe447, 0xfbe1, 0xfc22, 0xfc42, 0xfc62,
+ 0xfc42, 0xfc42, 0xfc45, 0xe382, 0xd60f, 0xffb6, 0xffdd, 0xffbc, 0xffde, 0xffbe,
+ 0xeff9, 0xeffa, 0xfe51, 0xdc49, 0xfc21, 0xfc21, 0xfc42, 0xfc42, 0xfc42, 0xfc42,
+ 0xfc43, 0xf423, 0xcc8a, 0xfe92, 0xffdc, 0xfffc, 0xff98, 0xe633, 0xec65, 0xec86,
+ 0xfc62, 0xfc62, 0xfc42, 0xfc42, 0xfc22, 0xfc22, 0xf425, 0xec04, 0xfe51, 0xff55,
+ 0xffda, 0xffba, 0xfffe, 0xfffe, 0xf7be, 0xffff, 0xffdd, 0xf7bc, 0xffda, 0xffda,
+ 0xff13, 0xf56c, 0xebe2, 0xfc85, 0xfc22, 0xfc42, 0xfc22, 0xfc22, 0xfc64, 0xfc23,
+ 0xfc04, 0xfc24, 0xf7ba, 0xfffb, 0xff9d, 0xffbe, 0xffbf, 0xffbf, 0xfffc, 0xffdb,
+ 0xfed1, 0xe56b, 0xfc02, 0xfc02, 0xfc01, 0xfc01, 0xfc62, 0xfc62, 0xfc41, 0xfbe0,
+ 0xdcca, 0xfe2f, 0xffbb, 0xffdb, 0xffdd, 0xfffe, 0xffdd, 0xffdd, 0xfffc, 0xffbb,
+ 0xfdcd, 0xd489, 0xfc00, 0xfc20, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfdab, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd06, 0xec02, 0xfc02, 0xfc63,
+ 0xfc23, 0xfc23, 0xfc43, 0xfc22, 0xe425, 0xec86, 0xefb6, 0xfff8, 0xffde, 0xffde,
+ 0xffdf, 0xffbf, 0xfffb, 0xf7ba, 0xfd8c, 0xe468, 0xfc02, 0xfc42, 0xfc42, 0xfc62,
+ 0xfc42, 0xfc22, 0xfc45, 0xe382, 0xd610, 0xffb7, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+ 0xeffb, 0xf7fb, 0xfe52, 0xdc6a, 0xfc01, 0xfc21, 0xfc42, 0xfc62, 0xfc62, 0xfc42,
+ 0xf444, 0xf424, 0xccaa, 0xfe92, 0xffdb, 0xfffc, 0xff97, 0xe5d0, 0xec02, 0xfc85,
+ 0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc43, 0xfc42, 0xec46, 0xe425, 0xff56, 0xff77,
+ 0xff9b, 0xffbc, 0xffff, 0xffff, 0xefbf, 0xffff, 0xefff, 0xefff, 0xfffd, 0xffdc,
+ 0xff55, 0xf5af, 0xe3c2, 0xf444, 0xfc42, 0xfc42, 0xfc22, 0xfc42, 0xfc64, 0xfc43,
+ 0xf424, 0xfc65, 0xf7ba, 0xfffb, 0xff9e, 0xffdf, 0xffdf, 0xffbf, 0xfffd, 0xf7db,
+ 0xff12, 0xdd6c, 0xf402, 0xfc23, 0xfc22, 0xfc22, 0xfc63, 0xfc63, 0xfc41, 0xfbe0,
+ 0xdceb, 0xfe91, 0xffdc, 0xfffc, 0xf7de, 0xffff, 0xf7fe, 0xf7de, 0xfffd, 0xffbc,
+ 0xfe0f, 0xd4aa, 0xfc00, 0xfc21, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfdab, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd06, 0xec02, 0xfc02, 0xfc63,
+ 0xfc23, 0xfc23, 0xfc23, 0xfc02, 0xe425, 0xec66, 0xef96, 0xfff8, 0xffbd, 0xffbd,
+ 0xffbf, 0xff9f, 0xfffb, 0xf7b9, 0xfd8c, 0xe448, 0xfc01, 0xfc42, 0xfc42, 0xfc62,
+ 0xfc42, 0xfc22, 0xfc45, 0xe382, 0xd610, 0xffb7, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+ 0xf7fb, 0xf7fb, 0xfe52, 0xdc6a, 0xfc01, 0xfc21, 0xfc42, 0xfc62, 0xfc62, 0xfc42,
+ 0xf444, 0xf424, 0xc46a, 0xfe92, 0xffbb, 0xffdb, 0xff97, 0xe5b0, 0xe3c1, 0xfc84,
+ 0xfc82, 0xfc82, 0xfc62, 0xfc62, 0xfc43, 0xfc42, 0xec46, 0xe425, 0xff15, 0xff97,
+ 0xfffd, 0xffbc, 0xf7ff, 0xffff, 0xf7df, 0xf7bf, 0xefff, 0xefff, 0xffdc, 0xff9b,
+ 0xff55, 0xf5cf, 0xe3c2, 0xf444, 0xfc42, 0xfc42, 0xfc21, 0xfc22, 0xfc64, 0xfc23,
+ 0xf444, 0xfca6, 0xf79a, 0xfffb, 0xff9e, 0xffdf, 0xffdf, 0xffbf, 0xfffd, 0xffdc,
+ 0xff12, 0xdd6c, 0xf3e2, 0xfc23, 0xfc22, 0xfc22, 0xfc63, 0xfc63, 0xfc41, 0xfbe0,
+ 0xdceb, 0xfed2, 0xffbb, 0xfffc, 0xf7de, 0xfffe, 0xf7fe, 0xf7fe, 0xfffc, 0xffbb,
+ 0xfe30, 0xdcca, 0xfbe0, 0xfc41, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe466, 0xfd8b, 0xffff, 0xf7ff,
+ 0xffff, 0xffde, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd06, 0xec23, 0xfc22, 0xfc63,
+ 0xfc23, 0xfc23, 0xfc03, 0xfbe2, 0xdc05, 0xe446, 0xef96, 0xfff8, 0xff9e, 0xffbe,
+ 0xffbf, 0xff9f, 0xfffb, 0xef9a, 0xfd6c, 0xe448, 0xfbe2, 0xfc23, 0xfc42, 0xfc62,
+ 0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffde, 0xffbe, 0xffdf, 0xffdf,
+ 0xeffb, 0xeffb, 0xfe32, 0xd44b, 0xfc01, 0xfc21, 0xfc42, 0xfc43, 0xfc63, 0xfc43,
+ 0xf444, 0xec44, 0xbc6a, 0xfeb3, 0xffbb, 0xffdb, 0xff97, 0xe5d0, 0xe3a1, 0xfc84,
+ 0xfc62, 0xfc62, 0xfc42, 0xfc62, 0xfc43, 0xfc23, 0xec46, 0xe426, 0xff36, 0xff97,
+ 0xfffd, 0xfffd, 0xe7df, 0xf7ff, 0xefff, 0xe7df, 0xe7ff, 0xe7ff, 0xffbd, 0xffbd,
+ 0xff76, 0xf5d0, 0xe3c2, 0xfc85, 0xfc42, 0xfc42, 0xfc21, 0xfc42, 0xfc84, 0xfc43,
+ 0xf465, 0xfce7, 0xf7bb, 0xfffc, 0xffbe, 0xffff, 0xffff, 0xf7bf, 0xfffd, 0xfffd,
+ 0xff53, 0xdd8d, 0xf3e3, 0xfc24, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc42, 0xfbe0,
+ 0xdceb, 0xfed3, 0xf79b, 0xfffd, 0xf7ff, 0xf7fe, 0xf7ff, 0xf7ff, 0xffdd, 0xff9c,
+ 0xfe50, 0xdceb, 0xfbe0, 0xfc62, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfdab, 0xffff, 0xf7ff,
+ 0xffff, 0xfffe, 0xffff, 0xffff, 0xfffb, 0xf738, 0xfd06, 0xf423, 0xfc22, 0xfc63,
+ 0xfc23, 0xfc23, 0xfc23, 0xfc02, 0xdc05, 0xec67, 0xf7b7, 0xfff9, 0xffde, 0xffde,
+ 0xffbf, 0xff9f, 0xfffc, 0xf7ba, 0xfd8d, 0xe468, 0xfc02, 0xfc43, 0xfc42, 0xfc62,
+ 0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffde, 0xffde, 0xffdf, 0xffdf,
+ 0xeffb, 0xf7fb, 0xfe52, 0xd44b, 0xfc01, 0xfc21, 0xfc42, 0xfc43, 0xfc63, 0xfc43,
+ 0xf444, 0xec44, 0xbc6a, 0xfed3, 0xffdb, 0xffdb, 0xffb7, 0xedf0, 0xebc1, 0xfc84,
+ 0xfc62, 0xfc62, 0xfc42, 0xfc62, 0xfc43, 0xfc43, 0xec46, 0xe446, 0xffb8, 0xff56,
+ 0xfffd, 0xfffd, 0xe7df, 0xefff, 0xefff, 0xefff, 0xe7ff, 0xe7ff, 0xffdd, 0xfffe,
+ 0xff97, 0xed8f, 0xd361, 0xfc86, 0xfc42, 0xfc42, 0xfc21, 0xfc21, 0xfc84, 0xfc43,
+ 0xf485, 0xfd08, 0xf7db, 0xfffc, 0xffbe, 0xffff, 0xffdf, 0xf79f, 0xfffd, 0xfffd,
+ 0xff74, 0xddad, 0xf403, 0xfc24, 0xfc22, 0xfc22, 0xfc63, 0xfc84, 0xfc42, 0xfbe0,
+ 0xdceb, 0xfed2, 0xf75a, 0xfffd, 0xf7ff, 0xf7de, 0xffff, 0xffff, 0xffdc, 0xf79b,
+ 0xfe51, 0xdceb, 0xfbe0, 0xfc62, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc22, 0xfc43,
+ 0xfc23, 0xfc44, 0xfc23, 0xfc03, 0xdc26, 0xec87, 0xef97, 0xfff8, 0xffbe, 0xffde,
+ 0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe469, 0xfc02, 0xfc43, 0xfc42, 0xfc62,
+ 0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+ 0xeffc, 0xf7fc, 0xfe53, 0xd44b, 0xfc21, 0xfc21, 0xfc43, 0xfc63, 0xfc63, 0xfc43,
+ 0xf444, 0xf444, 0xbc49, 0xfef4, 0xffbb, 0xffdc, 0xffd9, 0xee52, 0xec44, 0xf485,
+ 0xfc43, 0xfc22, 0xfc02, 0xfc42, 0xfc43, 0xfc03, 0xec25, 0xec05, 0xfe71, 0xff34,
+ 0xffda, 0xfffb, 0xf7ff, 0xeffe, 0xe7df, 0xf7ff, 0xf7ff, 0xefbe, 0xffbb, 0xfffc,
+ 0xff14, 0xed4d, 0xec03, 0xf424, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+ 0xf485, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xfffd, 0xf7fd,
+ 0xff74, 0xd58d, 0xebe3, 0xf444, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc42, 0xf3c0,
+ 0xdceb, 0xfe92, 0xffbd, 0xfffd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+ 0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc22, 0xfc43,
+ 0xfc23, 0xfc44, 0xfc23, 0xfc03, 0xdc26, 0xec87, 0xef97, 0xfff8, 0xffbe, 0xffde,
+ 0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe469, 0xfc02, 0xfc43, 0xfc42, 0xfc62,
+ 0xfc43, 0xfc22, 0xfc46, 0xe383, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+ 0xeffc, 0xf7fc, 0xfe53, 0xd44b, 0xfc21, 0xfc21, 0xfc43, 0xfc63, 0xfc63, 0xfc43,
+ 0xf444, 0xf444, 0xc46a, 0xfef4, 0xffbb, 0xffbb, 0xffd8, 0xf673, 0xfca6, 0xf465,
+ 0xfc43, 0xfc42, 0xfc22, 0xfc22, 0xfc23, 0xfc02, 0xf425, 0xf425, 0xc4aa, 0xfed3,
+ 0xfffb, 0xffda, 0xe7bd, 0xf7ff, 0xf7ff, 0xe79e, 0xefbe, 0xf7ff, 0xfffc, 0xff9a,
+ 0xfe0f, 0xdcca, 0xf424, 0xfc85, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+ 0xf485, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xfffd, 0xf7fd,
+ 0xff74, 0xd58d, 0xebe3, 0xf444, 0xfc23, 0xfc22, 0xfc63, 0xfc63, 0xfc42, 0xf3c0,
+ 0xdceb, 0xfe92, 0xffbd, 0xfffd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+ 0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+ 0xfc23, 0xfc44, 0xfc23, 0xfc02, 0xdc26, 0xec87, 0xefb6, 0xfff8, 0xffbe, 0xffde,
+ 0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe468, 0xfc02, 0xfc43, 0xfc62, 0xfc62,
+ 0xfc42, 0xfc22, 0xfc46, 0xe3a3, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+ 0xeffc, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc42, 0xfc43, 0xfc63, 0xfc42, 0xfc42,
+ 0xfc43, 0xfc22, 0xc449, 0xfeb2, 0xffdc, 0xffdc, 0xffdb, 0xff58, 0xfdcd, 0xdc88,
+ 0xfc23, 0xfc23, 0xfc02, 0xfc23, 0xfc02, 0xfbe2, 0xfc24, 0xfc24, 0xcc06, 0xf56b,
+ 0xff34, 0xffd7, 0xffd9, 0xfff9, 0xfffa, 0xffb9, 0xffb9, 0xffd9, 0xff96, 0xfe51,
+ 0xecc8, 0xdc26, 0xf423, 0xfc84, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+ 0xf465, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xf7fd, 0xf7fd,
+ 0xff74, 0xd5ad, 0xebe2, 0xf444, 0xfc23, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xf3c0,
+ 0xdceb, 0xfe92, 0xffbd, 0xffdd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+ 0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8b, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffb, 0xf718, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+ 0xfc23, 0xfc44, 0xfc23, 0xfc02, 0xdc26, 0xec87, 0xefb6, 0xfff8, 0xffbe, 0xffde,
+ 0xffbf, 0xffbf, 0xfffc, 0xefba, 0xfd8d, 0xe468, 0xfc02, 0xfc43, 0xfc62, 0xfc62,
+ 0xfc42, 0xfc22, 0xfc46, 0xe3a3, 0xd610, 0xffb7, 0xffff, 0xffde, 0xffdf, 0xffdf,
+ 0xeffc, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc42, 0xfc43, 0xfc63, 0xfc42, 0xfc42,
+ 0xfc43, 0xfc22, 0xbc28, 0xfe91, 0xfffd, 0xfffd, 0xfffb, 0xffba, 0xfe2e, 0xcc26,
+ 0xfc03, 0xfc03, 0xfc02, 0xfc23, 0xfc22, 0xfbe2, 0xfc24, 0xfc44, 0xe4e9, 0xcc26,
+ 0xcd6d, 0xff14, 0xff98, 0xff57, 0xff99, 0xffda, 0xff98, 0xff57, 0xfe31, 0xd50c,
+ 0xdc46, 0xe467, 0xfc64, 0xfc64, 0xfc42, 0xfc62, 0xfc42, 0xfc22, 0xfc64, 0xfc43,
+ 0xf465, 0xfce7, 0xf7bb, 0xfffc, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xf7fd, 0xf7fd,
+ 0xff74, 0xd5ad, 0xebe2, 0xf444, 0xfc23, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xf3c0,
+ 0xdceb, 0xfe92, 0xffbd, 0xffdd, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffd, 0xffbc,
+ 0xfe30, 0xd4cb, 0xfc01, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+ 0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec87, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+ 0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8d, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+ 0xfc42, 0xfc41, 0xfc45, 0xe3a2, 0xd610, 0xffd6, 0xfffe, 0xffde, 0xffdf, 0xffdf,
+ 0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc43, 0xfc43, 0xfc63, 0xfc41, 0xfc41,
+ 0xfc21, 0xfc21, 0xcc48, 0xfe91, 0xfffe, 0xfffd, 0xf7dd, 0xf7dc, 0xfe93, 0xb46a,
+ 0xec03, 0xf423, 0xfc22, 0xfc63, 0xfc42, 0xfc21, 0xfc22, 0xfc43, 0xfca5, 0xec03,
+ 0xdc45, 0xece8, 0xf54b, 0xfe0d, 0xfe4f, 0xfdcd, 0xfdcb, 0xfd29, 0xec45, 0xe424,
+ 0xf443, 0xfc84, 0xfc83, 0xfc63, 0xfc42, 0xfc63, 0xfc43, 0xfc22, 0xfc64, 0xfc43,
+ 0xfc64, 0xfcc6, 0xffba, 0xfffb, 0xffdf, 0xffbe, 0xffdf, 0xffdf, 0xfffd, 0xf7fc,
+ 0xff74, 0xd5ac, 0xebe2, 0xfc43, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+ 0xdceb, 0xfe72, 0xffbc, 0xffdd, 0xffff, 0xf7ff, 0xf7ff, 0xf7df, 0xfffd, 0xffbc,
+ 0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+ 0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec87, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+ 0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8d, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+ 0xfc42, 0xfc41, 0xfc45, 0xe3a2, 0xd610, 0xffd6, 0xfffe, 0xffde, 0xffdf, 0xffdf,
+ 0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xfc22, 0xfc43, 0xfc43, 0xfc63, 0xfc41, 0xfc41,
+ 0xfc21, 0xfc21, 0xcc89, 0xfe91, 0xf7fd, 0xf7fd, 0xf7dc, 0xfffd, 0xff56, 0xe5f0,
+ 0xec23, 0xf423, 0xfc22, 0xfc63, 0xfc42, 0xfc22, 0xfc42, 0xfc43, 0xf464, 0xf464,
+ 0xdc86, 0xd445, 0xd426, 0xe4a8, 0xf4c9, 0xe488, 0xe486, 0xdc45, 0xe424, 0xec65,
+ 0xfc85, 0xfc84, 0xfc42, 0xfc63, 0xfc42, 0xfc63, 0xfc43, 0xfc22, 0xfc64, 0xfc43,
+ 0xfc64, 0xfcc6, 0xffba, 0xfffb, 0xffdf, 0xffbe, 0xffdf, 0xffdf, 0xfffd, 0xf7fc,
+ 0xff74, 0xd5ac, 0xebe2, 0xfc43, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+ 0xdceb, 0xfe72, 0xffbc, 0xffdd, 0xffff, 0xf7ff, 0xf7ff, 0xf7df, 0xfffd, 0xffbc,
+ 0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+ 0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec86, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+ 0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8c, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+ 0xfc61, 0xfc41, 0xfc65, 0xe3a2, 0xd630, 0xffd6, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+ 0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xf443, 0xfc43, 0xfc43, 0xfc63, 0xfc40, 0xfc20,
+ 0xfc20, 0xfc00, 0xcc68, 0xfe71, 0xf7fd, 0xf7fe, 0xf7ff, 0xf7ff, 0xfffa, 0xff99,
+ 0xeca5, 0xe484, 0xfc42, 0xfc63, 0xfc41, 0xfc41, 0xfc21, 0xfc41, 0xfc42, 0xfc62,
+ 0xfc43, 0xfc23, 0xfc03, 0xf3c2, 0xfba2, 0xfc04, 0xfc41, 0xfc41, 0xfc62, 0xfc82,
+ 0xfc82, 0xfc41, 0xfc21, 0xfc41, 0xfc42, 0xfc63, 0xfc43, 0xfc23, 0xfc44, 0xfc43,
+ 0xfc44, 0xfcc6, 0xffba, 0xffdb, 0xffde, 0xffbe, 0xffdf, 0xffdf, 0xfffc, 0xf7fc,
+ 0xff73, 0xd5ac, 0xf3e2, 0xfc23, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+ 0xdceb, 0xfe71, 0xffbc, 0xffdd, 0xffff, 0xffff, 0xffff, 0xffde, 0xfffd, 0xffbc,
+ 0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7ff, 0xfffe, 0xffdd, 0xffff, 0xffff, 0xffdb, 0xd5b2, 0xfc23, 0xfc23,
+ 0xfc20, 0xfc41, 0xfc63, 0xfc63, 0xfc42, 0xfc21, 0xe467, 0xfd8c, 0xffff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffc, 0xf719, 0xfd07, 0xec23, 0xfc02, 0xfc43,
+ 0xfc24, 0xfc44, 0xfc22, 0xfc22, 0xdc25, 0xec86, 0xefb6, 0xfff8, 0xffbd, 0xffde,
+ 0xffdf, 0xffbf, 0xfffb, 0xefda, 0xfd8c, 0xe468, 0xfc01, 0xfc42, 0xfc61, 0xfc61,
+ 0xfc61, 0xfc41, 0xfc65, 0xe3a2, 0xd630, 0xffd6, 0xfffe, 0xffdd, 0xffdf, 0xffdf,
+ 0xeffb, 0xeffc, 0xfe53, 0xd44b, 0xf443, 0xfc43, 0xfc43, 0xfc63, 0xfc40, 0xfc20,
+ 0xfc20, 0xfc00, 0xcc48, 0xfe91, 0xf7fe, 0xfffe, 0xf7ff, 0xefde, 0xffb9, 0xffba,
+ 0xf4e6, 0xeca5, 0xf442, 0xfc42, 0xfc21, 0xfc21, 0xfc21, 0xfc21, 0xfc01, 0xfc83,
+ 0xfc43, 0xf402, 0xfc44, 0xfc45, 0xfbe3, 0xfc24, 0xfc41, 0xfc61, 0xfc41, 0xfc21,
+ 0xfc21, 0xfc42, 0xfc62, 0xfc42, 0xfc42, 0xfc63, 0xfc43, 0xfc23, 0xfc44, 0xfc43,
+ 0xfc44, 0xfcc6, 0xffba, 0xffdb, 0xffde, 0xffbe, 0xffdf, 0xffdf, 0xfffc, 0xf7fc,
+ 0xff73, 0xd5ac, 0xf3e2, 0xfc23, 0xfc22, 0xfc02, 0xfc43, 0xfc63, 0xfc42, 0xfbc0,
+ 0xdceb, 0xfe71, 0xffbc, 0xffdd, 0xffff, 0xffff, 0xffff, 0xffde, 0xfffd, 0xffbc,
+ 0xfe30, 0xdccb, 0xfbe1, 0xfc42, 0xfc23, 0xfc64, 0xfc20, 0xfc21, 0xf484, 0xe3e2,
+ 0xc54f, 0xffd9, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xf7de, 0xf7ff, 0xf7ff, 0xffbd, 0xfffe, 0xffb8, 0xe58f, 0xf424, 0xf424,
+ 0xfc00, 0xfc00, 0xfc20, 0xfc20, 0xfc22, 0xfc22, 0xe469, 0xfdcf, 0xffbb, 0xffbb,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffda, 0xfef6, 0xfd08, 0xe404, 0xfc00, 0xfc41,
+ 0xfc20, 0xfc20, 0xfc20, 0xfc60, 0xdc23, 0xeca5, 0xff36, 0xffd8, 0xefbc, 0xf7fe,
+ 0xf7fe, 0xf7fe, 0xffda, 0xff58, 0xfd8b, 0xe446, 0xfbe1, 0xfc42, 0xfc20, 0xfc00,
+ 0xfc41, 0xfbe0, 0xec44, 0xe3e3, 0xe5d0, 0xff97, 0xf7fe, 0xf7de, 0xefff, 0xefff,
+ 0xfffc, 0xffdb, 0xfe51, 0xd4ab, 0xfc01, 0xfc21, 0xfc20, 0xfc20, 0xfc20, 0xfc00,
+ 0xf423, 0xf423, 0xccab, 0xfe72, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffdd, 0xffdd,
+ 0xff37, 0xe50e, 0xcba6, 0xe429, 0xfc86, 0xfc04, 0xfbc0, 0xfc62, 0xfc00, 0xfc00,
+ 0xfc20, 0xfc40, 0xfc62, 0xfc41, 0xf483, 0xfc83, 0xfc41, 0xfc00, 0xfc42, 0xfc83,
+ 0xfc42, 0xf3e1, 0xfc22, 0xfc63, 0xfc41, 0xfc41, 0xfc20, 0xfc20, 0xfc41, 0xfc00,
+ 0xfc42, 0xfd05, 0xffbb, 0xffdb, 0xffbe, 0xffde, 0xffbf, 0xff9f, 0xffbb, 0xffbb,
+ 0xff52, 0xedac, 0xe462, 0xe462, 0xfc21, 0xfc41, 0xfc23, 0xfc03, 0xf462, 0xe3e0,
+ 0xe4e8, 0xfe8f, 0xff99, 0xff99, 0xf7df, 0xffff, 0xffff, 0xf7bf, 0xff99, 0xff79,
+ 0xfe4e, 0xe4e8, 0xec21, 0xf462, 0xfc20, 0xfcc0, 0xfc61, 0xfc20, 0xe487, 0xdc26,
+ 0xdd2f, 0xff78, 0xffbd, 0xffbd, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffde, 0xf7ff, 0xf7ff, 0xffde, 0xfffe, 0xffb8, 0xe590, 0xf424, 0xf424,
+ 0xfc00, 0xfc20, 0xfc21, 0xfc20, 0xfc22, 0xfc22, 0xdc49, 0xfd8e, 0xff9a, 0xffbb,
+ 0xefdf, 0xf7ff, 0xf7ff, 0xf7ff, 0xffda, 0xff17, 0xfd29, 0xec45, 0xfc21, 0xfc61,
+ 0xfc20, 0xfc00, 0xfc20, 0xfc40, 0xdc03, 0xec85, 0xff15, 0xffd8, 0xefbc, 0xf7fe,
+ 0xf7fe, 0xf7fe, 0xffb9, 0xff58, 0xfd6b, 0xe446, 0xfbe0, 0xfc22, 0xfc40, 0xfc20,
+ 0xfc82, 0xfc41, 0xf465, 0xe3e3, 0xddb0, 0xff77, 0xffff, 0xf7fe, 0xefff, 0xefdf,
+ 0xffbb, 0xffbb, 0xfe51, 0xcc8a, 0xfc21, 0xfc41, 0xfc20, 0xfc40, 0xfc20, 0xfc20,
+ 0xfc43, 0xf423, 0xc48a, 0xfe51, 0xffdc, 0xffbc, 0xf7bd, 0xfffe, 0xfffd, 0xfffe,
+ 0xff78, 0xfeb5, 0xfd0c, 0xd3c7, 0xf3a2, 0xfc65, 0xfc41, 0xfba0, 0xfc20, 0xfc00,
+ 0xfc40, 0xfc60, 0xfc82, 0xfc61, 0xf483, 0xfc83, 0xfc21, 0xfc41, 0xfc21, 0xfbe0,
+ 0xfc83, 0xfd05, 0xfca4, 0xf3c0, 0xfc21, 0xfc41, 0xfc00, 0xfc20, 0xfc41, 0xfc00,
+ 0xfc42, 0xfce5, 0xffbb, 0xffdb, 0xffde, 0xffde, 0xffbf, 0xffbf, 0xffbb, 0xffbb,
+ 0xff32, 0xe56b, 0xdc41, 0xe462, 0xfc41, 0xfc42, 0xfc02, 0xfbe2, 0xfca3, 0xec00,
+ 0xe4e9, 0xfe8f, 0xffba, 0xffba, 0xf7df, 0xf7bf, 0xffff, 0xf7df, 0xffda, 0xff9a,
+ 0xfe2e, 0xe4c8, 0xec21, 0xfc82, 0xfc40, 0xfc80, 0xfc41, 0xfc41, 0xe487, 0xcbc4,
+ 0xd4ee, 0xffb9, 0xffdd, 0xffdd, 0xffde, 0xffde, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffff, 0xffb9, 0xd591, 0xe426, 0xe446,
+ 0xfc23, 0xfc43, 0xfc44, 0xfc43, 0xec45, 0xec25, 0xcc6a, 0xfdd0, 0xffbc, 0xffdc,
+ 0xefdf, 0xf7ff, 0xf7ff, 0xf7ff, 0xffba, 0xfef7, 0xf50a, 0xd427, 0xf424, 0xfc85,
+ 0xfc63, 0xfc43, 0xfc84, 0xfc84, 0xcc26, 0xe4c8, 0xff37, 0xfffa, 0xefdd, 0xf7fe,
+ 0xf7fe, 0xf7fe, 0xffdb, 0xff79, 0xfdae, 0xdc89, 0xf425, 0xfc86, 0xfc64, 0xf443,
+ 0xfc85, 0xf444, 0xe467, 0xcba4, 0xd570, 0xff78, 0xffff, 0xf7fe, 0xefff, 0xefff,
+ 0xffdc, 0xfffd, 0xfe94, 0xc4ac, 0xe445, 0xec45, 0xfc43, 0xfc43, 0xfc43, 0xfc22,
+ 0xe446, 0xe445, 0xbc8c, 0xfe94, 0xfffe, 0xffdd, 0xf7be, 0xfffe, 0xfffe, 0xffdd,
+ 0xfffc, 0xf7fc, 0xf736, 0xcdf1, 0xcc89, 0xbc28, 0xe425, 0xec66, 0xfc64, 0xfc44,
+ 0xfc44, 0xfc44, 0xfc45, 0xfc24, 0xfc05, 0xfc25, 0xec87, 0xdc05, 0xcbe5, 0xf50a,
+ 0xfe0e, 0xf54b, 0xcc06, 0xd426, 0xe466, 0xe487, 0xe446, 0xe446, 0xe487, 0xdc67,
+ 0xdc88, 0xed0a, 0xf7dc, 0xffdd, 0xffdf, 0xffdf, 0xffdf, 0xffbf, 0xffdd, 0xffdd,
+ 0xff75, 0xd58e, 0xc445, 0xcc86, 0xec87, 0xec86, 0xf427, 0xf427, 0xdc66, 0xcbe4,
+ 0xcccb, 0xfe72, 0xffbc, 0xfffd, 0xf7ff, 0xefdf, 0xefdf, 0xf7ff, 0xfffd, 0xff9b,
+ 0xf610, 0xccab, 0xcc05, 0xdc87, 0xf484, 0xf464, 0xdc46, 0xe4a7, 0xccab, 0xb3c7,
+ 0xbcef, 0xffdb, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xf7ff, 0xffde, 0xffff, 0xff79, 0xd570, 0xe447, 0xec67,
+ 0xfc44, 0xfc64, 0xfc85, 0xfc64, 0xf466, 0xec66, 0xcc4a, 0xf5b0, 0xffdc, 0xffdc,
+ 0xf7ff, 0xf7ff, 0xefdf, 0xf7ff, 0xffdb, 0xff38, 0xf54b, 0xd447, 0xf424, 0xfc85,
+ 0xfc84, 0xfca4, 0xf463, 0xf443, 0xcc05, 0xe4c8, 0xf716, 0xfffa, 0xf7fe, 0xf7fe,
+ 0xf7fe, 0xf7fe, 0xffdb, 0xff59, 0xfd8d, 0xdc69, 0xec04, 0xf425, 0xfca5, 0xfc64,
+ 0xfc85, 0xf444, 0xe467, 0xd3e5, 0xe5d1, 0xffd9, 0xf7de, 0xf7fe, 0xf7ff, 0xefff,
+ 0xfffd, 0xfffd, 0xfe73, 0xbc6b, 0xec65, 0xec66, 0xfc64, 0xfc64, 0xfc63, 0xfc43,
+ 0xec66, 0xec66, 0xb44b, 0xfe74, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffdd, 0xffbd,
+ 0xf7fb, 0xf7db, 0xfff9, 0xffd9, 0xfe50, 0xccaa, 0xe425, 0xfcc8, 0xfc65, 0xfc44,
+ 0xfc23, 0xfc44, 0xfc24, 0xfc04, 0xfc04, 0xfc25, 0xec87, 0xdc25, 0xecc9, 0xfed1,
+ 0xff94, 0xed0a, 0xc3a4, 0xece9, 0xe446, 0xe467, 0xe446, 0xe446, 0xe487, 0xdc67,
+ 0xdc88, 0xecea, 0xf7dd, 0xfffd, 0xffdf, 0xffff, 0xffdf, 0xffbf, 0xffdd, 0xffdd,
+ 0xff75, 0xd58e, 0xc465, 0xcc86, 0xe466, 0xe466, 0xf427, 0xf448, 0xdc87, 0xd425,
+ 0xd4ec, 0xfe72, 0xff9b, 0xfffd, 0xf7ff, 0xf7ff, 0xefdf, 0xf7ff, 0xffdc, 0xff7b,
+ 0xf610, 0xd4ec, 0xdc66, 0xe4a7, 0xf464, 0xec43, 0xdc45, 0xe487, 0xccab, 0xb3e8,
+ 0xbd10, 0xff7a, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffbb, 0xd5f4, 0xc44a, 0xc44a,
+ 0xdc48, 0xdc68, 0xd469, 0xd469, 0xc46a, 0xc46a, 0xc4ae, 0xe5d3, 0xffbd, 0xfffe,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xff9b, 0xeed8, 0xdd4e, 0xbc4a, 0xc449, 0xcc69,
+ 0xcc69, 0xcc69, 0xd489, 0xd488, 0xbc6a, 0xd52d, 0xeef8, 0xffdc, 0xf7fe, 0xffff,
+ 0xf7ff, 0xf7df, 0xffdc, 0xff5a, 0xedd1, 0xcccd, 0xd46a, 0xd46a, 0xd4aa, 0xcc69,
+ 0xd469, 0xcc28, 0xcc6a, 0xbc09, 0xcd92, 0xff39, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff,
+ 0xffdd, 0xffdd, 0xee76, 0xb4cf, 0xbc4a, 0xbc6a, 0xcc49, 0xd449, 0xd448, 0xd448,
+ 0xc44a, 0xc44a, 0xb4af, 0xee96, 0xffde, 0xffde, 0xffde, 0xffff, 0xfffe, 0xffde,
+ 0xdfff, 0xe7ff, 0xf7ff, 0xeffe, 0xffdb, 0xf77a, 0xee53, 0xd570, 0xcc4a, 0xc409,
+ 0xcbc7, 0xcbc7, 0xd3a6, 0xd3a6, 0xdba6, 0xdbc7, 0xcd50, 0xee54, 0xff18, 0xffba,
+ 0xfffc, 0xeeb7, 0xcdb3, 0xe655, 0xe613, 0xe634, 0xe613, 0xe613, 0xe634, 0xde14,
+ 0xde35, 0xe696, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffff,
+ 0xfffb, 0xded6, 0xd631, 0xde52, 0xee11, 0xe5f1, 0xf5f3, 0xfe13, 0xee34, 0xe5f3,
+ 0xe676, 0xff5a, 0xffde, 0xfffe, 0xf7ff, 0xefff, 0xf7ff, 0xf7ff, 0xffff, 0xffde,
+ 0xff19, 0xe656, 0xe5f3, 0xee13, 0xf5f2, 0xf612, 0xe5f2, 0xee13, 0xe655, 0xd5f4,
+ 0xd6b8, 0xfffd, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xfffe, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffdc, 0xf6f8, 0xfe31, 0xfe52,
+ 0xfe2f, 0xfe50, 0xfe51, 0xfe51, 0xfe72, 0xfe52, 0xf634, 0xfeb6, 0xffdd, 0xffdd,
+ 0xefff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdc, 0xffbc, 0xfed4, 0xfe52, 0xfe51, 0xfe51,
+ 0xfe30, 0xfe30, 0xfe2f, 0xfe0e, 0xee11, 0xfe72, 0xff7a, 0xfffc, 0xffff, 0xf7fe,
+ 0xf7ff, 0xf7df, 0xfffd, 0xffbc, 0xfeb5, 0xf633, 0xfdf0, 0xfe10, 0xfe51, 0xfe30,
+ 0xfe71, 0xfe30, 0xfe93, 0xfe11, 0xfef7, 0xffdb, 0xffff, 0xf7de, 0xefff, 0xefdf,
+ 0xffbd, 0xffde, 0xff59, 0xee55, 0xfe52, 0xfe52, 0xfe30, 0xfe50, 0xfe2f, 0xfe2f,
+ 0xfe51, 0xfe31, 0xe655, 0xff59, 0xffff, 0xffde, 0xf7de, 0xffff, 0xffde, 0xfffe,
+ 0xe7ff, 0xe7ff, 0xeffe, 0xe7dd, 0xffdb, 0xfffc, 0xff98, 0xfed5, 0xfe52, 0xfe11,
+ 0xfdcf, 0xfdcf, 0xfdae, 0xfd8e, 0xfdae, 0xfdcf, 0xfed6, 0xffb9, 0xffdb, 0xff9a,
+ 0xffdc, 0xffdc, 0xff9a, 0xffba, 0xffba, 0xffda, 0xffd9, 0xffb9, 0xffdb, 0xffda,
+ 0xffbb, 0xffdc, 0xf7fe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7df, 0xffff, 0xffff,
+ 0xfffc, 0xffdb, 0xfff8, 0xfff9, 0xffb8, 0xff98, 0xff99, 0xff99, 0xff99, 0xff58,
+ 0xffbb, 0xffdc, 0xfffe, 0xffde, 0xf7ff, 0xefff, 0xefff, 0xefff, 0xfffe, 0xfffe,
+ 0xffbb, 0xffbb, 0xff79, 0xff79, 0xff98, 0xffb9, 0xffb9, 0xff99, 0xffdb, 0xffdb,
+ 0xfffd, 0xfffd, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xf7df, 0xffff, 0xffff, 0xffff, 0xfffe, 0xffbd, 0xff9b, 0xff9b,
+ 0xff99, 0xff99, 0xffbb, 0xffbb, 0xffdc, 0xffdc, 0xffdd, 0xffbc, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdd, 0xfffe, 0xffbb, 0xff9a, 0xffdb, 0xffdb,
+ 0xffbb, 0xffbc, 0xffba, 0xffba, 0xffbb, 0xffbb, 0xfffe, 0xfffe, 0xffff, 0xf7df,
+ 0xf7ff, 0xf7df, 0xfffe, 0xfffe, 0xffbc, 0xff9c, 0xffbb, 0xffbb, 0xff9b, 0xffbb,
+ 0xffbb, 0xff7a, 0xffdb, 0xff7a, 0xffbc, 0xffdc, 0xffff, 0xf7be, 0xf7df, 0xf7ff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffbc, 0xffdd, 0xffbb, 0xffbb, 0xffba, 0xffba,
+ 0xffbb, 0xffbb, 0xffdd, 0xfffe, 0xffff, 0xf7ff, 0xffff, 0xffff, 0xffde, 0xffff,
+ 0xffff, 0xffbf, 0xffbf, 0xffdf, 0xffdf, 0xf79e, 0xffbd, 0xfffe, 0xfffd, 0xffdc,
+ 0xffbb, 0xffdb, 0xffda, 0xffb9, 0xffd9, 0xfff9, 0xfffe, 0xffde, 0xf7be, 0xf7df,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xf7fe, 0xf7fe, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffe, 0xffde, 0xffdf, 0xffde,
+ 0xffff, 0xffff, 0xf7fe, 0xf7de, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7ff,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffdf, 0xffdf,
+ 0xffff, 0xffff, 0xf7df, 0xffff, 0xffff, 0xf7df, 0xfffe, 0xfffe, 0xffbb, 0xffbb,
+ 0xffba, 0xffba, 0xffdb, 0xffdb, 0xffdc, 0xfffc, 0xffdd, 0xff9c, 0xffff, 0xffde,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffdd, 0xfffe, 0xfffc, 0xffbb, 0xffdb, 0xffdb,
+ 0xffbc, 0xfffc, 0xff9a, 0xffba, 0xffdc, 0xffdc, 0xfffe, 0xffdd, 0xf7df, 0xf7df,
+ 0xffff, 0xf7df, 0xffde, 0xfffe, 0xffbc, 0xffbc, 0xffbb, 0xff9b, 0xfffc, 0xfffc,
+ 0xffdb, 0xff7a, 0xffdb, 0xffbb, 0xffdc, 0xff9c, 0xffff, 0xf7de, 0xf7ff, 0xffff,
+ 0xffdf, 0xf7be, 0xffde, 0xfffe, 0xfffd, 0xfffd, 0xffdc, 0xffdc, 0xffbb, 0xffbb,
+ 0xffdb, 0xffdb, 0xfffe, 0xffdd, 0xf7bf, 0xf7df, 0xffff, 0xffff, 0xffde, 0xffff,
+ 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffdf, 0xffbd, 0xffde, 0xffdc, 0xffdc,
+ 0xffdb, 0xfffc, 0xfffa, 0xfffa, 0xffd9, 0xfffa, 0xffbd, 0xffbd, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xf7ff, 0xfffe, 0xffff, 0xfffe, 0xffde, 0xffff, 0xffff,
+ 0xf7df, 0xf7ff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xf7fe, 0xf7fe, 0xfffd, 0xfffc, 0xffbc, 0xfffd, 0xfffe, 0xffdd, 0xffdf, 0xffbe,
+ 0xffff, 0xffff, 0xf7de, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xf7fe,
+ 0xffde, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffbf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xf7df, 0xf7df, 0xffdf, 0xffdf,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xf7df, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffbe, 0xffbe, 0xffbe, 0xffbe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffdf, 0xffdf,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffde, 0xffdf, 0xf7df, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffbe, 0xffbe, 0xffbe, 0xffbe, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffdf, 0xffdf, 0xffdf, 0xffdf,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffdf, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xefff, 0xefff, 0xf7ff, 0xf7ff, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde, 0xffde,
+ 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xf7fe, 0xf7fe,
+ 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffd, 0xfffd,
+ 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffdd, 0xffdd, 0xffde, 0xffde, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd,
+ 0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xefff, 0xefff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffde, 0xffde, 0xffde, 0xffde, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xffdd, 0xffdd, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xffff, 0xffff,
+ 0xffde, 0xffde, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffd, 0xfffd,
+ 0xfffe, 0xfffe, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xffff, 0xffff, 0xffdd, 0xffdd, 0xfffe, 0xfffe, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xfffd, 0xfffd, 0xfffe, 0xfffe, 0xf7ff, 0xf7ff,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffde, 0xffde, 0xfffe, 0xfffe,
+ 0xfffe, 0xfffe, 0xf7fe, 0xf7fe, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xf7ff, 0xf7ff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xf7ff, 0xfffe, 0xfffe, 0xfffe, 0xfffe,
+ 0xffdf, 0xffdf, 0xffff, 0xffff, 0xffff, 0xffff, 0xf7fe, 0xf7fe, 0xf7fd, 0xf7fd,
+ 0xf7fe, 0xf7fe, 0xffff, 0xffff, 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xfffe, 0xfffe, 0xf7fd, 0xf7fd, 0xfffe, 0xfffe, 0xffde, 0xffde,
+ 0xffdf, 0xffdf, 0xffdf, 0xffdf, 0xffff, 0xffff, 0xf7ff, 0xf7ff, 0xefff, 0xefff,
+};
+
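+/* Logo bitmap data. The name and UWORD32 type suggest one 32-bit
+ * RGB8888 pixel per entry; only the low 24 bits appear to be
+ * populated in the values below, so the top byte is presumably
+ * unused or alpha. The 10240-entry size implies the logo's
+ * width x height, though the exact dimensions are not stated here. */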
+const UWORD32 gau4_ihevcd_logo_rgb8888[10240] = {
+
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xf8fce8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xf8fce8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xe8fcf8, 0xe8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xf8fce8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8f8e0, 0xf8f8e0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xe8fcf8, 0xe8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8f4f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8,
+ 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f4e8, 0xf8f4e8, 0xf8f4e8, 0xf8f4e8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8ecc0, 0xf8e0b0, 0xf8cc98, 0xf0c090, 0xf0bc90, 0xf8cc98, 0xf8e0b8, 0xf8ecc8, 0xf8f4d8, 0xf8f4e0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8dcb0, 0xe8c090, 0xc89868, 0xb07c50, 0xb07c50, 0xc89868, 0xe8c098, 0xf8dcb0, 0xf8f4d8, 0xf8f4d8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf0f4e8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fce0, 0xf8fce0, 0xf8f4c8, 0xf8f4c8, 0xf8f0c0, 0xf8f4c0, 0xf8f4c8, 0xf8f4c8, 0xf8f8d0, 0xf8f8d0,
+ 0xf8f8d8, 0xf8f4d8, 0xf8fce8, 0xf8f4e0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xe8f8f8, 0xe8f8f8, 0xe0fcf8, 0xe0fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xe8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xe8f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8ecd8, 0xf8f8e0, 0xf8f0d0, 0xf8dcb8,
+ 0xd87810, 0xe08018, 0xf08418, 0xf08c20, 0xf08c20, 0xe88818, 0xd08428, 0xc87c28, 0xf0c890, 0xf8e8b0, 0xf8fce0, 0xf8f8d8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fce0, 0xf0e8d0, 0xf8e0b8, 0xf8e0b8, 0xf8e0b0, 0xf8e4b0, 0xf8e4b8, 0xf8e4b8, 0xf8e4c0, 0xf8e4c0,
+ 0xf8e0c0, 0xf8ecd0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xe0fcf8, 0xe0fcf8,
+ 0xe8fcf8, 0xe8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xe8fcf8, 0xe8fcf8,
+ 0xe8f8f8, 0xe8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf0f8f8, 0xf0f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8e0, 0xf8f8e8, 0xf8d4b0, 0xc8a080,
+ 0xe08018, 0xe08420, 0xf08818, 0xf88c20, 0xf09020, 0xf08820, 0xd88830, 0xd08428, 0xc09460, 0xf8d098, 0xf8fce0, 0xf8fce0, 0xf0fcf0, 0xf0f8f0, 0xf0fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f0c8, 0xe0cca0, 0xe8a860, 0xe8a860, 0xf8a848, 0xf8ac50, 0xf8ac58, 0xf8ac50, 0xe8ac60, 0xe0ac60,
+ 0xd8a470, 0xf8c490, 0xf8f8d8, 0xf8f8d8, 0xf8f8f0, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8fce8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xe8fcf8,
+ 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf8fcf0, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8f8f0, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8f8d8, 0xf8f8d8, 0xf8bc88, 0xb07840,
+ 0xf88400, 0xf88400, 0xf88800, 0xf88800, 0xf88800, 0xf88800, 0xf08810, 0xf08810, 0xa87830, 0xf0bc70, 0xf8fcd8, 0xf8fce0, 0xf0fcf0, 0xf0f8e8, 0xf0fcf0, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f0c8, 0xd0b890, 0xc08038, 0xc08438, 0xd88428, 0xd88828, 0xd08830, 0xd08830, 0xc08838, 0xc08438,
+ 0xb88450, 0xe8b078, 0xf8f0d0, 0xf8f8d8, 0xf8f8f0, 0xf8fcf0, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8f8e8, 0xf8f4e0, 0xf0f8f0, 0xf8f8f0, 0xe8fcf8, 0xe8fcf8,
+ 0xf8fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf8, 0xf0f8f8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8e8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xf8f0d0, 0xf8bc88, 0xb88048,
+ 0xf88800, 0xf88808, 0xf88800, 0xf88800, 0xf88800, 0xf88800, 0xf88c18, 0xf88c18, 0xb88840, 0xf0c078, 0xf8f0d0, 0xf8f8d8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f0c8, 0xd8b080, 0xe08028, 0xe08428, 0xf88408, 0xf88410, 0xf88810, 0xf88410, 0xe88418, 0xe88018,
+ 0xc88c48, 0xf8b878, 0xf8f8d8, 0xf8f8d8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf8f4e8, 0xf8f8f0, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f8d8, 0xf8fcd8,
+ 0xf8f4d0, 0xf8f4d0, 0xf8f8e0, 0xf8fce8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8f4e8, 0xf8f8f0, 0xf8fce8, 0xf8f8e0, 0xf8f4d8, 0xf8f4d8,
+ 0xf8f8e0, 0xf8fce8, 0xf8f8d8, 0xf8f0d0, 0xf8f0d0, 0xf8f8d8, 0xf8fce8, 0xf8f4e0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e0, 0xf8f0d0, 0xf8c088, 0xc08850,
+ 0xf88810, 0xf88810, 0xf88800, 0xf88800, 0xf88800, 0xf88400, 0xf88c10, 0xf88c10, 0xc89048, 0xf8c078, 0xf8ecd0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f4c8, 0xd8b088, 0xe88830, 0xe88c30, 0xf88c10, 0xf89018, 0xf89018, 0xf88c18, 0xf08c28, 0xf08c20,
+ 0xd09050, 0xf8bc78, 0xf8f8d8, 0xf8f8d8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f4d8, 0xf8f4d0,
+ 0xf8f4d0, 0xf8f8d8, 0xf8f0d8, 0xf8f0d8, 0xf8fcf0, 0xf8fcf0, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8f8f0, 0xf8fcf0, 0xf8fce8, 0xf8f4e0, 0xf8f4d0, 0xf8f8d8,
+ 0xf8f4e0, 0xf8fce8, 0xf8f8d8, 0xf8f8d8, 0xf8f4d0, 0xf8f4d0, 0xf8f8e8, 0xf8f8e0, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e0, 0xf8f4d8, 0xf0c088, 0xb08048,
+ 0xf88410, 0xf88810, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88810, 0xf08810, 0xb88438, 0xf0bc78, 0xf8f0d0, 0xf8f8d8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8f4f8, 0xf8fcf8, 0xf8f0c8, 0xe0ac88, 0xf08020, 0xf08420, 0xf88000, 0xf88000, 0xf88400, 0xf88000, 0xf88410, 0xf88008,
+ 0xc88c48, 0xf0b878, 0xf8f8d8, 0xf8fce0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf0, 0xf8ece0, 0xf8d4a8, 0xf8c898, 0xf8c890, 0xf8cc90, 0xf8c890, 0xf8c488,
+ 0xf8cc98, 0xf8d8a0, 0xe0c4a0, 0xe8c8a0, 0xf8f4e0, 0xf8fce8, 0xf0f0e8, 0xf8fcf8, 0xf8f4f0, 0xf8f8f0, 0xf8f8e8, 0xf8f4e0, 0xf8e0b8, 0xf0cca8, 0xf8cc98, 0xf8d4a0,
+ 0xf0d0a8, 0xf0d0a0, 0xf8d098, 0xf8d4a0, 0xf8cc98, 0xf0c490, 0xe8dcc0, 0xf8fce0, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fce8, 0xf8fce8, 0xf0cc98, 0xb89060,
+ 0xe08028, 0xe88428, 0xf88410, 0xf88810, 0xf88408, 0xf88400, 0xf08418, 0xe88010, 0xc08c48, 0xf8c488, 0xf8f4e0, 0xf8f8e0, 0xf8f8f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8f4f8, 0xf8fcf8, 0xf8f0c8, 0xe0ac80, 0xf08420, 0xf88428, 0xf88000, 0xf88000, 0xf88400, 0xf88400, 0xf88410, 0xf88410,
+ 0xc89050, 0xf8c080, 0xf8fce0, 0xf8fce0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf0, 0xe0d8d0, 0xd0a878, 0xb89060, 0xc89050, 0xd09858, 0xd09860, 0xc89458,
+ 0xb89060, 0xc89c68, 0xa88860, 0xb89870, 0xf8e8d8, 0xf8fce8, 0xf0f0e8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f8e8, 0xf8e8d8, 0xd8bc98, 0xb89870, 0xb88c58, 0xc89c68,
+ 0xc0a478, 0xb09468, 0xc09460, 0xc8a068, 0xb89460, 0xb08850, 0xc0b898, 0xf8f8d8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fce8, 0xf8fce8, 0xf8d8a8, 0xd0ac78,
+ 0xe08020, 0xe08428, 0xf88410, 0xf88818, 0xf88808, 0xf88400, 0xe88018, 0xe87c10, 0xe0a868, 0xf8d498, 0xf8f8e0, 0xf8f8e0, 0xf8f8f8, 0xf8fcf8, 0xf8f4f0, 0xf8f4f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xc89840, 0xe8c060, 0xf8fce8, 0xf8fce8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8f8c8, 0xf0e0b0, 0xf8a430, 0xd88410, 0xf87c00, 0xf88400, 0xf88030, 0xf88030,
+ 0xf88408, 0xf88408, 0xd88420, 0xe09030, 0xf8e4a8, 0xf8f4b8, 0xf8fce0, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f8c8, 0xf8ecb8, 0xf8b060, 0xd88c38, 0xf08010, 0xf88418,
+ 0xf88410, 0xf88410, 0xf08808, 0xf08808, 0xd89420, 0xc07c08, 0xd8bc78, 0xf8f4a8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcd0, 0xf8f8d0,
+ 0xd88450, 0xd88858, 0xe08c28, 0xe08c28, 0xe09028, 0xd88c28, 0xc88c68, 0xc88c60, 0xf8f0e8, 0xf8f0e8, 0xf0fce0, 0xf0fce8, 0xf8fce8, 0xf8fce8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+ 0xf0fcf0, 0xf8fcf8, 0xf0fcf0, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f4e8, 0xf8f4e8, 0xf8f0e8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe0fcf8, 0xe8fcf8,
+ 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f4f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xc09838, 0xe8bc60, 0xf8fce8, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcd0, 0xf8e0b0, 0xf8a430, 0xe08810, 0xf87c00, 0xf88400, 0xf88030, 0xf88030,
+ 0xf88408, 0xf88408, 0xd88420, 0xe08c30, 0xf8e0a8, 0xf8f0b8, 0xf8f8d8, 0xf8f8e0, 0xf8fce0, 0xf8f8e0, 0xf8f4c8, 0xf8e8b8, 0xf8b060, 0xd88838, 0xf07c10, 0xf88418,
+ 0xf88810, 0xf88810, 0xf88c08, 0xf08808, 0xd89420, 0xc88008, 0xd8bc78, 0xf8f4a8, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf0fcf0, 0xf8f8d0, 0xf8f8d0,
+ 0xf8c490, 0xf8b480, 0xf0a440, 0xe09430, 0xe09430, 0xf0a440, 0xf0b890, 0xf8c8a0, 0xf8f0e8, 0xf8f4e8, 0xf0fce8, 0xf0fce8, 0xf8fce8, 0xf8fce8, 0xf8f8f8, 0xf8f4f8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0f8f8, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8fcf0, 0xf0f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+ 0xf8f4f0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f4e8, 0xf8f4e8, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcf8, 0xe8fcf8, 0xe0fcf8, 0xe0fcf8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xd09830, 0xf8bc58, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xe8e8c8, 0xf0a848, 0xd08c28, 0xf88000, 0xf88800, 0xf88020, 0xf88020,
+ 0xf88808, 0xf88808, 0xd88828, 0xe09038, 0xf8e8c0, 0xf8f4c8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xf8f8d8, 0xf8ecc8, 0xf8b060, 0xd88c40, 0xf88010, 0xf88818,
+ 0xf87c10, 0xf87c10, 0xf88410, 0xf88008, 0xf08c28, 0xd87410, 0xf0b480, 0xf8e8b0, 0xf8f4f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f4e8,
+ 0xf8f8f0, 0xf8ece8, 0xf0d8a8, 0xe8cc98, 0xe8cc90, 0xf8d898, 0xf8ecd8, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf0fcf0, 0xf0fcf0, 0xf0fce8, 0xf0fce8, 0xf8f4f8, 0xf8f4f8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xd89830, 0xf8bc58, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xe8e4c8, 0xf0ac48, 0xd88c28, 0xf88000, 0xf88c08, 0xf88420, 0xf88420,
+ 0xf88c08, 0xf88c08, 0xd88c30, 0xe89438, 0xf8e8c0, 0xf8f8d0, 0xe0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe0fcf8, 0xf8fcd8, 0xf8f0d0, 0xf8b460, 0xd89040, 0xf88410, 0xf88c18,
+ 0xf88010, 0xf88010, 0xf88410, 0xf88010, 0xf08c28, 0xd87410, 0xf0b480, 0xf8e8b0, 0xf8f4f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8f4f0, 0xf8f4f0, 0xf8f4c0, 0xf8f4c0, 0xf8f4b8, 0xf8f4b8, 0xf8f4e0, 0xf8f4e0, 0xf8f8f8, 0xf8f8f8, 0xf0fcf0, 0xf0fcf0, 0xf0fce8, 0xf0fce8, 0xf8f4f8, 0xf8f4f8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f4f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f0, 0xf0f8f0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf0, 0xf0f8f0, 0xf0f8f0, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8,
+ 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe09020, 0xf8b448, 0xf8fce8, 0xf8f8e0, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xe0e8d0, 0xe8ac58, 0xd09038, 0xf88400, 0xf88c08, 0xf88410, 0xf88010,
+ 0xf88808, 0xf88400, 0xd88828, 0xe09038, 0xf8e8c8, 0xf8f8d0, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xd8fcf8, 0xf8fce0, 0xf8ecd0, 0xf8b060, 0xd88c38, 0xf88008, 0xf88810,
+ 0xf88810, 0xf88810, 0xf88c18, 0xf88c10, 0xe09430, 0xc87c18, 0xe0bc80, 0xf8f0b0, 0xf8fcf0, 0xf8fcf0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xe0fcf8, 0xe0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcd8, 0xf8fcd8, 0xf8f8f0, 0xf8f8f0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8f4f8, 0xf8f4f0,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f4e8, 0xf8f4e8, 0xf8f8e0, 0xf8f8e0, 0xf8f8d8, 0xf8f8d8, 0xf8f8e0, 0xf8f8d8, 0xf8f8e0, 0xf8fce8, 0xf8fce8, 0xf8fce8,
+ 0xf8f8f8, 0xf8fcf8, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+ 0xf0fcf8, 0xe8fcf8, 0xe8f8f0, 0xf0fcf0, 0xf8fce8, 0xf8fce8, 0xf8f8d8, 0xf8fce0, 0xf8f8e0, 0xf8f8d8, 0xf8f8e0, 0xf8f8e0, 0xf8f8f0, 0xf8f8f0, 0xf0f8f8, 0xf0f8f8,
+ 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8fce8, 0xf8f8d0, 0xf8f4c8, 0xf8f0c0, 0xf8f8c0,
+ 0xf0f8f0, 0xf0f8f0, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe09020, 0xf8b448, 0xf8fce8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce8, 0xe8e8d0, 0xf0ac58, 0xd09038, 0xf88400, 0xf88808, 0xf88010, 0xf87c10,
+ 0xf88808, 0xf88400, 0xd88830, 0xe09438, 0xf8e8c8, 0xf8f8d8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xe0fcf8, 0xf8fce0, 0xf8f0d0, 0xf8b460, 0xd88c38, 0xf88008, 0xf88810,
+ 0xf88810, 0xf88810, 0xf88c18, 0xf88c10, 0xe09430, 0xc87c18, 0xe0b880, 0xf8ecb0, 0xf8fcf0, 0xf8fcf0, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xe0fcf8, 0xe0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcd8, 0xf8fce0, 0xf8fcf8, 0xf8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8f4f8, 0xf8f4f8,
+ 0xe8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8fce0, 0xf8f4e0, 0xf8f8d8, 0xf8f8d8, 0xf8f8d8, 0xf8f4d8, 0xf8f8e0, 0xf8f8e0, 0xf8fce8, 0xf8fce8,
+ 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf8f4f0, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf0fcf8, 0xe8fcf8, 0xf0f8f0, 0xf0fcf8, 0xf8fce8, 0xf8f8e8, 0xf8f4d8, 0xf8f4d8, 0xf8f0d0, 0xf8f4d8, 0xf8f8e0, 0xf8f8e0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f4e0, 0xf8f8e8, 0xf8f4c8, 0xf8f0c8, 0xf8ecb8, 0xf8f4c0,
+ 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe09028, 0xf8b450, 0xf8fce8, 0xf8fce0, 0xf8fcf8, 0xf8fcf8, 0xf8f4d8, 0xf8f4d8, 0xf8f4c0, 0xf8d8a0, 0xf8a448, 0xd88830, 0xf88408, 0xf88810, 0xf88410, 0xf88010,
+ 0xf88808, 0xf88808, 0xe88420, 0xf09028, 0xf8e0a8, 0xf8f0b0, 0xf8f4d0, 0xf8f4d8, 0xf8f8d8, 0xf8f4d8, 0xf8f4b8, 0xf8e8b0, 0xf8b058, 0xe88c30, 0xf88408, 0xf88c18,
+ 0xf88c10, 0xf89010, 0xf89010, 0xf08c10, 0xe09428, 0xc87c10, 0xe8b868, 0xf8ec98, 0xf8f8c8, 0xf8f8c8, 0xf8fcd8, 0xf8fcd8, 0xf8fcd8, 0xf8fcd8, 0xf8fcc8, 0xf8fcc8,
+ 0xf8f4e8, 0xf8f8e8, 0xf8f8d0, 0xf8f8d0, 0xf8f4c8, 0xf8f4c8, 0xf8f4d0, 0xf8f4d0, 0xf8f8f0, 0xf8f8f0, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8f8f0,
+ 0xe8f8f8, 0xf0fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8f8d0, 0xf8f8d0, 0xf8e8b0, 0xf8d8a0, 0xf8c480, 0xf8c478, 0xf8c478, 0xf8c078, 0xf8c480, 0xf8c480, 0xf0c888, 0xf0cc90,
+ 0xf8d8a8, 0xf8e8c0, 0xf8f4c8, 0xf8f4c8, 0xf8f0c8, 0xf8e4b8, 0xf8dcb0, 0xf8e4b8, 0xf8e4b0, 0xf8e4b8, 0xf8e4b8, 0xf8e4b8, 0xf8e4b8, 0xf8e4b8, 0xf8e8b8, 0xf8e8b8,
+ 0xf8fcd8, 0xf8f8d8, 0xf8f8c8, 0xf8f8c8, 0xf8e4a8, 0xf8cc90, 0xf8bc78, 0xf8c078, 0xf8bc78, 0xf8c078, 0xf8c480, 0xf8c488, 0xf0cc98, 0xf8d8a0, 0xf8ecc0, 0xf8f8d0,
+ 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f8e0, 0xf8f8d8, 0xf8f0d0, 0xf8f0c8, 0xf8e0b0, 0xf8d0a0, 0xf8c080, 0xf8c080, 0xf8c078, 0xf8c070, 0xf8bc68, 0xf8c068,
+ 0xf0d4a0, 0xf8e4b0, 0xf8f4c8, 0xf8f8d0, 0xf8f8e0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe09028, 0xf8b450, 0xf8fce0, 0xf8f4e0, 0xf8f8f8, 0xf0f0f0, 0xf8e8c8, 0xf8e8c8, 0xf8e8b0, 0xf8d4a0, 0xf0a040, 0xd88828, 0xf88408, 0xf88c10, 0xf88818, 0xf88818,
+ 0xf88808, 0xf88408, 0xe08020, 0xe88828, 0xf8d8a0, 0xf8e8a8, 0xf8ecc8, 0xf8ecd0, 0xf8f0d0, 0xf8f0d0, 0xf8ecb8, 0xf8e0a8, 0xf8ac50, 0xe88830, 0xf88008, 0xf88810,
+ 0xf88c10, 0xf88c10, 0xf89010, 0xf08c10, 0xe09428, 0xc87c10, 0xe8b868, 0xf8ec98, 0xf8f0c0, 0xf8f0c0, 0xf0f0d0, 0xf0f0d0, 0xf0f0c8, 0xf8f0d0, 0xf8f0c0, 0xf8f0c0,
+ 0xf0f0e8, 0xf0f0e0, 0xf8ecc8, 0xf8e8c8, 0xf8e8b8, 0xf8e8b8, 0xf8ecc8, 0xf8ecd0, 0xf8f8e8, 0xf8f8f0, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8f4e8, 0xf8f0c8, 0xf8ecc8, 0xf8cc90, 0xe8b078, 0xc88440, 0xc08440, 0xc08438, 0xc08038, 0xb88440, 0xb88440, 0xb08850, 0xb08c50,
+ 0xd0ac80, 0xf0cca0, 0xf8e8c0, 0xf8f0c8, 0xf8e8b8, 0xf8d0a8, 0xe8c090, 0xf0c498, 0xe8c090, 0xe8c090, 0xf0c090, 0xf0c498, 0xf0c490, 0xf0c498, 0xf0c498, 0xf0c498,
+ 0xf8f4d8, 0xf8f8d8, 0xf8f4c8, 0xf8e8c0, 0xf0c488, 0xc09c60, 0xc08440, 0xc08440, 0xc88840, 0xc08440, 0xb88440, 0xb88040, 0xb88c58, 0xd0a878, 0xf0d0a8, 0xf8f0c8,
+ 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f8e0, 0xf8f8d8, 0xf8f0c8, 0xf8f0c8, 0xe0b888, 0xc09868, 0xb07c40, 0xb88040, 0xc88438, 0xc88438, 0xc88030, 0xc88030,
+ 0xb09060, 0xd0b480, 0xf0e0b8, 0xf8f4d0, 0xf8f8e0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xd09038, 0xf8b860, 0xf0f0d8, 0xf8f8e0, 0xf8f0d0, 0xb8a080, 0xd07018, 0xe88830, 0xf08018, 0xf08420, 0xf08820, 0xf88c20, 0xf88810, 0xf88810, 0xf88820, 0xf88818,
+ 0xf88818, 0xf88c18, 0xf88c20, 0xf88818, 0xf08820, 0xf08420, 0xe08828, 0xe88828, 0xe08828, 0xe08828, 0xe88828, 0xe88828, 0xf88c20, 0xf88c20, 0xf88c18, 0xf88c18,
+ 0xf88410, 0xf88008, 0xf88010, 0xf88410, 0xf88418, 0xf88018, 0xf88028, 0xf87c20, 0xe88430, 0xe88430, 0xe08438, 0xe08438, 0xe88030, 0xe88030, 0xf07c28, 0xf07c28,
+ 0xe08430, 0xe08830, 0xe88428, 0xe88428, 0xe88420, 0xe88420, 0xd08830, 0xd08430, 0xa08458, 0xf0d4a0, 0xf8fcf0, 0xf8fcf0, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0,
+ 0xf8f8e8, 0xf8f0e0, 0xf8ecc8, 0xf8e8c0, 0xf8b878, 0xc88440, 0xd87410, 0xf08828, 0xf88818, 0xf88410, 0xf88408, 0xf88810, 0xf88810, 0xf08410, 0xf08418, 0xf08c18,
+ 0xe89030, 0xd88020, 0xd88428, 0xf8c060, 0xf8e080, 0xf8a848, 0xd07818, 0xe89030, 0xe88c28, 0xe08420, 0xe08420, 0xe08828, 0xe08420, 0xe08420, 0xe89028, 0xf8a038,
+ 0xf8f0a0, 0xf8dc90, 0xf8b058, 0xd08830, 0xd87c18, 0xe88c28, 0xf88c18, 0xf88818, 0xf88410, 0xf88410, 0xf88818, 0xf88818, 0xe88820, 0xe88420, 0xd88020, 0xd88020,
+ 0xf0e0b8, 0xf8fcd8, 0xf8f4c0, 0xf8f0b8, 0xf8f4a8, 0xf8d488, 0xf09838, 0xe08c28, 0xe88418, 0xf08818, 0xf08418, 0xf08010, 0xe88418, 0xf08820, 0xf08c28, 0xe88820,
+ 0xf09830, 0xd87c10, 0xc88028, 0xf8a858, 0xf8dca0, 0xf8f0b8, 0xf8f8d8, 0xf8fcd8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xd08c38, 0xf8b460, 0xe8e8d0, 0xf8f4d8, 0xf8e8c8, 0xb89c78, 0xd07018, 0xe88830, 0xf08420, 0xf08820, 0xf88820, 0xf88c20, 0xf88810, 0xf88810, 0xf88818, 0xf88818,
+ 0xf88410, 0xf88410, 0xf88818, 0xf88418, 0xe88420, 0xe88020, 0xe08428, 0xe08428, 0xe08428, 0xe08428, 0xe88020, 0xe88420, 0xf08418, 0xf08818, 0xf88410, 0xf88410,
+ 0xf88410, 0xf88410, 0xf88010, 0xf88010, 0xf88418, 0xf88418, 0xf88428, 0xf88028, 0xe88030, 0xe88030, 0xe08038, 0xe08038, 0xe87c30, 0xe87c30, 0xf07828, 0xf07828,
+ 0xd88028, 0xd88428, 0xe88028, 0xe88028, 0xe88020, 0xe08020, 0xd08430, 0xc88030, 0xa88858, 0xf0d0a0, 0xf8f8f0, 0xf8fcf0, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf0f8f0,
+ 0xf8ecd8, 0xf8f8e8, 0xf8e4c0, 0xd0b088, 0xc88440, 0xd08c48, 0xf08c28, 0xe88828, 0xf88410, 0xf88410, 0xf88408, 0xf88810, 0xf88c18, 0xf88810, 0xf08c18, 0xf89020,
+ 0xd88428, 0xd88020, 0xd08020, 0xf09c40, 0xf8b858, 0xf09838, 0xd07c18, 0xe08c28, 0xe88c28, 0xe08828, 0xe08828, 0xe88c28, 0xe88c28, 0xe08820, 0xf09430, 0xf8a440,
+ 0xf8dc90, 0xf0b468, 0xd88c38, 0xd08830, 0xe88c28, 0xe88c28, 0xf88818, 0xf88818, 0xf88410, 0xf88410, 0xf88818, 0xf88818, 0xe88828, 0xe88820, 0xe08828, 0xe08428,
+ 0xb8a480, 0xd8cca0, 0xf8ecb8, 0xf8f4c0, 0xf8d488, 0xe8ac60, 0xe89030, 0xe88c30, 0xf08418, 0xf08818, 0xf88818, 0xf88818, 0xf08820, 0xf08c20, 0xf08820, 0xe88420,
+ 0xe88c20, 0xe08018, 0xd08430, 0xe09440, 0xe0b878, 0xf8dca0, 0xf8f8d8, 0xf8f8d8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xd89448, 0xf8bc68, 0xf0f4d8, 0xf8fce0, 0xf8f0b8, 0xd0a470, 0xf87400, 0xf88c10, 0xf88000, 0xf88000, 0xf88818, 0xf88818, 0xf88818, 0xf88818, 0xf88c20, 0xf88c20,
+ 0xf88818, 0xf88c18, 0xf88c18, 0xf88c18, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88c18, 0xf88c18, 0xf89018, 0xf88c18,
+ 0xf88810, 0xf88810, 0xf88410, 0xf88410, 0xf88418, 0xf88418, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18, 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88408, 0xf88408,
+ 0xf88800, 0xf88800, 0xf88400, 0xf88408, 0xf88808, 0xf88408, 0xf88c10, 0xf88c10, 0xc88c40, 0xf8d080, 0xf8f0e0, 0xf8f8e8, 0xf8f8f8, 0xf0f4f8, 0xf8fce8, 0xf0f8e8,
+ 0xf8f0b8, 0xf8d098, 0xe8a058, 0xd08438, 0xe87c20, 0xf08c28, 0xf88810, 0xf88410, 0xf88400, 0xf88000, 0xf88000, 0xf88000, 0xf88400, 0xf88000, 0xf88400, 0xf88800,
+ 0xf88808, 0xf89010, 0xf88808, 0xf88808, 0xf89818, 0xf89418, 0xf88808, 0xf89010, 0xf88808, 0xf88400, 0xf88408, 0xf88c08, 0xf88808, 0xf88800, 0xf88c08, 0xf89818,
+ 0xf8a840, 0xe88c20, 0xe07c08, 0xf08818, 0xf89018, 0xf88810, 0xf88000, 0xf88808, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88408, 0xf88408,
+ 0xb87c30, 0xd09448, 0xf8cc78, 0xf8d880, 0xf8a038, 0xe08018, 0xf88408, 0xf88808, 0xf88400, 0xf88400, 0xf88400, 0xf88808, 0xf88c10, 0xf88810, 0xf88818, 0xf88818,
+ 0xf88000, 0xf88000, 0xf88810, 0xf08008, 0xd08c40, 0xf8c070, 0xf8f8d0, 0xf8f4d0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xd09040, 0xf8b468, 0xe8ecd0, 0xf8f8e0, 0xf8ecb8, 0xd0a070, 0xf87000, 0xf88808, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88818, 0xf88818, 0xf88820, 0xf88820,
+ 0xf88410, 0xf88418, 0xf88810, 0xf88810, 0xf88408, 0xf88408, 0xf88400, 0xf88400, 0xf88808, 0xf88808, 0xf88808, 0xf88808, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88818,
+ 0xf88c10, 0xf88810, 0xf88410, 0xf88410, 0xf88418, 0xf88418, 0xf88818, 0xf88418, 0xf88c18, 0xf88c18, 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88408, 0xf88408,
+ 0xf88800, 0xf88800, 0xf88400, 0xf88400, 0xf88408, 0xf88400, 0xf88c10, 0xf88c10, 0xc89440, 0xf8d080, 0xf8f4e0, 0xf8f8e8, 0xf8f8f8, 0xf0f0f8, 0xf8fce8, 0xf8f8e8,
+ 0xf8e4a8, 0xe0a468, 0xc07830, 0xd08840, 0xf89430, 0xf08c28, 0xf88008, 0xf88c10, 0xf88400, 0xf88000, 0xf88400, 0xf88400, 0xf88800, 0xf88800, 0xf88800, 0xf88800,
+ 0xf88808, 0xf89418, 0xf88c10, 0xf88000, 0xf88808, 0xf88c08, 0xf88808, 0xf88c10, 0xf88c10, 0xf88c08, 0xf88c08, 0xf89010, 0xf88c08, 0xf88808, 0xf88808, 0xf89010,
+ 0xe08820, 0xe88c20, 0xf09020, 0xf09020, 0xf88810, 0xf88810, 0xf88808, 0xf88808, 0xf88400, 0xf88400, 0xf88400, 0xf88400, 0xf88408, 0xf88408, 0xf88808, 0xf88808,
+ 0xc89048, 0xc88c40, 0xf0a050, 0xf8ac58, 0xf09028, 0xe08018, 0xf88810, 0xf88810, 0xf88c08, 0xf88808, 0xf88408, 0xf88808, 0xf88c10, 0xf88810, 0xf88818, 0xf88c20,
+ 0xf88800, 0xf88800, 0xf89420, 0xf08008, 0xc87c30, 0xf8ac60, 0xf8f0c8, 0xf8f8d0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xe08c38, 0xf8b058, 0xe8ecd0, 0xf8f8e0, 0xf8ecb8, 0xc8a470, 0xf87000, 0xf88c08, 0xf88408, 0xf88408, 0xf88828, 0xf88828, 0xf88818, 0xf88818, 0xf88810, 0xf88810,
+ 0xf88410, 0xf88410, 0xf88410, 0xf88410, 0xf88408, 0xf88408, 0xf88408, 0xf88808, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810,
+ 0xf89410, 0xf09010, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88c18, 0xf88c18, 0xf88c18, 0xf09010, 0xf09010, 0xf09408, 0xf09408, 0xf89010, 0xf89010, 0xf88c10, 0xf88c10,
+ 0xf88800, 0xf88800, 0xf88010, 0xf88010, 0xf88008, 0xf87c08, 0xf08808, 0xf08808, 0xc89038, 0xf8d078, 0xf8f4e0, 0xf8f8e8, 0xf8f4f8, 0xf8f0f0, 0xf8fce0, 0xf8f8d8,
+ 0xf89430, 0xf89028, 0xf88c20, 0xf08818, 0xf88410, 0xf88810, 0xf88810, 0xf88808, 0xf88400, 0xf88408, 0xf88808, 0xf88810, 0xf88c18, 0xf89018, 0xf89020, 0xf88c18,
+ 0xf88410, 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88810, 0xf88410, 0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88c10, 0xf88810, 0xf88408, 0xf88408, 0xf88808,
+ 0xe08c28, 0xe89028, 0xe89028, 0xe88820, 0xe88018, 0xe88420, 0xf88820, 0xf88c20, 0xf88c18, 0xf88c18, 0xf88810, 0xf88810, 0xf88408, 0xf88408, 0xf88808, 0xf88808,
+ 0xf89420, 0xf88818, 0xf08418, 0xf88c20, 0xf89830, 0xf09028, 0xe88c20, 0xe88c20, 0xf09028, 0xf08c28, 0xf88c20, 0xf89020, 0xf88c18, 0xf88818, 0xf88410, 0xf88410,
+ 0xf88800, 0xf88400, 0xf88c10, 0xf88410, 0xc88030, 0xe09848, 0xf0d8b0, 0xf8f4d0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xe09038, 0xf8b860, 0xf0f4d8, 0xf8fce0, 0xf8f4c0, 0xc8a870, 0xf87400, 0xf88c10, 0xf88000, 0xf88000, 0xf88828, 0xf88828, 0xf88818, 0xf88818, 0xf88810, 0xf88810,
+ 0xf88818, 0xf88818, 0xf88c18, 0xf88818, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88c10,
+ 0xf09410, 0xf09010, 0xf88c10, 0xf88c10, 0xf88c18, 0xf88c18, 0xf88c18, 0xf88810, 0xf09010, 0xf09010, 0xf09408, 0xf09408, 0xf89010, 0xf89010, 0xf88c10, 0xf88c10,
+ 0xf88c00, 0xf88c00, 0xf88410, 0xf88810, 0xf88410, 0xf88410, 0xf88c10, 0xf88c08, 0xc88830, 0xf8d078, 0xf8f0e0, 0xf8f4e0, 0xf8f8f8, 0xf8f8f8, 0xf8f8d8, 0xe8e4c0,
+ 0xe07c18, 0xe88820, 0xf88c20, 0xf08818, 0xf88410, 0xf88c18, 0xf88810, 0xf88408, 0xf88808, 0xf88808, 0xf88408, 0xf88408, 0xf88410, 0xf88810, 0xf88418, 0xf08010,
+ 0xf88810, 0xf88408, 0xf88410, 0xf89018, 0xf89018, 0xf88810, 0xf88810, 0xf88c18, 0xf88408, 0xf88408, 0xf88808, 0xf88810, 0xf88810, 0xf88808, 0xf88808, 0xf88408,
+ 0xf09830, 0xe08820, 0xd88018, 0xe08420, 0xe88820, 0xe88420, 0xf08018, 0xf08418, 0xf88818, 0xf88818, 0xf88810, 0xf88810, 0xf88808, 0xf88408, 0xf88408, 0xf88408,
+ 0xf89020, 0xf89020, 0xf08818, 0xf08818, 0xf09028, 0xe88c20, 0xd88018, 0xd87c18, 0xe07c18, 0xe08418, 0xf08418, 0xf08818, 0xf88410, 0xf88818, 0xf88410, 0xf88410,
+ 0xf88800, 0xf88000, 0xf88810, 0xf88c10, 0xd88c40, 0xd88c38, 0xd8bc90, 0xf8f4c8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xe88420, 0xf8ac48, 0xe8ecd0, 0xf0f4d8, 0xf8ecc0, 0xb0a070, 0xe06c00, 0xf88410, 0xf88018, 0xf88018, 0xf08838, 0xf08c38, 0xf88818, 0xf88820, 0xf88800, 0xf88800,
+ 0xf88008, 0xf88008, 0xf88410, 0xf88010, 0xf88018, 0xf08018, 0xe88020, 0xe88420, 0xe88020, 0xe88018, 0xf08010, 0xf08018, 0xf88008, 0xf88410, 0xf88408, 0xf88008,
+ 0xf88810, 0xf88810, 0xf88018, 0xf88418, 0xf88020, 0xf88020, 0xf88018, 0xf87c18, 0xf07c08, 0xf87c08, 0xf08008, 0xf08008, 0xf07c18, 0xf07c18, 0xf87c20, 0xf87c20,
+ 0xe89020, 0xe89020, 0xf08838, 0xf08838, 0xf88428, 0xf88428, 0xe88c18, 0xe08c10, 0xc09038, 0xf8d480, 0xf8f0e0, 0xf8f0e0, 0xf8f8f0, 0xf8f8f0, 0xf8e4b8, 0xc8b888,
+ 0xf88408, 0xf87800, 0xf87800, 0xf88808, 0xf89010, 0xf88808, 0xf88410, 0xf88810, 0xf89020, 0xf88c20, 0xe88828, 0xe88420, 0xe08c30, 0xe89440, 0xe89848, 0xe89448,
+ 0xf88c28, 0xf07818, 0xf07810, 0xf88420, 0xf88420, 0xf88018, 0xf88018, 0xf88418, 0xf88418, 0xf88420, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88418,
+ 0xd89840, 0xe09840, 0xe8a050, 0xf8b060, 0xf8b868, 0xf8b468, 0xf0a050, 0xe89448, 0xe88c38, 0xe88c38, 0xf08c30, 0xf08c30, 0xf88c20, 0xf88c20, 0xf88818, 0xf88810,
+ 0xf88400, 0xf88000, 0xf89018, 0xf88810, 0xd08428, 0xe09438, 0xe0ac68, 0xd8a860, 0xe0a868, 0xe0ac68, 0xe89c48, 0xd88c38, 0xf07c18, 0xf88820, 0xf88810, 0xf88408,
+ 0xf88800, 0xf88800, 0xf88818, 0xf89020, 0xd89850, 0xc08440, 0xc0ac88, 0xf8f8d8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88018, 0xf88418, 0xf88400, 0xf88400, 0xf88818, 0xf88818, 0xf88410, 0xf88410,
+ 0xe88828, 0xf8b050, 0xf0f0d8, 0xf8fce0, 0xf8f4c8, 0xb8a478, 0xe07000, 0xf88410, 0xf87c10, 0xf88010, 0xe88830, 0xf08c38, 0xf88820, 0xf88820, 0xf88800, 0xf88800,
+ 0xf88410, 0xf88810, 0xf88818, 0xf88818, 0xf88420, 0xf88418, 0xf08420, 0xf08420, 0xf08420, 0xe88420, 0xf08418, 0xf88818, 0xf88810, 0xf88c18, 0xf88c10, 0xf88810,
+ 0xf88810, 0xf88808, 0xf88418, 0xf88418, 0xf88020, 0xf87c20, 0xf87c18, 0xf87818, 0xf88410, 0xf88410, 0xf88810, 0xf88810, 0xf88418, 0xf88418, 0xf88020, 0xf88020,
+ 0xe89428, 0xe89428, 0xf88c38, 0xf88c38, 0xf88830, 0xf88828, 0xe89018, 0xe89018, 0xc89440, 0xf8d880, 0xf8f0e0, 0xf8f0e0, 0xf8f8f0, 0xf8f4f0, 0xe0d0a0, 0xa09060,
+ 0xf88000, 0xf88400, 0xf88808, 0xf88c10, 0xf88808, 0xf88008, 0xf88410, 0xf88c18, 0xf88818, 0xf08418, 0xe88420, 0xe88828, 0xf09840, 0xf8b058, 0xf8c070, 0xf8c070,
+ 0xf8a440, 0xf88c28, 0xf88420, 0xf88820, 0xf88820, 0xf88820, 0xf88c28, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88820, 0xf88418, 0xf88018,
+ 0xd08830, 0xf8b860, 0xf8e090, 0xf8ec98, 0xf8eca0, 0xf8eca0, 0xf8d080, 0xf8ac60, 0xe08830, 0xe08c38, 0xf08c30, 0xf08c30, 0xf89020, 0xf88c20, 0xf88c18, 0xf88c18,
+ 0xf88800, 0xf87400, 0xf89018, 0xf88c10, 0xc87820, 0xf8ac50, 0xf8f4b0, 0xf8eca8, 0xf8f4b0, 0xf8eca8, 0xf8c470, 0xe09440, 0xf07810, 0xf88418, 0xf88808, 0xf88008,
+ 0xf88000, 0xf88c00, 0xf88818, 0xf88818, 0xd09450, 0xb87c38, 0xb8a080, 0xf8f8d8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xd88830, 0xf8ac50, 0xf0fcf0, 0xe8f4f0, 0xf8f8f0, 0xf0f0e8, 0xf0f4e8, 0xf0f0e8, 0xf8f0c8, 0xe8d8b8, 0xf89c28, 0xe88410, 0xf88410, 0xf88c18, 0xf88418, 0xf88410,
+ 0xf88410, 0xf88408, 0xe08420, 0xe88c28, 0xe0e8a0, 0xf0f4a8, 0xf8e8d8, 0xf8ecd8, 0xf8f0e8, 0xf8ece8, 0xf0fcc8, 0xe8f0c0, 0xf8ac58, 0xe08838, 0xf88000, 0xf88808,
+ 0xf88808, 0xf88c10, 0xf88c10, 0xf88810, 0xf88828, 0xe07010, 0xd0bc78, 0xf8f0a8, 0xf8f4e0, 0xf0f0d8, 0xf8f0e8, 0xf8f0e8, 0xe0fcc0, 0xe8fcc8, 0xf8c480, 0xd08848,
+ 0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88808, 0xf88408, 0xf88810, 0xf88410, 0xc08848, 0xf8d088, 0xf8fce8, 0xf8f8e0, 0xf8f8d0, 0xf8ecc8, 0xf8b460, 0xd88c38,
+ 0xf88010, 0xf88010, 0xf87c08, 0xf88010, 0xf88010, 0xf88010, 0xf88820, 0xf88420, 0xd89c50, 0xc08438, 0xc0b078, 0xf8ecb8, 0xf8fce0, 0xf8f8d8, 0xf8f8e0, 0xf8fce8,
+ 0xf8f0c8, 0xf8ecc8, 0xf8d8a0, 0xd0a470, 0xd08430, 0xe09440, 0xf88c20, 0xf89028, 0xf88410, 0xf88810, 0xf88010, 0xf88008, 0xf88818, 0xf88418, 0xf87c18, 0xf87c18,
+ 0xd0d4a8, 0xf0f0c8, 0xf8f4e8, 0xf8f8f0, 0xf8f4f8, 0xf8f0f0, 0xf8fce0, 0xf8f4d0, 0xd8a450, 0xc89440, 0xf88010, 0xf88c18, 0xf88408, 0xf88008, 0xf88c10, 0xf88c10,
+ 0xf88808, 0xf88400, 0xd08838, 0xd08838, 0xe8dcb8, 0xf8f4d8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf0, 0xf8f4e0, 0xf8fcd8, 0xf0e4c8, 0xe09c50, 0xc88438, 0xf88400, 0xf88800,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe09038, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8d0, 0xe8dcb8, 0xf8a030, 0xe88410, 0xf88410, 0xf88c18, 0xf88418, 0xf88410,
+ 0xf88810, 0xf88410, 0xe08828, 0xe89030, 0xe8f0a8, 0xf8fcb0, 0xf8f0e0, 0xf8f8e8, 0xf8f4f0, 0xf8f0f0, 0xf8fcd0, 0xf0f4c8, 0xf8b060, 0xe08c38, 0xf88408, 0xf88c10,
+ 0xf88808, 0xf88c10, 0xf88c10, 0xf88810, 0xf88828, 0xe07010, 0xd0c078, 0xf8f0a8, 0xf8fce8, 0xf8f8e0, 0xf8f8f0, 0xf8f8f0, 0xe8fcc8, 0xf0fcd0, 0xf8cc88, 0xd89050,
+ 0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88808, 0xf88408, 0xf88810, 0xf88410, 0xc08848, 0xf8cc88, 0xf8fce0, 0xf8f8e0, 0xf8f4d0, 0xf8e0b8, 0xf8a850, 0xd88c38,
+ 0xf88010, 0xf88010, 0xf87c08, 0xf88010, 0xf88010, 0xf88010, 0xf88420, 0xf88420, 0xc88840, 0xe8ac60, 0xf0e4b0, 0xf8f4c0, 0xf8f8d8, 0xf8fcd8, 0xf8fce8, 0xf0f4e0,
+ 0xf8fcd8, 0xf8f4d0, 0xf8e8b0, 0xf0c490, 0xe8a050, 0xd89040, 0xe87c10, 0xf88818, 0xf88410, 0xf88810, 0xf88410, 0xf88008, 0xf88820, 0xf88418, 0xf87c18, 0xf88018,
+ 0xe0e0b8, 0xf8f8d0, 0xf8f4e8, 0xf8f8f0, 0xf8f4f8, 0xf8f0f0, 0xf8fce0, 0xf8f4d8, 0xe8b460, 0xd09c48, 0xf88010, 0xf88818, 0xf88408, 0xf88008, 0xf88c10, 0xf88c10,
+ 0xf88800, 0xf88000, 0xd08c40, 0xd89848, 0xf0e0c0, 0xf8f4d8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8f4e0, 0xf8fce0, 0xf8e8c8, 0xe8a050, 0xc88438, 0xf88400, 0xf88800,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe09030, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a430, 0xf08418, 0xf88410, 0xf88c10, 0xf88418, 0xf88418,
+ 0xf88410, 0xf88010, 0xe08428, 0xe89030, 0xe8f0a8, 0xf8fcb8, 0xf8f4e8, 0xf8f8e8, 0xf8f4f8, 0xf8f0f0, 0xf8fcd8, 0xf0f4c8, 0xf8b060, 0xe08c38, 0xf88008, 0xf88810,
+ 0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88828, 0xe07010, 0xd0c078, 0xf8f0b0, 0xf8fce8, 0xf8f4e8, 0xf8f8f8, 0xf8f8f0, 0xe8fcd0, 0xf0fcd0, 0xf8c888, 0xd88c50,
+ 0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88818, 0xf08418, 0xc08c48, 0xf8cc88, 0xf8f8e0, 0xf8f8e0, 0xf8f0c0, 0xf0d0a0, 0xf09838, 0xe88c28,
+ 0xf88810, 0xf88810, 0xf88408, 0xf88810, 0xf88410, 0xf88410, 0xf08828, 0xf08420, 0xd09858, 0xf8d490, 0xf8fcd8, 0xf8f8d0, 0xf0f4e8, 0xf8fce8, 0xf8fcf8, 0xf0f4f0,
+ 0xf8fce8, 0xf0f4e0, 0xf8f8d0, 0xf8ecc0, 0xf8c880, 0xe09c50, 0xe07810, 0xf88c28, 0xf88410, 0xf88810, 0xf88410, 0xf88408, 0xf88c20, 0xf88418, 0xf08018, 0xf88020,
+ 0xe8f0c8, 0xf8fcd8, 0xf8f4f0, 0xf8f4f0, 0xf8f4f8, 0xf8f0f8, 0xf8fce0, 0xf8f8d8, 0xf8c878, 0xd8a858, 0xf88010, 0xf88418, 0xf88008, 0xf88008, 0xf88c10, 0xf89018,
+ 0xf88808, 0xf88000, 0xd89448, 0xf0ac60, 0xf8ecd0, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0f4e8, 0xf8fce0, 0xf8f0d0, 0xf0ac60, 0xd08c40, 0xf88400, 0xf88400,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c30, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a430, 0xe88410, 0xf88408, 0xf88c10, 0xf88418, 0xf88418,
+ 0xf88010, 0xf88008, 0xe08420, 0xe88c30, 0xe8f0a8, 0xf8fcb8, 0xf8f4e8, 0xf8f8e8, 0xf8f4f8, 0xf8f0f0, 0xf8fcd8, 0xf0f4c8, 0xf8b060, 0xe08838, 0xf87c08, 0xf88410,
+ 0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88828, 0xe07010, 0xd0c078, 0xf8f4b0, 0xf8f8e8, 0xf8f4e0, 0xf8f8f0, 0xf8f4f0, 0xe8fcc8, 0xe8fcd0, 0xf8c888, 0xd88848,
+ 0xf88408, 0xf88408, 0xf88810, 0xf88810, 0xf88810, 0xf88810, 0xf88818, 0xf08418, 0xc89050, 0xf8d090, 0xf8f8e0, 0xf8fce0, 0xf8f0c0, 0xe0c498, 0xe88c28, 0xe89030,
+ 0xf88c10, 0xf88c10, 0xf88810, 0xf88810, 0xf88410, 0xf88410, 0xf08428, 0xe88020, 0xf8c888, 0xf8e8a8, 0xf8f8d0, 0xf8f4d0, 0xf8fcf0, 0xf8fcf0, 0xf0f4f0, 0xf8fcf8,
+ 0xf8f8e8, 0xf0f4e0, 0xf8f8d0, 0xf8f8d0, 0xf8e098, 0xf0ac60, 0xe87c10, 0xf89028, 0xf88410, 0xf88810, 0xf88410, 0xf88410, 0xf88c20, 0xf88418, 0xf88020, 0xf88420,
+ 0xf0f4d0, 0xf8fcd8, 0xf8f0e8, 0xf8f4f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xf8f8d8, 0xf8d888, 0xe0ac58, 0xf88010, 0xf88010, 0xf88008, 0xf88008, 0xf88c10, 0xf88c10,
+ 0xf88808, 0xf87c00, 0xd89850, 0xf8c478, 0xf8f4d8, 0xf8f8d8, 0xf8f8e8, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8, 0xf8fce0, 0xf8f4d8, 0xf8b868, 0xd09048, 0xf88000, 0xf88400,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a030, 0xe88010, 0xf88010, 0xf88c18, 0xf88418, 0xf88418,
+ 0xf88818, 0xf88410, 0xe08428, 0xe89030, 0xe8f4b0, 0xf8fcc0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xf0f4d0, 0xf8b060, 0xe08c40, 0xf88010, 0xf88810,
+ 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88828, 0xe07010, 0xd0c080, 0xf8f4b8, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xf0fcd8, 0xf8c890, 0xd88c50,
+ 0xf88008, 0xf88408, 0xf88810, 0xf88c10, 0xf88c10, 0xf88810, 0xf08820, 0xf08420, 0xc89450, 0xf8d090, 0xf8f8d8, 0xf8fce0, 0xf8f0b8, 0xe0b880, 0xe88010, 0xf89028,
+ 0xf89010, 0xf89010, 0xf88c10, 0xf88c10, 0xf88818, 0xf88810, 0xe88830, 0xe08428, 0xf8e8b0, 0xf8ecb8, 0xf8f0d8, 0xf8f4e0, 0xf8fcf8, 0xf8fcf8, 0xe8f4f8, 0xf8fcf8,
+ 0xe8fcf8, 0xe8fcf8, 0xf8fce8, 0xf8f8e0, 0xf8e8a8, 0xf0b478, 0xe07810, 0xf08820, 0xf88810, 0xf88810, 0xf88410, 0xf88810, 0xf88c20, 0xf88818, 0xf08420, 0xf88c28,
+ 0xf0f4d0, 0xf8fcd8, 0xf8f0f0, 0xf8f8f8, 0xf8f8f8, 0xf8f4f8, 0xf8fce8, 0xf0f8d8, 0xf8e090, 0xd8ac60, 0xf08010, 0xf88418, 0xf88410, 0xf88410, 0xf88c18, 0xf88c18,
+ 0xf88808, 0xf87c00, 0xd89c58, 0xf8d088, 0xf8f8e0, 0xf8fce0, 0xf0f8f0, 0xf8fcf8, 0xf0fcf0, 0xf0f8f0, 0xf8fce8, 0xf8f4e0, 0xf8c078, 0xd09450, 0xf88000, 0xf88408,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a030, 0xe88010, 0xf88010, 0xf88c18, 0xf88418, 0xf88418,
+ 0xf88418, 0xf88010, 0xe08428, 0xe88c30, 0xe8f0b0, 0xf8fcc0, 0xf8f4e8, 0xf8f4e8, 0xf8f4f8, 0xf8f0f8, 0xf8fcd8, 0xf0f4c8, 0xf8b060, 0xe08840, 0xf88008, 0xf88810,
+ 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88828, 0xe07010, 0xd0c080, 0xf8f4b8, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf0fcd8, 0xf0fcd8, 0xf8c890, 0xd88c50,
+ 0xf88008, 0xf88408, 0xf88810, 0xf88c10, 0xf88c10, 0xf88810, 0xf08820, 0xf08420, 0xc08c50, 0xf8d090, 0xf8f4d8, 0xf8f8d8, 0xf8f0b8, 0xe0b480, 0xe07808, 0xf89020,
+ 0xf89010, 0xf89010, 0xf88c10, 0xf88c10, 0xf88818, 0xf88810, 0xe88830, 0xe08428, 0xf8e0a8, 0xf8f0b8, 0xf8fce8, 0xf8f4e0, 0xf0fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0f4f8,
+ 0xe8fcf8, 0xe8fcf8, 0xf8f8e0, 0xf8f0d8, 0xf8e8a8, 0xf0b878, 0xe07810, 0xf08820, 0xf88810, 0xf88810, 0xf88408, 0xf88410, 0xf88c20, 0xf88418, 0xf08820, 0xf89430,
+ 0xf0f0d0, 0xf8fcd8, 0xf8f0f0, 0xf8f8f8, 0xf8f8f8, 0xf8f4f8, 0xf8fce8, 0xf8f8e0, 0xf8e090, 0xd8ac60, 0xf07c10, 0xf88418, 0xf88410, 0xf88410, 0xf88c18, 0xf88c18,
+ 0xf88808, 0xf87c00, 0xd89c58, 0xf8d890, 0xf8f4d8, 0xf8fce0, 0xf0f8f0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8fce0, 0xf8f4d8, 0xf8c480, 0xd89850, 0xf87c00, 0xf88808,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c30, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a030, 0xe88418, 0xf88410, 0xf88c18, 0xf88418, 0xf88418,
+ 0xf88018, 0xf87c10, 0xd88028, 0xe08830, 0xe8f0b0, 0xf8fcc0, 0xf8f0f0, 0xf8f4f0, 0xf8f4f8, 0xf8f0f8, 0xf8fcd8, 0xe8f0d0, 0xf8ac60, 0xe08840, 0xf87c10, 0xf88418,
+ 0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fcd8, 0xf8c490, 0xd08858,
+ 0xf88008, 0xf88408, 0xf88810, 0xf88818, 0xf88c18, 0xf88818, 0xf08820, 0xe88820, 0xb88c50, 0xf8d498, 0xf8f4d8, 0xf8f8d8, 0xf8f0b8, 0xe0b880, 0xe07408, 0xf89020,
+ 0xf88c10, 0xf88c10, 0xf88810, 0xf88c10, 0xf88818, 0xf88418, 0xe88830, 0xe08430, 0xf8e4b0, 0xf8f0b8, 0xf8fce8, 0xf8fce8, 0xe0f8f8, 0xf0fcf8, 0xe8fcf8, 0xe0f8f8,
+ 0xe0fcf8, 0xe0fcf8, 0xf8f4e8, 0xf8f4e8, 0xf8ecb0, 0xf0b880, 0xe07810, 0xf89028, 0xf88810, 0xf88810, 0xf88408, 0xf88810, 0xf89020, 0xf88818, 0xf08c28, 0xf89c38,
+ 0xf0f4d8, 0xf8fce0, 0xf8f4f0, 0xf8fcf8, 0xf8fcf8, 0xf0f4f8, 0xf8fce8, 0xf8fce8, 0xf8e898, 0xd8b068, 0xf07c18, 0xf88420, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18,
+ 0xf88810, 0xf87c00, 0xd89c58, 0xf8d898, 0xf0f0d8, 0xf8fce8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8f0e0, 0xf8c880, 0xd89c58, 0xf87c00, 0xf88c10,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b458, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e4c0, 0xf8a030, 0xf08418, 0xf88410, 0xf88c18, 0xf88418, 0xf88418,
+ 0xf88418, 0xf88010, 0xd88028, 0xe88c38, 0xf0f4b8, 0xf8fcc8, 0xf8f8f0, 0xf8f8f0, 0xf8f4f8, 0xf8f0f8, 0xf8fce0, 0xf0f4d0, 0xf8b068, 0xe08c40, 0xf88010, 0xf88818,
+ 0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xf0fcd8, 0xf8c890, 0xd08858,
+ 0xf88008, 0xf88408, 0xf88810, 0xf88818, 0xf88c18, 0xf88818, 0xf08820, 0xe88820, 0xb88c50, 0xf8d898, 0xf8f8d8, 0xf8f8d8, 0xf8f4b8, 0xe8bc80, 0xe87808, 0xf89020,
+ 0xf88c10, 0xf88c10, 0xf88810, 0xf88c10, 0xf88818, 0xf88818, 0xe88830, 0xe08830, 0xf8f4c0, 0xf8e8b0, 0xf8fce8, 0xf8fce8, 0xe0f8f8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8,
+ 0xe0fcf8, 0xe0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8f0b8, 0xe8b078, 0xd06c08, 0xf89030, 0xf88810, 0xf88810, 0xf88408, 0xf88408, 0xf89020, 0xf88818, 0xf09028, 0xf8a040,
+ 0xf0f8d8, 0xf8fce0, 0xf8f4f0, 0xf8fcf8, 0xf8f8f8, 0xf0f0f8, 0xf8fce8, 0xf8fce8, 0xf8eca0, 0xd8b468, 0xf08018, 0xf88420, 0xf88410, 0xf88410, 0xf88c18, 0xf89020,
+ 0xf88810, 0xf87c00, 0xd89c58, 0xf8d890, 0xf0e8d0, 0xf8fce8, 0xf0fcf8, 0xf0f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8f8e0, 0xf0f0d8, 0xf8c888, 0xd89c58, 0xf87c00, 0xf88c10,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88410, 0xf88818, 0xf88418, 0xf88820,
+ 0xf88418, 0xf88018, 0xd88430, 0xe89038, 0xe8f0b8, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c48, 0xf88010, 0xf88818,
+ 0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xf0fce0, 0xf8c898, 0xd08858,
+ 0xf88408, 0xf88408, 0xf88818, 0xf88c18, 0xf88c18, 0xf88818, 0xf08820, 0xf08820, 0xb88848, 0xf8dca0, 0xf8f4d8, 0xf8f8e0, 0xf8f8c8, 0xe8c890, 0xe88820, 0xf09028,
+ 0xf88818, 0xf88410, 0xf88010, 0xf88810, 0xf88818, 0xf88018, 0xe88428, 0xe88028, 0xf8cc88, 0xf8e4a0, 0xf8f8d0, 0xf8fcd8, 0xf0fcf8, 0xe8fcf0, 0xe0f8f8, 0xf0fcf8,
+ 0xf0fcf8, 0xe8f4f0, 0xf8f4d8, 0xf8fce0, 0xf8e0a0, 0xe8a868, 0xe88018, 0xf08420, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf09028, 0xf89c38,
+ 0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce8, 0xf8eca0, 0xd0b068, 0xe87c18, 0xf08820, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18,
+ 0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88410, 0xf88818, 0xf88418, 0xf88820,
+ 0xf88418, 0xf88018, 0xd88430, 0xe89038, 0xe8f0b8, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c48, 0xf88010, 0xf88818,
+ 0xf88810, 0xf88c10, 0xf88818, 0xf88410, 0xf88830, 0xe07018, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xf0fce0, 0xf8c898, 0xd08858,
+ 0xf88408, 0xf88408, 0xf88818, 0xf88c18, 0xf88c18, 0xf88818, 0xf08820, 0xf08820, 0xc08c50, 0xf8dca0, 0xf8f4d8, 0xf8f4d8, 0xf8f8c0, 0xf0cc98, 0xf89430, 0xf08c28,
+ 0xf88818, 0xf88810, 0xf88410, 0xf88410, 0xf88418, 0xf88010, 0xf08428, 0xf08428, 0xc09450, 0xf8d898, 0xf8fcd8, 0xf8f8d0, 0xe0f4e8, 0xf0fcf8, 0xf0fcf8, 0xe0f0f0,
+ 0xe8f4f0, 0xf0fcf8, 0xf8fce0, 0xf8f0d0, 0xf8c078, 0xd89850, 0xf08420, 0xf89028, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf09028, 0xf89c38,
+ 0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce8, 0xf8eca0, 0xd0b068, 0xe87c18, 0xf08820, 0xf88418, 0xf88410, 0xf88c18, 0xf88c18,
+ 0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88418, 0xf88820,
+ 0xf88418, 0xf88010, 0xd88430, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c40, 0xf88010, 0xf88818,
+ 0xf88c10, 0xf88c10, 0xf88810, 0xf88410, 0xf88830, 0xe07418, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xe8fce0, 0xf8c898, 0xd08858,
+ 0xf88410, 0xf88810, 0xf88818, 0xf88c18, 0xf88810, 0xf88810, 0xf88818, 0xf88410, 0xc08848, 0xf8d490, 0xf8f8e0, 0xf8f8e0, 0xf8f8d8, 0xf8e8c0, 0xf8b868, 0xd89040,
+ 0xf88418, 0xf88418, 0xf88010, 0xf88418, 0xf88010, 0xf87c10, 0xf88420, 0xf88420, 0xc88030, 0xf0ac58, 0xf8e4a0, 0xf8f8b8, 0xf8f8c8, 0xf8fcc8, 0xf8fcd0, 0xf8f4c8,
+ 0xf8f4c8, 0xf8f8c8, 0xf8f0b0, 0xf8c888, 0xe89840, 0xd88430, 0xf08418, 0xf89020, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf08c28, 0xf89c38,
+ 0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf0fce8, 0xf0fce8, 0xf8eca0, 0xd0b468, 0xe87c10, 0xf08820, 0xf88418, 0xf88010, 0xf88818, 0xf88c18,
+ 0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8f8e8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b058, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xf0e0c0, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88418, 0xf88820,
+ 0xf88418, 0xf88010, 0xd88430, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8fce0, 0xe8f4d0, 0xf8b068, 0xe08c40, 0xf88010, 0xf88818,
+ 0xf88c10, 0xf88c10, 0xf88810, 0xf88410, 0xf88830, 0xe07418, 0xd0c080, 0xf8f4b8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fce0, 0xe8fce0, 0xf8c898, 0xd08858,
+ 0xf88410, 0xf88810, 0xf88818, 0xf88c18, 0xf88810, 0xf88810, 0xf88818, 0xf88410, 0xb88440, 0xf8d088, 0xf8fce8, 0xf8fce8, 0xf8fcd8, 0xf8f4d0, 0xf8c470, 0xc88430,
+ 0xf88018, 0xf88018, 0xf88010, 0xf88418, 0xf88410, 0xf87c10, 0xf88420, 0xf88820, 0xe09c48, 0xc88430, 0xc8ac68, 0xf8e0a0, 0xf8f0c0, 0xf8e8b8, 0xf8f0c8, 0xf8f8d0,
+ 0xf8f0c0, 0xf8e8b8, 0xf8c488, 0xd0a060, 0xd88830, 0xe08c38, 0xf88c20, 0xf88c20, 0xf88810, 0xf88c10, 0xf88810, 0xf88410, 0xf88c20, 0xf88818, 0xf08c28, 0xf89c38,
+ 0xf0f4d8, 0xf8fce0, 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf0fce8, 0xf0fce8, 0xf8eca0, 0xd0b468, 0xe87c10, 0xf08820, 0xf88418, 0xf88010, 0xf88818, 0xf88c18,
+ 0xf88810, 0xf07800, 0xd89c58, 0xf8d090, 0xf8f4e8, 0xf8f8e8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd09858, 0xf88008, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+ 0xf88410, 0xf88410, 0xd88428, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b068, 0xe08c40, 0xf88008, 0xf88810,
+ 0xf88c08, 0xf88c08, 0xf88810, 0xf88808, 0xf88828, 0xe07410, 0xd0c080, 0xf8f8b0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+ 0xf88410, 0xf88818, 0xf88818, 0xf88c18, 0xf88808, 0xf88808, 0xf88408, 0xf88408, 0xc88840, 0xf8d088, 0xf8fcf0, 0xf8fce8, 0xf0f8e8, 0xf0f8e0, 0xf8d098, 0xb08c50,
+ 0xe88018, 0xf08418, 0xf88410, 0xf88c18, 0xf88810, 0xf88408, 0xf88410, 0xf88818, 0xf89428, 0xe88018, 0xd88828, 0xe89c40, 0xf0a858, 0xf8c068, 0xf8c878, 0xf8b868,
+ 0xf8b858, 0xf8a448, 0xe88828, 0xe08420, 0xf08818, 0xf89020, 0xf89018, 0xf88c18, 0xf88810, 0xf88c18, 0xf88818, 0xf88410, 0xf88c20, 0xf88818, 0xf88c20, 0xf89830,
+ 0xf8f4d0, 0xf8fcd8, 0xf8f8f8, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce0, 0xf8eca0, 0xd0b460, 0xe87c10, 0xf88818, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+ 0xf88810, 0xf87800, 0xd89c58, 0xf8cc90, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+ 0xf88410, 0xf88410, 0xd88428, 0xe89038, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b068, 0xe08c40, 0xf88008, 0xf88810,
+ 0xf88c08, 0xf88c08, 0xf88810, 0xf88808, 0xf88828, 0xe07410, 0xd0c080, 0xf8f8b0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+ 0xf88410, 0xf88818, 0xf88818, 0xf88c18, 0xf88808, 0xf88808, 0xf88408, 0xf88408, 0xc89048, 0xf8d088, 0xf0fce8, 0xf0fce8, 0xf0f8e0, 0xf8fce8, 0xf8e8b0, 0xe0bc80,
+ 0xe88418, 0xf08418, 0xf88410, 0xf88c18, 0xf88810, 0xf88410, 0xf88810, 0xf88818, 0xf08c20, 0xf08c20, 0xd89030, 0xd08828, 0xd08430, 0xe09440, 0xf09848, 0xe09040,
+ 0xe09030, 0xd88828, 0xe08420, 0xe88c28, 0xf89028, 0xf89020, 0xf88810, 0xf88c18, 0xf88810, 0xf88c18, 0xf88818, 0xf88410, 0xf88c20, 0xf88818, 0xf88c20, 0xf89830,
+ 0xf8f4d0, 0xf8fcd8, 0xf8f8f8, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce8, 0xf0fce0, 0xf8eca0, 0xd0b460, 0xe87c10, 0xf88818, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+ 0xf88810, 0xf87800, 0xd89c58, 0xf8cc90, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+ 0xf88410, 0xf88410, 0xd88428, 0xe89030, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b060, 0xe08c40, 0xf88008, 0xf88810,
+ 0xf88c08, 0xf88c08, 0xf88c08, 0xf88808, 0xf88c28, 0xe07410, 0xd0c480, 0xf8f8b0, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+ 0xf08818, 0xf88818, 0xf88818, 0xf88c18, 0xf88800, 0xf88400, 0xf88400, 0xf88000, 0xc88c40, 0xf8cc88, 0xf0fce8, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcd0, 0xf8f0c8,
+ 0xe89428, 0xe09020, 0xf88810, 0xf88c18, 0xf88808, 0xf88808, 0xf88408, 0xf88808, 0xf88810, 0xf88c10, 0xf88818, 0xf88418, 0xf88018, 0xf07810, 0xf87410, 0xf88020,
+ 0xf88808, 0xf88808, 0xf88c10, 0xf89010, 0xf89010, 0xf88808, 0xf88408, 0xf88808, 0xf88810, 0xf88c18, 0xf88818, 0xf88418, 0xf88820, 0xf88818, 0xf88820, 0xf89830,
+ 0xf8f4d0, 0xf8f8d8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce0, 0xf0fce0, 0xf8ec98, 0xd0b460, 0xf07c10, 0xf88418, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+ 0xf88810, 0xf87800, 0xd89c58, 0xf8cc88, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8f8d8, 0xd0b490, 0xf88418, 0xf88418, 0xf88400, 0xf88808, 0xf88c18, 0xf88c18, 0xf88810, 0xf88408,
+ 0xe08c38, 0xf8b060, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf0e0c8, 0xf8a038, 0xe88418, 0xf88010, 0xf88818, 0xf88420, 0xf88820,
+ 0xf88410, 0xf88410, 0xd88428, 0xe89030, 0xe8f4b0, 0xf8fcc0, 0xf8f4e8, 0xf8f8f0, 0xf8f8f8, 0xf8f4f8, 0xf8fcd8, 0xe8f8d0, 0xf8b060, 0xe08c40, 0xf88008, 0xf88810,
+ 0xf88c08, 0xf88c08, 0xf88c08, 0xf88808, 0xf88c28, 0xe07410, 0xd0c480, 0xf8f8b0, 0xf8fcf0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xe8fcd8, 0xe8fce0, 0xf8c898, 0xd08858,
+ 0xf08818, 0xf88818, 0xf88818, 0xf88c18, 0xf88800, 0xf88400, 0xf88400, 0xf88000, 0xc88840, 0xf8d088, 0xf0fcf0, 0xf8fcf0, 0xf0fcf8, 0xe8f8f0, 0xf8f4c8, 0xf8f4d0,
+ 0xf09c30, 0xe89428, 0xf08810, 0xf88810, 0xf88408, 0xf88408, 0xf88408, 0xf88408, 0xf88008, 0xf89018, 0xf88818, 0xf08010, 0xf88820, 0xf88828, 0xf87c18, 0xf88420,
+ 0xf88808, 0xf88c08, 0xf88808, 0xf88408, 0xf88408, 0xf88810, 0xf88c10, 0xf88810, 0xf88810, 0xf88c18, 0xf88818, 0xf88418, 0xf88820, 0xf88818, 0xf88820, 0xf89830,
+ 0xf8f4d0, 0xf8f8d8, 0xf8f8f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fce0, 0xf0fce0, 0xf8ec98, 0xd0b460, 0xf07c10, 0xf88418, 0xf88410, 0xf88010, 0xf88818, 0xf88c18,
+ 0xf88810, 0xf87800, 0xd89c58, 0xf8cc88, 0xf8f4e0, 0xf8f8e8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fce8, 0xf8f4e0, 0xf8c480, 0xd89858, 0xf87c08, 0xf88810,
+ 0xf88418, 0xf88c20, 0xf88400, 0xf88408, 0xf09020, 0xe07c10, 0xc0a878, 0xf8f8c8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf8f4e8, 0xf8fcf0, 0xf8f4c0, 0xe0b078, 0xf08420, 0xf08420, 0xf88000, 0xf88000, 0xf88400, 0xf88400, 0xf88410, 0xf88410,
+ 0xe08c48, 0xf8b878, 0xf8f4d8, 0xf8f4d8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8d0, 0xf8dcb0, 0xf8a040, 0xe08020, 0xf88000, 0xf88808, 0xf88400, 0xf88400,
+ 0xf88400, 0xf88c00, 0xd88418, 0xe89428, 0xf8e4b0, 0xf8f8c0, 0xe8f4e0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f8d0, 0xf8e8c0, 0xf8b058, 0xe08830, 0xf87c08, 0xf88810,
+ 0xf88400, 0xf88000, 0xf88808, 0xf87c00, 0xe88820, 0xe07c18, 0xe0b880, 0xf8f0b8, 0xf0fcf0, 0xf0f8f0, 0xe8fcf8, 0xe8fcf8, 0xf8fce0, 0xf8f8d8, 0xf8c888, 0xd09458,
+ 0xf88008, 0xf88408, 0xf88400, 0xf88400, 0xf88400, 0xf88000, 0xf08418, 0xf08418, 0xc89458, 0xf8cc90, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+ 0xf8e4b8, 0xe0a070, 0xc87430, 0xe08448, 0xf89030, 0xf88020, 0xf87800, 0xf88c10, 0xf88000, 0xf88000, 0xf88400, 0xf88800, 0xf88c10, 0xf88808, 0xf09018, 0xf89018,
+ 0xf88808, 0xf88000, 0xf88810, 0xf89018, 0xf88810, 0xf07c08, 0xf88410, 0xf88c18, 0xf88808, 0xf88808, 0xf88400, 0xf88400, 0xf88808, 0xf88000, 0xf88810, 0xf8a028,
+ 0xf8f4d8, 0xf8f8d8, 0xf8f4f0, 0xf8f8f0, 0xf8f4f8, 0xf8f0f8, 0xf8f4d8, 0xf8f4d8, 0xf8e890, 0xe8b460, 0xe08c10, 0xe08c10, 0xf88408, 0xf88808, 0xf88418, 0xf88018,
+ 0xf08c10, 0xe07c00, 0xe09c40, 0xf8d078, 0xf8f0c8, 0xf8f0c8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0f4f8, 0xf8f0c8, 0xf8ecc8, 0xf8c870, 0xe09c40, 0xe88408, 0xf08c10,
+ 0xf88400, 0xf89800, 0xf88c08, 0xf88400, 0xe09038, 0xd88430, 0xd8a478, 0xf8ecc0, 0xf8f4e8, 0xf8f4e8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8fcf0, 0xf8f4c0, 0xe0b080, 0xf08420, 0xf08420, 0xf88000, 0xf88400, 0xf88408, 0xf88400, 0xf88410, 0xf88410,
+ 0xd88848, 0xf8b070, 0xf8f0d0, 0xf8f4d8, 0xe8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8d0, 0xf8e0b8, 0xf8a448, 0xe88828, 0xf88408, 0xf88c08, 0xf88400, 0xf88000,
+ 0xf88400, 0xf88800, 0xd88018, 0xe89028, 0xf8e0a8, 0xf8f8c0, 0xe8f4e0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f4c8, 0xf8e8c0, 0xf8ac58, 0xe08830, 0xf87c00, 0xf88410,
+ 0xf88800, 0xf88400, 0xf89010, 0xf88808, 0xf08c28, 0xe07c18, 0xd8b480, 0xf8ecb8, 0xf8fcf8, 0xf0fcf0, 0xe8fcf8, 0xe8f8f8, 0xf8f4d8, 0xf8f4d8, 0xf8c888, 0xc89050,
+ 0xf88408, 0xf88808, 0xf88400, 0xf88800, 0xf88400, 0xf88400, 0xf88818, 0xf08418, 0xc09050, 0xf8c888, 0xf8f8e0, 0xf8f4e0, 0xf0f4e8, 0xf8fcf0, 0xf8fce8, 0xf8fcf0,
+ 0xf8ecc0, 0xf8d4a8, 0xf8a060, 0xd07838, 0xf07410, 0xf88c28, 0xf88808, 0xf87400, 0xf88400, 0xf88000, 0xf88800, 0xf88c00, 0xf89010, 0xf88c08, 0xf09018, 0xf89018,
+ 0xf88408, 0xf88808, 0xf88408, 0xf87c00, 0xf89018, 0xf8a028, 0xf89420, 0xf07800, 0xf88408, 0xf88808, 0xf88000, 0xf88400, 0xf88808, 0xf88000, 0xf88810, 0xf89c28,
+ 0xf8f4d8, 0xf8f8d8, 0xf8f8f0, 0xf8f8f0, 0xf8f4f8, 0xf8f4f8, 0xf8f4d8, 0xf8f4d8, 0xf8e490, 0xe0ac58, 0xd88808, 0xe08c10, 0xf88808, 0xf88810, 0xf88010, 0xf87c10,
+ 0xf89418, 0xe88000, 0xe09c48, 0xf8d078, 0xf8f4d0, 0xf8f4d0, 0xf0f8f8, 0xf0f4f8, 0xf8fcf8, 0xf0f8f8, 0xf8f8d0, 0xf8f0d0, 0xf8c470, 0xe09840, 0xe88408, 0xf89010,
+ 0xf88800, 0xf89000, 0xf88808, 0xf88808, 0xe09038, 0xc87820, 0xd09c70, 0xf8f4c8, 0xf8f8e8, 0xf8f8e8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8f4c8, 0xd0b088, 0xe08430, 0xe08830, 0xf88418, 0xf88818, 0xf88820, 0xf88818, 0xe88828, 0xe88428,
+ 0xc88c50, 0xf8b880, 0xf8f4e0, 0xf8f8e0, 0xe8f8f8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f4d0, 0xf8dcb8, 0xf0a050, 0xd08438, 0xf08420, 0xf89028, 0xf88c18, 0xf88818,
+ 0xf89020, 0xf89020, 0xc88430, 0xe09840, 0xf8e4b8, 0xf8fcd0, 0xe8f8e8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f8d8, 0xf8ecc8, 0xf8b470, 0xd89048, 0xf08428, 0xf89030,
+ 0xf88c20, 0xf08818, 0xf89028, 0xf08820, 0xe08c38, 0xc87420, 0xd0ac80, 0xf8ecc0, 0xf8fcf8, 0xf0fcf0, 0xe8fcf8, 0xe8fcf8, 0xf8f8e0, 0xf8fce8, 0xf8d0a0, 0xc09460,
+ 0xe08828, 0xe88828, 0xf88818, 0xf88818, 0xf88818, 0xf88410, 0xe08830, 0xe08828, 0xb89060, 0xf8d0a0, 0xf8fcf0, 0xf8f8e8, 0xf0f4f0, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8,
+ 0xf8fce0, 0xf0fce0, 0xf0e4b0, 0xc8bc88, 0xc89048, 0xb88440, 0xe08428, 0xe88c30, 0xf88c20, 0xf88820, 0xf88820, 0xf88820, 0xf88828, 0xf88420, 0xf88028, 0xf88428,
+ 0xe89038, 0xd88028, 0xc87c28, 0xf0a050, 0xf8c070, 0xf0a858, 0xc88030, 0xd08430, 0xe08c30, 0xe09038, 0xe08830, 0xe08830, 0xe09038, 0xd88c38, 0xd89040, 0xe8a050,
+ 0xf0f8e0, 0xf8f8e8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f4f8, 0xf8f8e8, 0xf8f8e8, 0xf8eca8, 0xd0b070, 0xc08828, 0xc89030, 0xe89038, 0xe89030, 0xf08438, 0xf08438,
+ 0xd88c30, 0xc87c20, 0xc89858, 0xf8cc90, 0xf8f4e0, 0xf8fce8, 0xf0fcf8, 0xe8f8f8, 0xe8f8f8, 0xf0fcf8, 0xf8fce8, 0xf8f0d8, 0xf0c080, 0xc89458, 0xc88028, 0xd89038,
+ 0xf09020, 0xf08c20, 0xd88830, 0xe09438, 0xc89458, 0xb07838, 0xb89c78, 0xf8f8d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8f8f0, 0xf8fcf8, 0xf8ecc8, 0xd0ac80, 0xe08838, 0xe88c38, 0xf88820, 0xf88c20, 0xf89028, 0xf88c20, 0xf08c30, 0xe88c30,
+ 0xc88850, 0xf0b480, 0xf8f8e0, 0xf8f8e0, 0xf0fcf8, 0xf0fcf8, 0xe8f8f8, 0xf0fcf8, 0xf8f8d8, 0xf8e4c0, 0xf0a858, 0xd08838, 0xf08420, 0xf89028, 0xf89020, 0xf89420,
+ 0xf08c18, 0xf08818, 0xc88028, 0xe09840, 0xf0e0b0, 0xf8fcd0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf8f8d8, 0xf8e8c8, 0xf8b068, 0xd88c48, 0xe88020, 0xf08428,
+ 0xf89428, 0xf88c20, 0xf89028, 0xf08820, 0xe08c38, 0xd07c28, 0xe0b888, 0xf8f8c8, 0xf0f8f0, 0xf0fcf0, 0xf0fcf8, 0xe8fcf8, 0xf8fce8, 0xf8fce8, 0xf8cc98, 0xb88c58,
+ 0xe88c28, 0xe88c30, 0xf88c20, 0xf88c20, 0xf88c18, 0xf88818, 0xe88c30, 0xe88c30, 0xb08858, 0xf8cca0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8f8e8, 0xf8f4e8,
+ 0xf0fcd8, 0xf0f8d8, 0xf8fcc8, 0xf8f8c8, 0xf8c880, 0xc89450, 0xe08428, 0xf89840, 0xf88c28, 0xf88820, 0xf88418, 0xf88820, 0xf88420, 0xf88020, 0xf88020, 0xf88428,
+ 0xe89038, 0xd88428, 0xe89848, 0xf8d888, 0xf8f0a0, 0xe8a050, 0xc07420, 0xe89c48, 0xe08830, 0xe08c38, 0xe08830, 0xe08830, 0xe09038, 0xd88c38, 0xd89040, 0xe89c50,
+ 0xf0f8e8, 0xf8fce8, 0xf8f8f8, 0xf8fcf8, 0xf8f8f8, 0xf8f4f8, 0xf8f8e8, 0xf8f8e8, 0xf8eca8, 0xd0b070, 0xc08c28, 0xc89030, 0xe08c30, 0xe08c30, 0xf08438, 0xf08840,
+ 0xd89038, 0xd08428, 0xd09c60, 0xf8cc90, 0xf8f0d8, 0xf8fce8, 0xf0fcf8, 0xf0fcf8, 0xe8f8f8, 0xf0fcf8, 0xf8f8e0, 0xf8ecd8, 0xf0c080, 0xd09c60, 0xd88c30, 0xe09438,
+ 0xf08c20, 0xe88818, 0xd88828, 0xe09038, 0xc89458, 0xb07c40, 0xb8a080, 0xf8ecd0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8f4d8, 0xd0bca0, 0xc08850, 0xc08850, 0xd88840, 0xd88c40, 0xd08c48, 0xd08c48, 0xc08c50, 0xc08c50,
+ 0xc09470, 0xe0b898, 0xf8f4e8, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f0d8, 0xe8d8c0, 0xd8a870, 0xb88850, 0xc08848, 0xc88c48, 0xc88c48, 0xc88c48,
+ 0xd09048, 0xd09040, 0xb88c50, 0xd0a468, 0xe8dcc0, 0xf8f8e0, 0xf0fcf0, 0xf8fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8f8e0, 0xf8e8d0, 0xe8b888, 0xc89868, 0xd08c50, 0xd08c50,
+ 0xd09450, 0xc88c48, 0xd08c48, 0xc88440, 0xc88c50, 0xb88048, 0xc8b090, 0xf8e4c8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8f8e8, 0xe8ccb0, 0xb09878,
+ 0xb88850, 0xb88c50, 0xc88848, 0xd08848, 0xd08840, 0xd08840, 0xc08850, 0xc08850, 0xb09478, 0xe8d0b0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf0, 0xf8f8f0,
+ 0xd8fcf8, 0xe0fcf8, 0xf0fcf8, 0xe8fcf0, 0xf8f8d8, 0xf0ecd0, 0xe8c898, 0xd0ac80, 0xc88850, 0xc08048, 0xc87838, 0xc87838, 0xd07430, 0xd07430, 0xd87430, 0xd87838,
+ 0xc8a880, 0xe8c8a0, 0xf8e0c0, 0xf8f4d0, 0xf8fce0, 0xe8d4b8, 0xc8b498, 0xe0c8a8, 0xe0c098, 0xe0c4a0, 0xe0c098, 0xe0c098, 0xe0c4a0, 0xd8c0a0, 0xd8c4a8, 0xe0d0b0,
+ 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcd8, 0xd8d8b0, 0xd0c488, 0xd8c890, 0xe8c088, 0xe0bc88, 0xf0bc98, 0xf8c098,
+ 0xe8c4a0, 0xe0bc98, 0xe0ccb0, 0xf8e8d0, 0xf8f8f0, 0xf8fcf0, 0xf0fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8e0c8, 0xe0c8b0, 0xe0bc98, 0xe8c098,
+ 0xf0bc90, 0xf0c090, 0xe0bc90, 0xe8c098, 0xe0c8a8, 0xd0bca0, 0xd0d4c0, 0xf8fce8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8e0, 0xf0dcc0, 0xf8c488, 0xf8c890, 0xf8c478, 0xf8c880, 0xf8c888, 0xf8c888, 0xf8cc90, 0xf8c890,
+ 0xf0c4a0, 0xf8d4b0, 0xf8f8e8, 0xf8f8e8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e0, 0xf8f4e0, 0xf8d8a0, 0xf8c890, 0xf8c888, 0xf8c888, 0xf8c480, 0xf8c480,
+ 0xf8c478, 0xf8c070, 0xe8c088, 0xf8cc90, 0xf8ecd0, 0xf8fce0, 0xf8fcf8, 0xf0fcf0, 0xf0fcf8, 0xf0f8f8, 0xf8fce8, 0xf8f4e0, 0xf8d4a8, 0xf0c498, 0xf8bc80, 0xf8c080,
+ 0xf8c888, 0xf8c480, 0xf8cc88, 0xf8c480, 0xf8d098, 0xf8c088, 0xf8dcb8, 0xf8f8d8, 0xf8fcf8, 0xf0f8f0, 0xe8fcf8, 0xe8f8f8, 0xf8f4e8, 0xf8f8f0, 0xf8e8c8, 0xe8c8a8,
+ 0xf8c890, 0xf8c890, 0xf8c480, 0xf8c880, 0xf8c478, 0xf8c478, 0xf8c888, 0xf8c488, 0xe0c8a8, 0xf8e8c8, 0xf8fcf8, 0xf8f8f0, 0xf0f8f0, 0xf8fcf8, 0xf8f8f0, 0xf8fcf0,
+ 0xe0fcf8, 0xe0fcf8, 0xe8fcf0, 0xe0f8e8, 0xf8f8d8, 0xf8fce0, 0xf8f0c0, 0xf8d8a8, 0xf8c890, 0xf8c088, 0xf8b878, 0xf8b878, 0xf8b470, 0xf8b070, 0xf8b470, 0xf8b878,
+ 0xf8d8b0, 0xf8f4c8, 0xf8f8d8, 0xf8f0d0, 0xf8f8e0, 0xf8f8e0, 0xf8f0d0, 0xf8f4d0, 0xf8f4d0, 0xf8f8d0, 0xf8f8c8, 0xf8f4c8, 0xf8f8d8, 0xf8f8d0, 0xf8f4d8, 0xf8f8e0,
+ 0xf0fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fce0, 0xf8f8d8, 0xf8fcc0, 0xf8fcc8, 0xf8f4c0, 0xf8f0c0, 0xf8f0c8, 0xf8f0c8,
+ 0xf8f0c8, 0xf8e8c0, 0xf8f4d8, 0xf8f8e0, 0xf8fcf0, 0xf8f8f0, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8, 0xe8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f4d8, 0xf8f4d8, 0xf8ecc8, 0xf8ecc8,
+ 0xf8f0c0, 0xf8f4c8, 0xf8f4c8, 0xf8f0c8, 0xf8f8d8, 0xf8f8d8, 0xf8fce8, 0xf8fce8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8f4e8, 0xf8f0d8, 0xf8f0d8, 0xf8f0c8, 0xf8f0c8, 0xf8f4d8, 0xf8f4d8, 0xf8f8e0, 0xf8f8e0,
+ 0xf8f8e8, 0xf8f4e0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8f4d8, 0xf8f0d0, 0xf8f8d8, 0xf8f8d8, 0xf8f4d8, 0xf8f4e0,
+ 0xf8f4d0, 0xf8f4d0, 0xf8f4d8, 0xf8f4d8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8, 0xf0f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f4e0, 0xf8f0e0, 0xf8f4d8, 0xf8f4d8,
+ 0xf8f0d8, 0xf8f4d8, 0xf8f4d8, 0xf8ecd0, 0xf8f8d8, 0xf8ecd0, 0xf8f4e0, 0xf8f8e0, 0xf8fcf8, 0xf0f4f0, 0xf0f8f8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8f4e0, 0xf8f8e8, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f4d8, 0xf8f4d8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f4f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf0f0f0, 0xf8f4e8, 0xf8fcf0, 0xf8fce8, 0xf8f8e0, 0xf8f4d8, 0xf8f8d8, 0xf8f8d0, 0xf8f4c8, 0xf8f8c8, 0xf8fcc8,
+ 0xf8fcf0, 0xf8f8f0, 0xf0f4f0, 0xf0f8f8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8f8f0,
+ 0xf8f8f8, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf8fcf0, 0xf8fcf0, 0xf8f4d8, 0xf8f4d8, 0xf8f4d0, 0xf8f4d0, 0xf8f8d8, 0xf8f8d8, 0xf8f8e0, 0xf8fce0,
+ 0xf8f8e8, 0xf8f0e0, 0xf8fcf8, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8f8e8, 0xf8fcf0, 0xf8fce0, 0xf8f4d8, 0xf8f8d8, 0xf8f8d8, 0xf8f4e0, 0xf8fce0,
+ 0xf8f0d0, 0xf8f4d0, 0xf8f8e0, 0xf8f8e0, 0xf8fcf0, 0xf8f8e8, 0xf0f8f8, 0xf0f8f8, 0xf8fcf8, 0xf0f8f8, 0xf8f8f0, 0xf8fcf0, 0xf8f4e0, 0xf8f4e0, 0xf8f4d8, 0xf8f0d8,
+ 0xf8fce0, 0xf8fce0, 0xf8f8d8, 0xf8ecd0, 0xf8f8d8, 0xf8f4d8, 0xf8f8e0, 0xf8f0e0, 0xf8fcf8, 0xf0f8f0, 0xf0fcf8, 0xf8fcf8, 0xf8f8f8, 0xf0f4f0, 0xf8f8f0, 0xf8fcf0,
+ 0xf8fce8, 0xf8fce8, 0xf8f8e0, 0xf8f8e0, 0xf8f4d8, 0xf8f4d8, 0xf8f8d8, 0xf8f8d8, 0xf8fcf0, 0xf8f8e8, 0xf0f4f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8fcf8,
+ 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f4e8, 0xf8f8f0, 0xf8f8e0, 0xf8f8e0, 0xf8f8d8, 0xf8fce0, 0xf8fcd0, 0xf8fcd0, 0xf8f8c8, 0xf8fcd0,
+ 0xf8f4e8, 0xf8f4e8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf8, 0xf8fcf0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fce8, 0xf8fce0, 0xf8f4e0, 0xf8fce8, 0xf8fcf0, 0xf8f8e8,
+ 0xf8f8f8, 0xf8f4f0, 0xf8fcf8, 0xf8fcf8, 0xf0f8f0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf0fcf0, 0xf8f8f0, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f4f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0f8f8, 0xf0f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0,
+ 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f8, 0xf0f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f4f0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xe8fcf8, 0xe8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8,
+ 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xf0fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f8e8, 0xf8f8e8, 0xf8f8f0, 0xf8f8f0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xf0fce8, 0xf0fce8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf0fcf0, 0xf0fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8f8e8, 0xf8f8e8,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8f8f0, 0xf8f8f0,
+ 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0,
+ 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8,
+ 0xf8f8e8, 0xf8f8e8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fce8, 0xf8fce8, 0xf8fcf0, 0xf8fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f0, 0xf8f8f0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf0fcf0, 0xf0fcf0, 0xf0fcf8, 0xf0fcf8,
+ 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8,
+ 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf0fcf8, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8fcf0, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf0, 0xf0fcf0,
+ 0xf0fce8, 0xf0fce8, 0xf0fcf0, 0xf0fcf0, 0xf8fcf8, 0xf8fcf8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf8, 0xf8fcf0, 0xf8fcf0,
+ 0xf0fce8, 0xf0fce8, 0xf8fcf0, 0xf8fcf0, 0xf8f8f0, 0xf8f8f0, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8f8f8, 0xf8fcf8, 0xf8fcf8, 0xf0fcf8, 0xf0fcf8, 0xe8fcf8, 0xe8fcf8,
+};
+
+
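+/* Blends a 2D logo region into the destination plane one row at a time.
+ * When 'shift' is non-zero, each destination pixel is offset by 'shift' and
+ * saturated with CLIP_U8 before being multiplied with the logo sample and
+ * scaled down by 256; with shift >= 256 the clipped value saturates to 255,
+ * so the output is effectively the logo itself (a complete fill). With
+ * shift == 0 the logo is multiplied with the existing picture content,
+ * giving a blend. Despite the name, this blends rather than copies; the
+ * plain memcpy path in the #else branch below is compiled out.
+ */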
+static __inline void memcpy_2d(UWORD8 *pu1_dest,
+ UWORD32 dest_stride,
+ const UWORD8 *pu1_src,
+ UWORD32 u4_x_pos,
+ UWORD32 u4_y_pos,
+ UWORD32 u4_logo_wd,
+ UWORD32 u4_logo_ht,
+ UWORD32 u4_logo_strd,
+ WORD32 shift)
+{
+ UWORD32 i;
+ UWORD32 j;
+
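+    /* Move to the top-left corner of the logo region in the destination */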
+ pu1_dest = pu1_dest + u4_x_pos + (u4_y_pos * dest_stride);
+
+ for(i = 0; i < u4_logo_ht; i++)
+ {
+#if 1 /* !OLD_LOGO */
+ if(shift)
+ {
+ WORD32 val;
+ for(j = 0; j < u4_logo_wd; j++)
+ {
+ val = CLIP_U8(pu1_dest[j] + shift);
+ pu1_dest[j] = (pu1_src[j] * val) >> 8;
+ }
+ }
+ else
+ {
+ for(j = 0; j < u4_logo_wd; j++)
+ {
+ pu1_dest[j] = (pu1_src[j] * pu1_dest[j]) >> 8;
+ }
+ }
+
+#else
+ memcpy(pu1_dest, pu1_src, u4_logo_wd);
+#endif
+ pu1_src += u4_logo_strd;
+ pu1_dest += dest_stride;
+ }
+}
+
+void ihevcd_insert_logo(UWORD8 *pu1_buf_y,
+ UWORD8 *pu1_buf_u,
+ UWORD8 *pu1_buf_v,
+ UWORD32 u4_stride,
+ UWORD32 u4_x_pos,
+ UWORD32 u4_y_pos,
+ UWORD32 u4_yuv_fmt,
+ UWORD32 u4_disp_wd,
+ UWORD32 u4_disp_ht)
+{
+
+ UWORD32 u4_logo_wd_y, u4_logo_wd_uv, u4_logo_ht_y, u4_logo_ht_uv;
+ UWORD32 u4_logo_strd_y, u4_logo_strd_uv;
+    UWORD32 u4_stride_y, u4_stride_uv;
+ const UWORD8 *pu1_buf_logo_y, *pu1_buf_logo_u, *pu1_buf_logo_v;
+ UWORD32 u4_x_pos_y, u4_x_pos_uv, u4_y_pos_y, u4_y_pos_uv;
+ WORD32 num_comp = 0;
+ WORD32 shift_y, shift_uv;
+ if((WORD32)u4_x_pos < 0)
+ u4_x_pos = 0;
+
+ if((WORD32)u4_y_pos < 0)
+ u4_y_pos = 0;
+ /*Use the following to blend the logo*/
+ //shift_y = 0;
+ //shift_uv = 128;
+
+ /* These values will do complete fill */
+ shift_y = 256;
+ shift_uv = 256;
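+    /* Per-format setup: pick the logo buffers, their dimensions and strides,
+     * and derive the chroma position and stride from the chroma subsampling
+     * of the output format (RGB formats use only the first component). */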
+ switch(u4_yuv_fmt)
+ {
+ case IV_YUV_444P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_WD_Y;
+ u4_logo_wd_uv = LOGO_WD_444_UV;
+ u4_logo_ht_y = LOGO_HT_Y;
+ u4_logo_ht_uv = LOGO_HT_444_UV;
+ u4_logo_strd_y = LOGO_WD_Y;
+ u4_logo_strd_uv = LOGO_WD_444_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos;
+
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride;
+
+ break;
+ case IV_YUV_420P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_WD_Y;
+ u4_logo_wd_uv = LOGO_WD_420_UV;
+ u4_logo_ht_y = LOGO_HT_Y;
+ u4_logo_ht_uv = LOGO_HT_420_UV;
+ u4_logo_strd_y = LOGO_WD_Y;
+ u4_logo_strd_uv = LOGO_WD_420_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos >> 1;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos >> 1;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_420p_u;
+ pu1_buf_logo_v = gau1_ihevcd_logo_420p_v;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride >> 1;
+ break;
+
+ case IV_YUV_422P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_WD_Y;
+ u4_logo_wd_uv = LOGO_WD_422_UV;
+ u4_logo_ht_y = LOGO_HT_Y;
+ u4_logo_ht_uv = LOGO_HT_422_UV;
+ u4_logo_strd_y = LOGO_WD_Y;
+ u4_logo_strd_uv = LOGO_WD_422_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos >> 1;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride >> 1;
+
+
+ break;
+
+ case IV_YUV_411P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_WD_Y;
+ u4_logo_wd_uv = LOGO_WD_411_UV;
+ u4_logo_ht_y = LOGO_HT_Y;
+ u4_logo_ht_uv = LOGO_HT_411_UV;
+ u4_logo_strd_y = LOGO_WD_Y;
+ u4_logo_strd_uv = LOGO_WD_411_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos >> 2;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride >> 2;
+
+ break;
+ case IV_RGB_565:
+ num_comp = 1;
+ u4_logo_wd_y = LOGO_WD_RGB565 * 2;
+ u4_logo_wd_uv = 0;
+ u4_logo_ht_y = LOGO_HT_RGB565;
+ u4_logo_ht_uv = 0;
+ u4_logo_strd_y = LOGO_WD_RGB565 * 2;
+ u4_logo_strd_uv = 0;
+
+ u4_x_pos_y = u4_x_pos * 2;
+ u4_x_pos_uv = 0;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = 0;
+
+ pu1_buf_logo_y = (UWORD8 *)gau2_ihevcd_logo_rgb565;
+ pu1_buf_logo_u = NULL;
+ pu1_buf_logo_v = NULL;
+
+ u4_stride_y = u4_stride * 2;
+            u4_stride_uv = 0;
+ shift_y = 256;
+ shift_uv = 256;
+
+ break;
+ case IV_RGBA_8888:
+ num_comp = 1;
+ u4_logo_wd_y = LOGO_WD_RGBA8888 * 4;
+ u4_logo_wd_uv = 0;
+ u4_logo_ht_y = LOGO_HT_RGBA8888;
+ u4_logo_ht_uv = 0;
+ u4_logo_strd_y = LOGO_WD_RGBA8888 * 4;
+ u4_logo_strd_uv = 0;
+
+ u4_x_pos_y = u4_x_pos * 4;
+ u4_x_pos_uv = 0;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = 0;
+
+ pu1_buf_logo_y = (UWORD8 *)gau4_ihevcd_logo_rgb8888;
+ pu1_buf_logo_u = NULL;
+ pu1_buf_logo_v = NULL;
+
+ u4_stride_y = u4_stride * 4;
+            u4_stride_uv = 0;
+ shift_y = 256;
+ shift_uv = 256;
+
+ break;
+ case IV_YUV_420SP_UV:
+
+ num_comp = 2;
+ u4_logo_wd_y = LOGO_WD_Y;
+ u4_logo_wd_uv = LOGO_WD_420SP_UV;
+ u4_logo_ht_y = LOGO_HT_Y;
+ u4_logo_ht_uv = LOGO_HT_420SP_UV;
+ u4_logo_strd_y = LOGO_WD_Y;
+ u4_logo_strd_uv = LOGO_WD_420SP_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos >> 1;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_420sp_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_420sp_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride;
+ break;
+
+ case IV_YUV_420SP_VU:
+ default:
+ num_comp = 2;
+ u4_logo_wd_y = LOGO_WD_Y;
+ u4_logo_wd_uv = LOGO_WD_420SP_VU;
+ u4_logo_ht_y = LOGO_HT_Y;
+ u4_logo_ht_uv = LOGO_HT_420SP_VU;
+ u4_logo_strd_y = LOGO_WD_Y;
+ u4_logo_strd_uv = LOGO_WD_420SP_VU;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos >> 1;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_420sp_vu;
+ pu1_buf_logo_v = gau1_ihevcd_logo_420sp_vu;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride;
+ break;
+
+
+ }
+ //num_comp = 2;
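+    /* Clip the logo dimensions to the display bounds so that memcpy_2d
+     * never writes outside the frame. */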
+ u4_logo_wd_y = MIN(u4_logo_wd_y, u4_disp_wd - u4_x_pos_y);
+ u4_logo_ht_y = MIN(u4_logo_ht_y, u4_disp_ht - u4_y_pos_y);
+ u4_logo_wd_uv = MIN(u4_logo_wd_uv, (u4_disp_wd >> 1) - u4_x_pos_uv);
+    u4_logo_ht_uv = MIN(u4_logo_ht_uv, (u4_disp_ht >> 1) - u4_y_pos_uv);
+ memcpy_2d(pu1_buf_y, u4_stride_y, pu1_buf_logo_y, u4_x_pos_y, u4_y_pos_y, u4_logo_wd_y, u4_logo_ht_y, u4_logo_strd_y, shift_y);
+ if(num_comp > 1)
+        memcpy_2d(pu1_buf_u, u4_stride_uv, pu1_buf_logo_u, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+ if(num_comp > 2)
+        memcpy_2d(pu1_buf_v, u4_stride_uv, pu1_buf_logo_v, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+
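+    /* Optional second pass, compiled only when CODEC_LOGO is defined:
+     * stamps a codec logo immediately below the first one (u4_y_pos is
+     * advanced by the height just drawn). */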
+#if CODEC_LOGO
+ u4_y_pos = u4_y_pos + u4_logo_ht_y;
+ /*Use the following to blend the logo*/
+ //shift_y = 0;
+ //shift_uv = 128;
+ shift_y = 256;
+    shift_uv = 256;
+
+
+ switch(u4_yuv_fmt)
+ {
+ case IV_YUV_444P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y;
+ u4_logo_wd_uv = LOGO_CODEC_WD_444_UV;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = LOGO_CODEC_HT_444_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos;
+
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride;
+
+ break;
+ case IV_YUV_420P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y;
+ u4_logo_wd_uv = LOGO_CODEC_WD_420_UV;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = LOGO_CODEC_HT_420_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos >> 1;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos >> 1;
+
+ pu1_buf_logo_y = gau1_ihevcd_codec_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_codec_logo_420p_u;
+ pu1_buf_logo_v = gau1_ihevcd_codec_logo_420p_v;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride >> 1;
+ break;
+
+ case IV_YUV_422P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y;
+ u4_logo_wd_uv = LOGO_CODEC_WD_422_UV;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = LOGO_CODEC_HT_422_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos >> 1;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride >> 1;
+
+
+ break;
+
+ case IV_YUV_411P:
+ num_comp = 3;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y;
+ u4_logo_wd_uv = LOGO_CODEC_WD_411_UV;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = LOGO_CODEC_HT_411_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos >> 2;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos;
+
+ pu1_buf_logo_y = gau1_ihevcd_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_logo_uv;
+ pu1_buf_logo_v = gau1_ihevcd_logo_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride >> 2;
+
+ break;
+ case IV_RGB_565:
+ num_comp = 1;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y * 2;
+ u4_logo_wd_uv = 0;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = 0;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = 0;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = 0;
+
+ pu1_buf_logo_y = (UWORD8 *)gau2_ihevcd_logo_rgb565;
+ pu1_buf_logo_u = NULL;
+ pu1_buf_logo_v = NULL;
+
+ u4_stride_y = u4_stride * 2;
+            u4_stride_uv = 0;
+ shift_y = 256;
+ shift_uv = 256;
+
+ break;
+ case IV_RGBA_8888:
+ num_comp = 1;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y * 4;
+ u4_logo_wd_uv = 0;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = 0;
+
+ u4_x_pos_y = (u4_x_pos + LOGO_CODEC_WD_Y) * 4;
+ u4_x_pos_uv = 0;
+ u4_y_pos_y = (u4_y_pos - LOGO_CODEC_HT_Y);
+ u4_y_pos_uv = 0;
+
+ pu1_buf_logo_y = (UWORD8 *)gau4_ihevcd_logo_rgb8888;
+ pu1_buf_logo_u = NULL;
+ pu1_buf_logo_v = NULL;
+
+            u4_stride_y = u4_stride * 4;
+            u4_stride_uv = 0;
+ shift_y = 256;
+ shift_uv = 256;
+
+ break;
+ case IV_YUV_420SP_UV:
+
+ num_comp = 2;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y;
+ u4_logo_wd_uv = LOGO_CODEC_WD_420SP_UV;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = LOGO_CODEC_HT_420SP_UV;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos >> 1;
+
+ pu1_buf_logo_y = gau1_ihevcd_codec_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_codec_logo_420sp_uv;
+ pu1_buf_logo_v = gau1_ihevcd_codec_logo_420sp_uv;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride;
+ break;
+
+ case IV_YUV_420SP_VU:
+
+ num_comp = 2;
+ u4_logo_wd_y = LOGO_CODEC_WD_Y;
+ u4_logo_wd_uv = LOGO_CODEC_WD_420SP_VU;
+ u4_logo_ht_y = LOGO_CODEC_HT_Y;
+ u4_logo_ht_uv = LOGO_CODEC_HT_420SP_VU;
+
+ u4_x_pos_y = u4_x_pos;
+ u4_x_pos_uv = u4_x_pos;
+ u4_y_pos_y = u4_y_pos;
+ u4_y_pos_uv = u4_y_pos >> 1;
+
+ pu1_buf_logo_y = gau1_ihevcd_codec_logo_y;
+ pu1_buf_logo_u = gau1_ihevcd_codec_logo_420sp_vu;
+ pu1_buf_logo_v = gau1_ihevcd_codec_logo_420sp_vu;
+
+ u4_stride_y = u4_stride;
+            u4_stride_uv = u4_stride;
+ break;
+ default:
+ break;
+ }
+
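+    /* Note: the codec-logo cases above do not reset u4_logo_strd_y and
+     * u4_logo_strd_uv; the strides set for the first logo are reused here. */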
+    memcpy_2d(pu1_buf_y, u4_stride_y, pu1_buf_logo_y, u4_x_pos_y, u4_y_pos_y, u4_logo_wd_y, u4_logo_ht_y, u4_logo_strd_y, shift_y);
+    if(num_comp > 1)
+        memcpy_2d(pu1_buf_u, u4_stride_uv, pu1_buf_logo_u, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+    if(num_comp > 2)
+        memcpy_2d(pu1_buf_v, u4_stride_uv, pu1_buf_logo_v, u4_x_pos_uv, u4_y_pos_uv, u4_logo_wd_uv, u4_logo_ht_uv, u4_logo_strd_uv, shift_uv);
+#endif
+
+}
+#endif
diff --git a/decoder/ihevcd_ittiam_logo.h b/decoder/ihevcd_ittiam_logo.h
new file mode 100644
index 0000000..71540e3
--- /dev/null
+++ b/decoder/ihevcd_ittiam_logo.h
@@ -0,0 +1,128 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/* */
+/* File Name         : ihevcd_ittiam_logo.h                                 */
+/* */
+/* Description       : This file contains all the necessary function        */
+/*                     headers to insert the Ittiam logo into a YUV buffer. */
+/* */
+/* List of Functions : None */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 10 10 2005 Ittiam Draft */
+/* */
+/*****************************************************************************/
+
+#ifndef LOGO_INSERT_H
+#define LOGO_INSERT_H
+
+//#define LOGO_EN
+
+#define LOGO_WD 90
+#define LOGO_HT 36
+
+#define LOGO_WD_Y LOGO_WD
+#define LOGO_HT_Y LOGO_HT
+
+#define LOGO_WD_RGBA8888 160
+#define LOGO_HT_RGBA8888 64
+
+#define LOGO_WD_RGB565 160
+#define LOGO_HT_RGB565 64
+
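+/* Chroma logo dimensions below follow the chroma subsampling of each format */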
+#define LOGO_WD_444_UV LOGO_WD
+#define LOGO_HT_444_UV LOGO_HT
+
+
+#define LOGO_WD_420_UV (LOGO_WD >> 1)
+#define LOGO_HT_420_UV (LOGO_HT >> 1)
+
+#define LOGO_WD_420SP_UV (LOGO_WD)
+#define LOGO_HT_420SP_UV (LOGO_HT >> 1)
+
+#define LOGO_WD_420SP_VU (LOGO_WD)
+#define LOGO_HT_420SP_VU (LOGO_HT >> 1)
+
+#define LOGO_WD_422_UV (LOGO_WD >> 1)
+#define LOGO_HT_422_UV (LOGO_HT)
+
+#define LOGO_WD_422V_UV (LOGO_WD)
+#define LOGO_HT_422V_UV (LOGO_HT >> 1)
+
+#define LOGO_WD_411_UV (LOGO_WD >> 2)
+#define LOGO_HT_411_UV (LOGO_HT)
+
+#define LOGO_CODEC_WD 80
+#define LOGO_CODEC_HT 24
+
+#define LOGO_CODEC_WD_Y LOGO_CODEC_WD
+#define LOGO_CODEC_HT_Y LOGO_CODEC_HT
+
+
+#define LOGO_CODEC_WD_444_UV LOGO_CODEC_WD
+#define LOGO_CODEC_HT_444_UV LOGO_CODEC_HT
+
+
+#define LOGO_CODEC_WD_420_UV (LOGO_CODEC_WD >> 1)
+#define LOGO_CODEC_HT_420_UV (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_420SP_UV (LOGO_CODEC_WD)
+#define LOGO_CODEC_HT_420SP_UV (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_420SP_VU (LOGO_CODEC_WD)
+#define LOGO_CODEC_HT_420SP_VU (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_422_UV (LOGO_CODEC_WD >> 1)
+#define LOGO_CODEC_HT_422_UV (LOGO_CODEC_HT)
+
+#define LOGO_CODEC_WD_422V_UV (LOGO_CODEC_WD)
+#define LOGO_CODEC_HT_422V_UV (LOGO_CODEC_HT >> 1)
+
+#define LOGO_CODEC_WD_411_UV (LOGO_CODEC_WD >> 2)
+#define LOGO_CODEC_HT_411_UV (LOGO_CODEC_HT)
+
+
+
+
+#define START_X_ITT_LOGO 0
+#define START_Y_ITT_LOGO 0
+
+#define WD_ITT_LOGO 128
+#define HT_ITT_LOGO 60
+
+void ihevcd_insert_logo(UWORD8 *buf_y, UWORD8 *buf_u, UWORD8 *buf_v,
+ UWORD32 stride,
+ UWORD32 x_pos,
+ UWORD32 y_pos,
+ UWORD32 yuv_fmt,
+ UWORD32 u4_disp_wd,
+ UWORD32 u4_disp_ht);
+
+#ifdef LOGO_EN
+#define INSERT_LOGO(buf_y, buf_u, buf_v, stride, x_pos, y_pos, yuv_fmt,disp_wd,disp_ht) ihevcd_insert_logo(buf_y, buf_u, buf_v, stride, x_pos, y_pos, yuv_fmt,disp_wd,disp_ht);
+#else
+#define INSERT_LOGO(buf_y, buf_u, buf_v, stride, x_pos, y_pos, yuv_fmt,disp_wd,disp_ht)
+#endif
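+
+/* Usage sketch (illustrative only; the buffer names are hypothetical):
+ *
+ *   INSERT_LOGO(pu1_disp_y, pu1_disp_u, pu1_disp_v,
+ *               disp_strd, 0, 0, IV_YUV_420SP_UV, disp_wd, disp_ht)
+ *
+ * No trailing semicolon is needed: the enabled variant of the macro already
+ * supplies one, and the disabled variant expands to nothing.
+ */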
+
+#endif /* LOGO_INSERT_H */
+
diff --git a/decoder/ihevcd_job_queue.c b/decoder/ihevcd_job_queue.c
new file mode 100644
index 0000000..e926f94
--- /dev/null
+++ b/decoder/ihevcd_job_queue.c
@@ -0,0 +1,593 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_job_queue.c
+*
+* @brief
+* Contains functions for job queue
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_macros.h"
+#include "ihevcd_error.h"
+#include "ihevcd_job_queue.h"
+
+/**
+*******************************************************************************
+*
+* @brief Returns size for job queue context. Does not include job queue buffer
+* requirements
+*
+* @par Description
+* Returns size for job queue context. Does not include job queue buffer
+* requirements. Buffer size required to store the jobs should be allocated in
+* addition to the value returned here.
+*
+* @returns Size of the job queue context
+*
+* @remarks
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_jobq_ctxt_size()
+{
+ WORD32 size;
+ size = sizeof(jobq_t);
+ size += ithread_get_mutex_lock_size();
+ return size;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Locks the jobq context
+*
+* @par Description
+* Locks the jobq context by calling ithread_mutex_lock()
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex lock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_lock(jobq_t *ps_jobq)
+{
+ WORD32 retval;
+ retval = ithread_mutex_lock(ps_jobq->pv_mutex);
+ if(retval)
+ {
+ return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ }
+ return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Unlocks the jobq context
+*
+* @par Description
+* Unlocks the jobq context by calling ithread_mutex_unlock()
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_jobq_unlock(jobq_t *ps_jobq)
+{
+ WORD32 retval;
+ retval = ithread_mutex_unlock(ps_jobq->pv_mutex);
+ if(retval)
+ {
+ return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ }
+ return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Yields the thread
+*
+* @par Description
+* Unlocks the jobq context, calls ithread_yield() and then locks the jobq
+* again via ihevcd_jobq_lock(). The jobq is unlocked before yielding so that
+* other threads can access the jobq functions and update the jobq; if it were
+* not unlocked first, no other thread could make progress on the jobq.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex lock, unlock or yield fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_yield(jobq_t *ps_jobq)
+{
+
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+ IHEVCD_ERROR_T rettmp;
+ rettmp = ihevcd_jobq_unlock(ps_jobq);
+ RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+#ifdef GPU_CIRCULAR_QUEUE
+ usleep(1000);
+#else
+ //NOP(1024 * 8);
+ ithread_yield();
+#endif
+
+ rettmp = ihevcd_jobq_lock(ps_jobq);
+ RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+ return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Frees the job queue context
+*
+* @par Description
+* Frees the jobq context by destroying the mutex associated with it
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if mutex destroy fails else IHEVCD_SUCCESS
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_free(jobq_t *ps_jobq)
+{
+ WORD32 ret;
+ ret = ithread_mutex_destroy(ps_jobq->pv_mutex);
+
+ if(0 == ret)
+ return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ else
+ return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+}
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the job queue
+*
+* @par Description
+* Initializes the jobq context and sets write and read pointers to start of
+* job queue buffer
+*
+* @param[in] pv_buf
+* Memory for job queue buffer and job queue context
+*
+* @param[in] buf_size
+* Size of the total memory allocated
+*
+* @returns Pointer to job queue context
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+void* ihevcd_jobq_init(void *pv_buf, WORD32 buf_size)
+{
+ jobq_t *ps_jobq;
+ UWORD8 *pu1_buf;
+ pu1_buf = (UWORD8 *)pv_buf;
+
+ ps_jobq = (jobq_t *)pu1_buf;
+ pu1_buf += sizeof(jobq_t);
+ buf_size -= sizeof(jobq_t);
+
+ ps_jobq->pv_mutex = pu1_buf;
+ pu1_buf += ithread_get_mutex_lock_size();
+ buf_size -= ithread_get_mutex_lock_size();
+
+ if(buf_size <= 0)
+ return NULL;
+
+ ithread_mutex_init(ps_jobq->pv_mutex);
+
+ ps_jobq->pv_buf_base = pu1_buf;
+ ps_jobq->pv_buf_wr = pu1_buf;
+ ps_jobq->pv_buf_rd = pu1_buf;
+ ps_jobq->pv_buf_end = pu1_buf + buf_size;
+ ps_jobq->i4_terminate = 0;
+#ifdef GPU_CIRCULAR_QUEUE
+ ps_jobq->i4_wrapped_around = 0;
+#endif
+
+
+ return ps_jobq;
+}
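+
+/* Illustrative usage sketch (not part of the decoder): the caller makes one
+ * allocation that holds the jobq context, the mutex and the job buffer.
+ * my_job_t and MAX_JOBS below are hypothetical placeholders:
+ *
+ *     WORD32 size = ihevcd_jobq_ctxt_size() + MAX_JOBS * sizeof(my_job_t);
+ *     void *pv_buf = malloc(size);
+ *     jobq_t *ps_jobq = (jobq_t *)ihevcd_jobq_init(pv_buf, size);
+ *     (a NULL return means the buffer cannot even hold context + mutex)
+ */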
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the jobq context
+*
+* @par Description
+* Resets the jobq context by initializing the job queue context elements
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if lock unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_reset(jobq_t *ps_jobq)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ ret = ihevcd_jobq_lock(ps_jobq);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ ps_jobq->pv_buf_wr = ps_jobq->pv_buf_base;
+ ps_jobq->pv_buf_rd = ps_jobq->pv_buf_base;
+ ps_jobq->i4_terminate = 0;
+#ifdef GPU_CIRCULAR_QUEUE
+ ps_jobq->i4_wrapped_around = 0;
+#endif
+ ret = ihevcd_jobq_unlock(ps_jobq);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Deinitializes the jobq context
+*
+* @par Description
+* Deinitializes the jobq context by calling ihevcd_jobq_reset()
+* and then destroying the mutex created
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if lock unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_deinit(jobq_t *ps_jobq)
+{
+ WORD32 retval;
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+ ret = ihevcd_jobq_reset(ps_jobq);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ retval = ithread_mutex_destroy(ps_jobq->pv_mutex);
+ if(retval)
+ {
+ return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ }
+
+ return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Terminates the jobq
+*
+* @par Description
+* Terminates the jobq by setting a flag in context.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IHEVCD_FAIL if lock unlock fails else IHEVCD_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_jobq_terminate(jobq_t *ps_jobq)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ ret = ihevcd_jobq_lock(ps_jobq);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ ps_jobq->i4_terminate = 1;
+
+ ret = ihevcd_jobq_unlock(ps_jobq);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+ return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Adds a job to the queue
+*
+* @par Description
+* Adds a job to the queue and updates the wr address to the next location.
+* Format/content of the job structure is abstracted and hence the size of the
+* job buffer is passed.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @param[in] pv_job
+* Pointer to the location that contains details of the job to be added
+*
+* @param[in] job_size
+* Size of the job buffer
+*
+* @param[in] blocking
+* To signal if the write is blocking or non-blocking.
+*
+* @returns IHEVCD_FAIL if there is no space left in the job queue else IHEVCD_SUCCESS
+*
+* @remarks
+* Job Queue buffer is assumed to be allocated to handle worst case number of jobs
+* Wrap around is not supported
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ IHEVCD_ERROR_T rettmp;
+ UWORD8 *pu1_buf;
+ UNUSED(blocking);
+
+ rettmp = ihevcd_jobq_lock(ps_jobq);
+ RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+ pu1_buf = (UWORD8 *)ps_jobq->pv_buf_wr;
+#ifdef GPU_CIRCULAR_QUEUE
+ if((UWORD8 *)ps_jobq->pv_buf_end > (pu1_buf + job_size))
+ {
+ memcpy(ps_jobq->pv_buf_wr, pv_job, job_size);
+ ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + job_size;
+
+ }
+ else
+ {
+ /* Handle wrap around case */
+ /* Wait for pv_buf_rd to consume first job_size number of bytes
+ * from the beginning of job queue
+ */
+ //ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ ps_jobq->pv_buf_wr = ps_jobq->pv_buf_base;
+ memcpy(ps_jobq->pv_buf_wr, pv_job, job_size);
+ ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + job_size;
+ //printf("Queue wrapped around\n");
+ ps_jobq->i4_wrapped_around = 1;
+ }
+#else
+ if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + job_size))
+ {
+ memcpy(ps_jobq->pv_buf_wr, pv_job, job_size);
+ ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + job_size;
+ ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ }
+ else
+ {
+ /* Handle wrap around case */
+ /* Wait for pv_buf_rd to consume first job_size number of bytes
+ * from the beginning of job queue
+ */
+ ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ }
+#endif
+
+ ps_jobq->i4_terminate = 0;
+
+ rettmp = ihevcd_jobq_unlock(ps_jobq);
+ RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+ return ret;
+}
+/**
+*******************************************************************************
+*
+* @brief Gets the next job from the job queue
+*
+* @par Description
+* Gets the next job from the job queue and updates the rd address to the next
+* location. Format/content of the job structure is abstracted and hence the
+* size of the job buffer is passed. If it is a blocking call and there is no
+* new job, then this function unlocks the mutex, yields, locks it back and
+* continues till a job is available or terminate is set.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @param[out] pv_job
+* Pointer to the location where the details of the dequeued job are written
+*
+* @param[in] job_size
+* Size of the job buffer
+*
+* @param[in] blocking
+* To signal if the read is blocking or non-blocking.
+*
+* @returns IHEVCD_FAIL if no job is available (non-blocking) or terminate is set, else IHEVCD_SUCCESS
+*
+* @remarks
+* Job Queue buffer is assumed to be allocated to handle worst case number of jobs
+* Wrap around is not supported
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking)
+{
+ IHEVCD_ERROR_T ret;
+ IHEVCD_ERROR_T rettmp;
+ volatile UWORD8 *pu1_buf;
+
+ rettmp = ihevcd_jobq_lock(ps_jobq);
+ RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+#ifdef GPU_CIRCULAR_QUEUE
+ if(((UWORD8 *)ps_jobq->pv_buf_end <= (ps_jobq->pv_buf_rd + job_size)) &&
+ (ps_jobq->i4_wrapped_around == 1))
+ {
+ ps_jobq->pv_buf_rd = ps_jobq->pv_buf_base;
+ ps_jobq->i4_wrapped_around = 0;
+ //printf("DeQueue wrapped around\n");
+ }
+
+ pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+
+ while(1)
+ {
+ pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+ if(((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + job_size)) ||
+ (ps_jobq->i4_wrapped_around == 1))
+ {
+ memcpy(pv_job, ps_jobq->pv_buf_rd, job_size);
+ ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + job_size;
+ ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ break;
+ }
+ else
+ {
+ /* If all the entries have been dequeued, then break and return */
+ if(1 == ps_jobq->i4_terminate)
+ {
+ ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ break;
+ }
+
+ if(1 == blocking)
+ {
+ ihevcd_jobq_yield(ps_jobq);
+ }
+ else
+ {
+ /* If there is no job available,
+ * and this is non blocking call then return fail */
+ ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ break;
+ }
+ }
+ }
+#else
+ pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+
+
+ if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + job_size))
+ {
+ while(1)
+ {
+ pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+ if((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + job_size))
+ {
+ memcpy(pv_job, ps_jobq->pv_buf_rd, job_size);
+ ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + job_size;
+ ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ break;
+ }
+ else
+ {
+ /* If all the entries have been dequeued, then break and return */
+ if(1 == ps_jobq->i4_terminate)
+ {
+ ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ break;
+ }
+
+ if(1 == blocking)
+ {
+ ihevcd_jobq_yield(ps_jobq);
+
+ }
+ else
+ {
+                    /* If there is no job available,
+                     * and this is a non blocking call then break and return fail */
+                    ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+                    break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* Handle wrap around case */
+ /* Wait for pv_buf_rd to consume first job_size number of bytes
+ * from the beginning of job queue
+ */
+ ret = (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ }
+#endif
+ rettmp = ihevcd_jobq_unlock(ps_jobq);
+ RETURN_IF((rettmp != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), rettmp);
+
+ return ret;
+}
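+
+/* Illustrative producer/consumer sketch (not part of the decoder; job_t and
+ * process() are hypothetical placeholders, error handling elided):
+ *
+ *     master thread:
+ *         job_t s_job = ...;
+ *         ihevcd_jobq_queue(ps_jobq, &s_job, sizeof(job_t), 1);
+ *         ihevcd_jobq_terminate(ps_jobq);  after the last job is queued
+ *
+ *     worker thread: a blocking dequeue returns IHEVCD_FAIL once the queue
+ *     is drained and terminate is set:
+ *         while(IHEVCD_SUCCESS == ihevcd_jobq_dequeue(ps_jobq, &s_job,
+ *                                                     sizeof(job_t), 1))
+ *             process(&s_job);
+ */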
diff --git a/decoder/ihevcd_job_queue.h b/decoder/ihevcd_job_queue.h
new file mode 100644
index 0000000..190ca83
--- /dev/null
+++ b/decoder/ihevcd_job_queue.h
@@ -0,0 +1,74 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_job_queue.h
+*
+* @brief
+* Contains functions for job queue
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_JOB_QUEUE_H_
+#define _IHEVCD_JOB_QUEUE_H_
+
+typedef struct
+{
+ /** Pointer to buffer base which contains the jobs */
+ void *pv_buf_base;
+
+ /** Pointer to current address where new job can be added */
+ void *pv_buf_wr;
+
+ /** Pointer to current address from where next job can be obtained */
+ void *pv_buf_rd;
+
+ /** Pointer to end of job buffer */
+ void *pv_buf_end;
+
+ /** Mutex used to keep the functions thread-safe */
+ void *pv_mutex;
+
+ /** Flag to indicate jobq has to be terminated */
+ WORD32 i4_terminate;
+#ifdef GPU_CIRCULAR_QUEUE
+ /** Flag to indicate jobq wrap around */
+ WORD32 i4_wrapped_around;
+#endif
+}jobq_t;
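+
+/* Invariant sketch for the default (non wrap-around) configuration:
+ * pv_buf_base <= pv_buf_rd <= pv_buf_wr <= pv_buf_end, with jobs stored back
+ * to back as opaque byte blobs of caller-defined size. */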
+
+WORD32 ihevcd_jobq_ctxt_size(void);
+void* ihevcd_jobq_init(void *pv_buf, WORD32 buf_size);
+IHEVCD_ERROR_T ihevcd_jobq_free(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_reset(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_deinit(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_terminate(jobq_t *ps_jobq);
+IHEVCD_ERROR_T ihevcd_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking);
+IHEVCD_ERROR_T ihevcd_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking);
+
+#endif /* _IHEVCD_JOB_QUEUE_H_ */
diff --git a/decoder/ihevcd_mv_merge.c b/decoder/ihevcd_mv_merge.c
new file mode 100644
index 0000000..4d5dfbd
--- /dev/null
+++ b/decoder/ihevcd_mv_merge.c
@@ -0,0 +1,938 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_mv_merge.c
+ *
+ * @brief
+ * Contains functions for motion vector merge candidates derivation
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_compare_pu_t()
+ * - ihevcd_collocated_mvp()
+ * - ihevcd_mv_merge()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_mv_merge.h"
+/**
+ *******************************************************************************
+ *
+ * @brief Compares the motion information of two PUs
+ *
+ *
+ * @par Description:
+ * Checks if the MVs and reference indices of the two PUs match exactly.
+ *
+ * @param[in] ps_pu_1
+ * PU 1 whose motion information is to be compared
+ *
+ * @param[in] ps_pu_2
+ * PU 2 whose motion information is to be compared
+ *
+ * @returns
+ * 0 : not matching 1 : matching
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+WORD32 ihevcd_compare_pu_t(pu_t *ps_pu_1, pu_t *ps_pu_2)
+{
+ WORD32 l0_match = 0, l1_match = 0;
+ pu_mv_t *ps_mv_1, *ps_mv_2;
+ WORD32 pred_mode_1, pred_mode_2;
+
+ ps_mv_1 = &ps_pu_1->mv;
+ ps_mv_2 = &ps_pu_2->mv;
+
+ pred_mode_1 = ps_pu_1->b2_pred_mode;
+ pred_mode_2 = ps_pu_2->b2_pred_mode;
+
+ if(pred_mode_1 == pred_mode_2)
+ {
+ if(pred_mode_1 != PRED_L1)
+ {
+ if(ps_mv_1->i1_l0_ref_idx == ps_mv_2->i1_l0_ref_idx)
+ {
+ if(0 == memcmp(&ps_mv_1->s_l0_mv, &ps_mv_2->s_l0_mv, sizeof(mv_t)))
+ {
+ l0_match = 1;
+ }
+ }
+ }
+ if(pred_mode_1 != PRED_L0)
+ {
+ if(ps_mv_1->i1_l1_ref_idx == ps_mv_2->i1_l1_ref_idx)
+ {
+ if(0 == memcmp(&ps_mv_1->s_l1_mv, &ps_mv_2->s_l1_mv, sizeof(mv_t)))
+ {
+ l1_match = 1;
+ }
+ }
+ }
+ if(pred_mode_1 == PRED_BI)
+ return (l1_match && l0_match);
+ else if(pred_mode_1 == PRED_L0)
+ return l0_match;
+ else
+ return l1_match;
+ }
+
+ return 0;
+}
+
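+/**
+ *******************************************************************************
+ *
+ * @brief Derives the collocated (temporal) MV predictor
+ *
+ * @par Description:
+ * Picks the collocated PU at (x_col, y_col) from the collocated picture's MV
+ * buffer, selects its L0/L1 MVs based on the collocated PU's prediction mode,
+ * the slice's low delay flag and collocated_from_l0_flag, and scales them
+ * when the POC distances of the current and collocated references differ.
+ * Availability of the L0/L1 collocated MVs is returned in pu4_avail_col_flag.
+ *
+ * @remarks
+ * If use_pu_ref_idx is 0, reference index 0 is assumed for both lists.
+ *
+ *******************************************************************************
+ */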
+void ihevcd_collocated_mvp(mv_ctxt_t *ps_mv_ctxt,
+ pu_t *ps_pu,
+ mv_t *ps_mv_col,
+ WORD32 *pu4_avail_col_flag,
+ WORD32 use_pu_ref_idx,
+ WORD32 x_col,
+ WORD32 y_col)
+{
+ sps_t *ps_sps = ps_mv_ctxt->ps_sps;
+ slice_header_t *ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+ ref_list_t *ps_ref_list[2];
+ mv_buf_t *ps_mv_buf_col;
+ WORD32 xp_col, yp_col;
+ WORD32 col_ctb_x, col_ctb_y;
+ mv_t as_mv_col[2];
+ WORD32 log2_ctb_size;
+ WORD32 ctb_size;
+ WORD32 avail_col;
+ WORD32 col_ctb_idx, pu_cnt;
+ WORD32 au4_list_col[2];
+ WORD32 num_minpu_in_ctb;
+ UWORD8 *pu1_pic_pu_map_ctb;
+ pu_t *ps_col_pu;
+ WORD32 part_pos_y;
+
+
+ part_pos_y = ps_pu->b4_pos_y << 2;
+
+ log2_ctb_size = ps_sps->i1_log2_ctb_size;
+ ctb_size = (1 << log2_ctb_size);
+
+ avail_col = 1;
+
+ /* Initializing reference list */
+ ps_ref_list[0] = ps_slice_hdr->as_ref_pic_list0;
+ ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list1;
+ if(PSLICE == ps_slice_hdr->i1_slice_type)
+ ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list0;
+
+ if((ps_slice_hdr->i1_slice_type == BSLICE) && (ps_slice_hdr->i1_collocated_from_l0_flag == 0))
+ {
+ /* L1 */
+ ps_mv_buf_col = (mv_buf_t *)ps_ref_list[1][ps_slice_hdr->i1_collocated_ref_idx].pv_mv_buf;
+
+ }
+ else
+ {
+ /* L0 */
+ ps_mv_buf_col = (mv_buf_t *)ps_ref_list[0][ps_slice_hdr->i1_collocated_ref_idx].pv_mv_buf;
+
+ }
+ num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+
+ if(((part_pos_y >> log2_ctb_size) == (y_col >> log2_ctb_size))
+ && ((x_col + (ps_mv_ctxt->i4_ctb_x << log2_ctb_size)) < ps_sps->i2_pic_width_in_luma_samples)
+ && (((y_col + (ps_mv_ctxt->i4_ctb_y << log2_ctb_size))
+ < ps_sps->i2_pic_height_in_luma_samples)))
+ {
+ xp_col = ((x_col >> 4) << 4);
+ yp_col = ((y_col >> 4) << 4);
+ col_ctb_x = ps_mv_ctxt->i4_ctb_x + (xp_col >> log2_ctb_size);
+ col_ctb_y = ps_mv_ctxt->i4_ctb_y + (yp_col >> log2_ctb_size);
+ col_ctb_idx = col_ctb_x + (col_ctb_y)*(ps_sps->i2_pic_wd_in_ctb);
+ pu_cnt = ps_mv_buf_col->pu4_pic_pu_idx[col_ctb_idx];
+ pu1_pic_pu_map_ctb = ps_mv_buf_col->pu1_pic_pu_map
+ + col_ctb_idx * num_minpu_in_ctb;
+ if(xp_col == ctb_size)
+ xp_col = 0;
+ pu_cnt += pu1_pic_pu_map_ctb[(yp_col >> 2)
+ * (ctb_size / MIN_PU_SIZE) + (xp_col >> 2)];
+ ps_col_pu = &ps_mv_buf_col->ps_pic_pu[pu_cnt];
+ }
+ else
+ avail_col = 0;
+
+ if((avail_col == 0) || (ps_col_pu->b1_intra_flag == 1)
+ || (ps_slice_hdr->i1_slice_temporal_mvp_enable_flag == 0))
+ {
+ pu4_avail_col_flag[0] = 0;
+ pu4_avail_col_flag[1] = 0;
+ ps_mv_col[0].i2_mvx = 0;
+ ps_mv_col[0].i2_mvy = 0;
+ ps_mv_col[1].i2_mvx = 0;
+ ps_mv_col[1].i2_mvy = 0;
+ }
+ else
+ {
+ WORD32 au4_ref_idx_col[2];
+ WORD32 pred_flag_l0, pred_flag_l1;
+ pred_flag_l0 = (ps_col_pu->b2_pred_mode != PRED_L1);
+ pred_flag_l1 = (ps_col_pu->b2_pred_mode != PRED_L0);
+
+ if(pred_flag_l0 == 0)
+ {
+ as_mv_col[0] = ps_col_pu->mv.s_l1_mv;
+ au4_ref_idx_col[0] = ps_col_pu->mv.i1_l1_ref_idx;
+ au4_list_col[0] = 1; /* L1 */
+
+ as_mv_col[1] = ps_col_pu->mv.s_l1_mv;
+ au4_ref_idx_col[1] = ps_col_pu->mv.i1_l1_ref_idx;
+ au4_list_col[1] = 1; /* L1 */
+ }
+ else
+ {
+ if(pred_flag_l1 == 0)
+ {
+ as_mv_col[0] = ps_col_pu->mv.s_l0_mv;
+ au4_ref_idx_col[0] = ps_col_pu->mv.i1_l0_ref_idx;
+                au4_list_col[0] = 0; /* L0 */
+
+ as_mv_col[1] = ps_col_pu->mv.s_l0_mv;
+ au4_ref_idx_col[1] = ps_col_pu->mv.i1_l0_ref_idx;
+                au4_list_col[1] = 0; /* L0 */
+ }
+ else
+ {
+ if(1 == ps_slice_hdr->i1_low_delay_flag)
+ {
+ as_mv_col[0] = ps_col_pu->mv.s_l0_mv;
+ au4_ref_idx_col[0] = ps_col_pu->mv.i1_l0_ref_idx;
+ au4_list_col[0] = 0; /* L0 */
+
+ as_mv_col[1] = ps_col_pu->mv.s_l1_mv;
+ au4_ref_idx_col[1] = ps_col_pu->mv.i1_l1_ref_idx;
+ au4_list_col[1] = 1; /* L1 */
+ }
+ else
+ {
+ if(0 == ps_slice_hdr->i1_collocated_from_l0_flag)
+ {
+ as_mv_col[0] = ps_col_pu->mv.s_l0_mv;
+ au4_ref_idx_col[0] = ps_col_pu->mv.i1_l0_ref_idx;
+
+ as_mv_col[1] = ps_col_pu->mv.s_l0_mv;
+ au4_ref_idx_col[1] = ps_col_pu->mv.i1_l0_ref_idx;
+ }
+ else
+ {
+ as_mv_col[0] = ps_col_pu->mv.s_l1_mv;
+ au4_ref_idx_col[0] = ps_col_pu->mv.i1_l1_ref_idx;
+
+ as_mv_col[1] = ps_col_pu->mv.s_l1_mv;
+ au4_ref_idx_col[1] = ps_col_pu->mv.i1_l1_ref_idx;
+ }
+
+ au4_list_col[0] = ps_slice_hdr->i1_collocated_from_l0_flag; /* L"collocated_from_l0_flag" */
+ au4_list_col[1] = ps_slice_hdr->i1_collocated_from_l0_flag; /* L"collocated_from_l0_flag" */
+ }
+ }
+ }
+ avail_col = 1;
+ {
+ WORD32 cur_poc, col_poc, col_ref_poc_l0, cur_ref_poc;
+ WORD32 col_ref_poc_l0_lt, cur_ref_poc_lt;
+ WORD32 ref_idx_l0, ref_idx_l1;
+ WORD32 slice_idx;
+ pic_buf_t *ps_pic_buf;
+
+ if(use_pu_ref_idx)
+ {
+ ref_idx_l0 = ps_pu->mv.i1_l0_ref_idx;
+ ref_idx_l1 = ps_pu->mv.i1_l1_ref_idx;
+ }
+ else
+ {
+ ref_idx_l0 = 0;
+ ref_idx_l1 = 0;
+ }
+
+ col_poc = ps_mv_buf_col->i4_abs_poc;
+ cur_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+
+ slice_idx = *(ps_mv_buf_col->pu1_pic_slice_map + col_ctb_x + col_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+
+ if(au4_list_col[0] == 0)
+ {
+ col_ref_poc_l0 =
+ ps_mv_buf_col->l0_collocated_poc[slice_idx][au4_ref_idx_col[0]];
+ col_ref_poc_l0_lt =
+ (ps_mv_buf_col->u1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
+ }
+ else
+ {
+ col_ref_poc_l0 =
+ ps_mv_buf_col->l1_collocated_poc[slice_idx][au4_ref_idx_col[0]];
+ col_ref_poc_l0_lt =
+ (ps_mv_buf_col->u1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[0]] == LONG_TERM_REF);
+ }
+ /* L0 collocated mv */
+ ps_pic_buf = (pic_buf_t *)((ps_ref_list[0][ref_idx_l0].pv_pic_buf));
+ cur_ref_poc = ps_pic_buf->i4_abs_poc;
+ cur_ref_poc_lt = (ps_pic_buf->u1_used_as_ref == LONG_TERM_REF);
+
+ if(cur_ref_poc_lt == col_ref_poc_l0_lt)
+ {
+ pu4_avail_col_flag[0] = 1;
+
+ if(cur_ref_poc_lt || ((col_poc - col_ref_poc_l0) == (cur_poc - cur_ref_poc)))
+ {
+ ps_mv_col[0] = as_mv_col[0];
+ }
+ else
+ {
+ ps_mv_col[0] = as_mv_col[0];
+ if(col_ref_poc_l0 != col_poc)
+ ihevcd_scale_collocated_mv((mv_t *)(&ps_mv_col[0]), cur_ref_poc,
+ col_ref_poc_l0, col_poc, cur_poc);
+ }
+ }
+ else
+ {
+ pu4_avail_col_flag[0] = 0;
+ ps_mv_col[0].i2_mvx = 0;
+ ps_mv_col[0].i2_mvy = 0;
+ }
+ if((BSLICE == ps_slice_hdr->i1_slice_type))
+ {
+ WORD32 col_ref_poc_l1_lt, col_ref_poc_l1;
+
+ if(au4_list_col[1] == 0)
+ {
+ col_ref_poc_l1 =
+ ps_mv_buf_col->l0_collocated_poc[slice_idx][au4_ref_idx_col[1]];
+ col_ref_poc_l1_lt =
+ (ps_mv_buf_col->u1_l0_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
+ }
+ else
+ {
+ col_ref_poc_l1 =
+ ps_mv_buf_col->l1_collocated_poc[slice_idx][au4_ref_idx_col[1]];
+ col_ref_poc_l1_lt =
+ (ps_mv_buf_col->u1_l1_collocated_poc_lt[slice_idx][au4_ref_idx_col[1]] == LONG_TERM_REF);
+ }
+
+ /* L1 collocated mv */
+ ps_pic_buf = (pic_buf_t *)((ps_ref_list[1][ref_idx_l1].pv_pic_buf));
+ cur_ref_poc = ps_pic_buf->i4_abs_poc;
+ cur_ref_poc_lt = (ps_pic_buf->u1_used_as_ref == LONG_TERM_REF);
+
+ if(cur_ref_poc_lt == col_ref_poc_l1_lt)
+ {
+ pu4_avail_col_flag[1] = 1;
+
+ if(cur_ref_poc_lt || ((col_poc - col_ref_poc_l1) == (cur_poc - cur_ref_poc)))
+ {
+ ps_mv_col[1] = as_mv_col[1];
+ }
+ else
+ {
+ ps_mv_col[1] = as_mv_col[1];
+ if(col_ref_poc_l1 != col_poc)
+ ihevcd_scale_collocated_mv((mv_t *)&ps_mv_col[1], cur_ref_poc,
+ col_ref_poc_l1, col_poc, cur_poc);
+ }
+ }
+ else
+ {
+ pu4_avail_col_flag[1] = 0;
+ ps_mv_col[1].i2_mvx = 0;
+ ps_mv_col[1].i2_mvy = 0;
+ }
+ }
+ else
+ {
+ pu4_avail_col_flag[1] = 0;
+ }
+ }
+ }
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs Motion Vector Merge candidates derivation
+ *
+ * @par Description:
+ * MV merge list is computed using neighbor mvs and colocated mv
+ *
+ * @param[in] ps_ctxt
+ * pointer to mv predictor context
+ *
+ * @param[in] ps_top_nbr_4x4
+ * pointer to top 4x4 nbr structure
+ *
+ * @param[in] ps_left_nbr_4x4
+ * pointer to left 4x4 nbr structure
+ *
+ * @param[in] ps_top_left_nbr_4x4
+ * pointer to top left 4x4 nbr structure
+ *
+ * @param[in] left_nbr_4x4_strd
+ * left nbr buffer stride in terms of 4x4 units
+ *
+ * @param[in] ps_avail_flags
+ * Neighbor availability flags container
+ *
+ * @param[in] ps_col_mv
+ * Colocated MV pointer
+ *
+ * @param[in] ps_pu
+ * Current Partition PU structure pointer
+ *
+ * @param[in] part_mode
+ * Partition mode @sa PART_SIZE_E
+ *
+ * @param[in] part_idx
+ * Partition idx of current partition inside CU
+ *
+ * @param[in] single_mcl_flag
+ * Single MCL flag based on 8x8 CU and Parallel merge value
+ *
+ * @param[out] ps_merge_cand_list
+ * pointer to store MV merge candidates list
+ *
+ * @returns
+ * None
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+void ihevcd_mv_merge(mv_ctxt_t *ps_mv_ctxt,
+ UWORD32 *pu4_top_pu_idx,
+ UWORD32 *pu4_left_pu_idx,
+ WORD32 left_nbr_4x4_strd,
+ pu_t *ps_pu,
+ WORD32 part_mode,
+ WORD32 part_idx,
+ WORD32 part_wd,
+ WORD32 part_ht,
+ WORD32 part_pos_x,
+ WORD32 part_pos_y,
+ WORD32 single_mcl_flag,
+ WORD32 lb_avail,
+ WORD32 l_avail,
+ WORD32 tr_avail,
+ WORD32 t_avail,
+ WORD32 tl_avail)
+{
+ /******************************************************/
+ /* Spatial Merge Candidates */
+ /******************************************************/
+ slice_header_t *ps_slice_hdr;
+ pu_t as_pu_merge_list[MAX_NUM_MERGE_CAND];
+ pps_t *ps_pps;
+ ref_list_t *ps_ref_list[2];
+ WORD32 sum_avail_a0_a1_b0_b1 = 0; /*Sum of availability of A0, A1, B0, B1*/
+ WORD32 nbr_x, nbr_y;
+ WORD32 nbr_avail[MAX_NUM_MV_NBR];
+ WORD32 merge_shift;
+ WORD32 nbr_pu_idx;
+ pu_t *ps_nbr_pu[MAX_NUM_MV_NBR];
+ WORD32 max_num_merge_cand;
+ WORD32 candidate_cnt;
+ WORD32 pos_x_merge_shift, pos_y_merge_shift;
+
+ ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+ ps_pps = ps_mv_ctxt->ps_pps;
+ /* Initializing reference list */
+ ps_ref_list[0] = ps_slice_hdr->as_ref_pic_list0;
+ ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list1;
+ if(PSLICE == ps_slice_hdr->i1_slice_type)
+ ps_ref_list[1] = ps_slice_hdr->as_ref_pic_list0;
+
+ candidate_cnt = 0;
+ /*******************************************/
+ /* Neighbor location: Graphical indication */
+ /* */
+ /* B2 _____________B1 B0 */
+ /* | | */
+ /* | | */
+ /* | | */
+ /* | PU ht| */
+ /* | | */
+ /* | | */
+ /* A1|______wd_______| */
+ /* A0 */
+ /* */
+ /*******************************************/
+
+ merge_shift = ps_pps->i1_log2_parallel_merge_level;
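+
+    /* Neighbours that fall in the same parallel merge region (of size
+     * 1 << log2_parallel_merge_level) are treated as unavailable below, so
+     * that merge lists for all PUs within a region can be derived in
+     * parallel */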
+
+ /* Availability check */
+ /* A1 */
+ nbr_x = part_pos_x - 1;
+ nbr_y = part_pos_y + part_ht - 1; /* A1 */
+
+ nbr_pu_idx = *(pu4_left_pu_idx + ((nbr_y - part_pos_y) >> 2) * left_nbr_4x4_strd);
+ ps_nbr_pu[NBR_A1] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+ nbr_avail[NBR_A1] = l_avail
+ && (!ps_nbr_pu[NBR_A1]->b1_intra_flag); /* A1 */
+
+ pos_x_merge_shift = (part_pos_x >> merge_shift);
+ pos_y_merge_shift = (part_pos_y >> merge_shift);
+ max_num_merge_cand = ps_pu->b3_merge_idx + 1;
+
+ {
+ if(nbr_avail[NBR_A1])
+ {
+ /* if at same merge level */
+ if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+ (pos_y_merge_shift == (nbr_y >> merge_shift)))
+ {
+ nbr_avail[NBR_A1] = 0;
+ }
+
+            /* SPEC JCTVC-K1003_v9 handles unavailable candidates differently */
+            /* from the software. For non-square parts and the second part,   */
+            /* ideally nothing from the 1st partition should be used as per   */
+            /* the spec, but the HM 8.2 dev version does not adhere to this;  */
+            /* currently the code follows HM.                                 */
+
+            /* if single_mcl_flag is 0 and this is the second of the two parts in the CU */
+ else if((single_mcl_flag == 0) && (part_idx == 1) &&
+ ((part_mode == PART_Nx2N) || (part_mode == PART_nLx2N) ||
+ (part_mode == PART_nRx2N)))
+ {
+ nbr_avail[NBR_A1] = 0;
+ }
+ sum_avail_a0_a1_b0_b1 += nbr_avail[NBR_A1];
+ if(nbr_avail[NBR_A1])
+ {
+ as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_A1];
+ candidate_cnt++;
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+ ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+ return;
+ }
+ }
+ }
+ }
+
+ /* B1 */
+ nbr_x = part_pos_x + part_wd - 1;
+ nbr_y = part_pos_y - 1;
+
+ nbr_pu_idx = *(pu4_top_pu_idx + ((nbr_x - part_pos_x) >> 2));
+ ps_nbr_pu[NBR_B1] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+ nbr_avail[NBR_B1] = t_avail
+ && (!ps_nbr_pu[NBR_B1]->b1_intra_flag); /* B1 */
+
+ {
+ WORD32 avail_flag;
+ avail_flag = nbr_avail[NBR_B1];
+
+ if(nbr_avail[NBR_B1])
+ {
+ /* if at same merge level */
+ if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+ (pos_y_merge_shift == (nbr_y >> merge_shift)))
+ {
+ nbr_avail[NBR_B1] = 0;
+ avail_flag = 0;
+ }
+
+            /* if single_mcl_flag is 0 and this is the second of the two parts in the CU */
+ else if((single_mcl_flag == 0) && (part_idx == 1) &&
+ ((part_mode == PART_2NxN) || (part_mode == PART_2NxnU) ||
+ (part_mode == PART_2NxnD)))
+ {
+ nbr_avail[NBR_B1] = 0;
+ avail_flag = 0;
+ }
+
+ else if(nbr_avail[NBR_A1])
+ {
+ avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_A1], ps_nbr_pu[NBR_B1]);
+ }
+
+ sum_avail_a0_a1_b0_b1 += avail_flag;
+ if(avail_flag)
+ {
+ as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_B1];
+ candidate_cnt++;
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+ ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+ return;
+ }
+ }
+ }
+ }
+ /* B0 */
+ nbr_x = part_pos_x + part_wd;
+ nbr_y = part_pos_y - 1;
+
+ nbr_pu_idx = *(pu4_top_pu_idx + ((nbr_x - part_pos_x) >> 2));
+ ps_nbr_pu[NBR_B0] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+ nbr_avail[NBR_B0] = tr_avail
+ && (!ps_nbr_pu[NBR_B0]->b1_intra_flag); /* B0 */
+
+ {
+ WORD32 avail_flag;
+ avail_flag = nbr_avail[NBR_B0];
+
+ /* if at same merge level */
+ if(nbr_avail[NBR_B0])
+ {
+ if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+ (pos_y_merge_shift == (nbr_y >> merge_shift)))
+ {
+ nbr_avail[NBR_B0] = 0;
+ avail_flag = 0;
+ }
+ else if(nbr_avail[NBR_B1])
+ {
+ avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_B1], ps_nbr_pu[NBR_B0]);
+ }
+
+ sum_avail_a0_a1_b0_b1 += avail_flag;
+ if(avail_flag)
+ {
+ as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_B0];
+ candidate_cnt++;
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+ ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+ return;
+ }
+ }
+ }
+ }
+ /* A0 */
+ nbr_x = part_pos_x - 1;
+ nbr_y = part_pos_y + part_ht; /* A0 */
+
+ nbr_pu_idx = *(pu4_left_pu_idx + ((nbr_y - part_pos_y) >> 2) * left_nbr_4x4_strd);
+ ps_nbr_pu[NBR_A0] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+ nbr_avail[NBR_A0] = lb_avail
+ && (!ps_nbr_pu[NBR_A0]->b1_intra_flag); /* A0 */
+ {
+ WORD32 avail_flag;
+ avail_flag = nbr_avail[NBR_A0];
+
+ if(nbr_avail[NBR_A0])
+ {
+ /* if at same merge level */
+ if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+ (pos_y_merge_shift == (nbr_y >> merge_shift)))
+ {
+ nbr_avail[NBR_A0] = 0;
+ avail_flag = 0;
+ }
+ else if(nbr_avail[NBR_A1])
+ {
+ avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_A1], ps_nbr_pu[NBR_A0]);
+ }
+
+ sum_avail_a0_a1_b0_b1 += avail_flag;
+ if(avail_flag)
+ {
+ as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_A0];
+ candidate_cnt++;
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+ ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+ return;
+ }
+ }
+ }
+ }
+ /* B2 */
+
+ nbr_x = part_pos_x - 1;
+ nbr_y = part_pos_y - 1; /* B2 */
+
+ nbr_pu_idx = *(pu4_top_pu_idx + ((nbr_x - part_pos_x) >> 2));
+ ps_nbr_pu[NBR_B2] = ps_mv_ctxt->ps_pic_pu + nbr_pu_idx;
+
+ nbr_avail[NBR_B2] = tl_avail
+ && (!ps_nbr_pu[NBR_B2]->b1_intra_flag); /* B2 */
+
+ {
+ WORD32 avail_flag;
+ avail_flag = nbr_avail[NBR_B2];
+
+ if(nbr_avail[NBR_B2])
+ {
+ /* if at same merge level */
+ if(pos_x_merge_shift == (nbr_x >> merge_shift) &&
+ (pos_y_merge_shift == (nbr_y >> merge_shift)))
+ {
+ nbr_avail[NBR_B2] = 0;
+ avail_flag = 0;
+ }
+ else if(4 == sum_avail_a0_a1_b0_b1)
+ {
+ avail_flag = 0;
+ }
+
+ else
+ {
+ if(nbr_avail[NBR_A1])
+ {
+ avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_A1], ps_nbr_pu[NBR_B2]);
+ }
+
+ if(avail_flag && nbr_avail[NBR_B1])
+ {
+ avail_flag = !ihevcd_compare_pu_t(ps_nbr_pu[NBR_B1], ps_nbr_pu[NBR_B2]);
+ }
+ }
+
+ if(avail_flag)
+ {
+ as_pu_merge_list[candidate_cnt] = *ps_nbr_pu[NBR_B2];
+ candidate_cnt++;
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+ ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+ return;
+ }
+ }
+ }
+ }
+
+ /***********************************************************/
+ /* Collocated MV prediction */
+ /***********************************************************/
+#if 1
+ {
+ mv_t as_mv_col[2];
+ WORD32 avail_col_flag[2] = { 0 }, x_col, y_col;
+ WORD32 avail_col_l0, avail_col_l1;
+// ihevcd_collocated_mvp(ps_mv_ctxt,ps_pu,part_pos_x,part_pos_y,part_wd,part_ht,as_mv_col,avail_col_flag,0);
+
+ /* Checking Collocated MV availability at Bottom right of PU*/
+ x_col = part_pos_x + part_wd;
+ y_col = part_pos_y + part_ht;
+ ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 0, x_col, y_col);
+
+ avail_col_l0 = avail_col_flag[0];
+ avail_col_l1 = avail_col_flag[1];
+
+ if(avail_col_l0 || avail_col_l1)
+ {
+ as_pu_merge_list[candidate_cnt].mv.s_l0_mv = as_mv_col[0];
+ as_pu_merge_list[candidate_cnt].mv.s_l1_mv = as_mv_col[1];
+ }
+
+ if(avail_col_l0 == 0 || avail_col_l1 == 0)
+ {
+ /* Checking Collocated MV availability at Center of PU */
+ x_col = part_pos_x + (part_wd >> 1);
+ y_col = part_pos_y + (part_ht >> 1);
+ ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 0, x_col, y_col);
+
+ if(avail_col_l0 == 0)
+ {
+ as_pu_merge_list[candidate_cnt].mv.s_l0_mv = as_mv_col[0];
+ }
+ if(avail_col_l1 == 0)
+ {
+ as_pu_merge_list[candidate_cnt].mv.s_l1_mv = as_mv_col[1];
+ }
+
+ avail_col_l0 |= avail_col_flag[0];
+ avail_col_l1 |= avail_col_flag[1];
+ }
+
+ as_pu_merge_list[candidate_cnt].mv.i1_l0_ref_idx = 0;
+ as_pu_merge_list[candidate_cnt].mv.i1_l1_ref_idx = 0;
+ as_pu_merge_list[candidate_cnt].b2_pred_mode = avail_col_l0 ? (avail_col_l1 ? PRED_BI : PRED_L0) : PRED_L1;
+
+ candidate_cnt += (avail_col_l0 || avail_col_l1);
+
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv = as_pu_merge_list[candidate_cnt - 1].mv;
+ ps_pu[0].b2_pred_mode = as_pu_merge_list[candidate_cnt - 1].b2_pred_mode;
+ return;
+ }
+
+ }
+#endif
+ {
+ WORD32 slice_type;
+
+ slice_type = ps_slice_hdr->i1_slice_type;
+ /* Colocated mv has to be added to list, if available */
+
+ /******************************************************/
+ /* Bi pred merge candidates */
+ /******************************************************/
+ if(slice_type == BSLICE)
+ {
+ if((candidate_cnt > 1) && (candidate_cnt < MAX_NUM_MERGE_CAND))
+ {
+ WORD32 priority_list0[12] =
+ { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
+ WORD32 priority_list1[12] =
+ { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
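+
+                /* Combined bi-predictive candidates: entry k pairs the L0
+                 * motion of merge candidate priority_list0[k] with the L1
+                 * motion of merge candidate priority_list1[k] */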
+ WORD32 l0_cand, l1_cand;
+ WORD32 bi_pred_idx = 0;
+ WORD32 total_bi_pred_cand =
+ candidate_cnt * (candidate_cnt - 1);
+
+ while(bi_pred_idx < total_bi_pred_cand)
+ {
+ l0_cand = priority_list0[bi_pred_idx];
+ l1_cand = priority_list1[bi_pred_idx];
+
+ if((as_pu_merge_list[l0_cand].b2_pred_mode != PRED_L1)
+ && (as_pu_merge_list[l1_cand].b2_pred_mode
+ != PRED_L0))
+ {
+ WORD8 i1_l0_ref_idx, i1_l1_ref_idx;
+ mv_t s_l0_mv, s_l1_mv;
+ pic_buf_t *ps_pic_buf_l0, *ps_pic_buf_l1;
+
+ i1_l0_ref_idx = as_pu_merge_list[l0_cand].mv.i1_l0_ref_idx;
+ i1_l1_ref_idx = as_pu_merge_list[l1_cand].mv.i1_l1_ref_idx;
+ ps_pic_buf_l0 = (pic_buf_t *)((ps_ref_list[0][i1_l0_ref_idx].pv_pic_buf));
+ ps_pic_buf_l1 = (pic_buf_t *)((ps_ref_list[1][i1_l1_ref_idx].pv_pic_buf));
+ s_l0_mv = as_pu_merge_list[l0_cand].mv.s_l0_mv;
+ s_l1_mv = as_pu_merge_list[l1_cand].mv.s_l1_mv;
+
+ if((ps_pic_buf_l0->i4_abs_poc != ps_pic_buf_l1->i4_abs_poc)
+ || (s_l0_mv.i2_mvx != s_l1_mv.i2_mvx)
+ || (s_l0_mv.i2_mvy != s_l1_mv.i2_mvy))
+ {
+ candidate_cnt++;
+ if(candidate_cnt == max_num_merge_cand)
+ {
+ ps_pu[0].mv.s_l0_mv = s_l0_mv;
+ ps_pu[0].mv.s_l1_mv = s_l1_mv;
+ ps_pu[0].mv.i1_l0_ref_idx = i1_l0_ref_idx;
+ ps_pu[0].mv.i1_l1_ref_idx = i1_l1_ref_idx;
+ ps_pu[0].b2_pred_mode = PRED_BI;
+ return;
+ }
+ }
+ }
+
+ bi_pred_idx++;
+
+ if((bi_pred_idx == total_bi_pred_cand)
+ || (candidate_cnt == MAX_NUM_MERGE_CAND))
+ {
+ break;
+ }
+ }
+ }
+ }
+
+ /******************************************************/
+ /* Zero merge candidates */
+ /******************************************************/
+// if(candidate_cnt < max_num_merge_cand)
+ {
+ WORD32 num_ref_idx;
+ WORD32 zero_idx;
+
+ zero_idx = max_num_merge_cand - candidate_cnt - 1;
+
+ if(slice_type == PSLICE)
+ num_ref_idx = ps_slice_hdr->i1_num_ref_idx_l0_active;
+ else
+ /* Slice type B */
+ num_ref_idx = MIN(ps_slice_hdr->i1_num_ref_idx_l0_active, ps_slice_hdr->i1_num_ref_idx_l1_active);
+
+ if(zero_idx >= num_ref_idx)
+ zero_idx = 0;
+
+ ps_pu[0].mv.i1_l0_ref_idx = zero_idx;
+ if(slice_type == PSLICE)
+ {
+ ps_pu[0].mv.i1_l1_ref_idx = 0;
+ ps_pu[0].b2_pred_mode = PRED_L0;
+ }
+ else /* Slice type B */
+ {
+ ps_pu[0].mv.i1_l1_ref_idx = zero_idx;
+ ps_pu[0].b2_pred_mode = PRED_BI;
+ }
+
+ ps_pu[0].mv.s_l0_mv.i2_mvx = 0;
+ ps_pu[0].mv.s_l0_mv.i2_mvy = 0;
+ ps_pu[0].mv.s_l1_mv.i2_mvx = 0;
+ ps_pu[0].mv.s_l1_mv.i2_mvy = 0;
+
+ candidate_cnt++;
+ }
+ }
+
+ return;
+}
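+
+/* Note: the candidate order above (A1, B1, B0, A0, B2, temporal, combined
+ * bi-predictive, zero) follows the HEVC merge list construction order; since
+ * only the candidate at merge_idx is needed, the function returns as soon as
+ * candidate_cnt reaches merge_idx + 1 (max_num_merge_cand). */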
+
+
diff --git a/decoder/ihevcd_mv_merge.h b/decoder/ihevcd_mv_merge.h
new file mode 100644
index 0000000..52a7e98
--- /dev/null
+++ b/decoder/ihevcd_mv_merge.h
@@ -0,0 +1,111 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_mv_merge.h
+*
+* @brief
+* This file contains function prototypes of MV Merge candidates list
+* derivation functions and corresponding structure and macro definitions
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_MV_MERGE_H_
+#define _IHEVCD_MV_MERGE_H_
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+#define MAX_NUM_MV_NBR 5
+
+/*****************************************************************************/
+/* Function Macros */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Typedefs */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+typedef enum
+{
+ NBR_A0 = 0,
+ NBR_A1 = 1,
+ NBR_B0 = 2,
+ NBR_B1 = 3,
+ NBR_B2 = 4,
+
+ /* should be last */
+ MAX_NUM_NBRS
+}MV_MERGE_NBRS_T;
+
+/*****************************************************************************/
+/* Structure */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Extern Variable Declarations */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Extern Function Declarations */
+/*****************************************************************************/
+void ihevcd_mv_merge(mv_ctxt_t *ps_mv_ctxt,
+ UWORD32 *pu4_top_pu_idx,
+ UWORD32 *pu4_left_pu_idx,
+ WORD32 left_nbr_4x4_strd,
+ pu_t *ps_pu,
+ WORD32 part_mode,
+ WORD32 part_idx,
+ WORD32 part_wd,
+ WORD32 part_ht,
+ WORD32 part_pos_x,
+ WORD32 part_pos_y,
+ WORD32 single_mcl_flag,
+ WORD32 lb_avail,
+ WORD32 l_avail,
+ WORD32 tr_avail,
+ WORD32 t_avail,
+ WORD32 tl_avail);
+void ihevcd_collocated_mvp(mv_ctxt_t *ps_mv_ctxt,
+ pu_t *ps_pu,
+ mv_t *ps_mv_col,
+ WORD32 *pu4_avail_col_flag,
+ WORD32 use_pu_ref_idx,
+ WORD32 x_col,
+ WORD32 y_col);
+
+void ihevcd_scale_collocated_mv(mv_t *ps_mv,
+ WORD32 cur_ref_poc,
+ WORD32 col_ref_poc,
+ WORD32 col_poc,
+ WORD32 cur_poc);
+#endif /* _IHEVCD_MV_MERGE_H_ */
diff --git a/decoder/ihevcd_mv_pred.c b/decoder/ihevcd_mv_pred.c
new file mode 100644
index 0000000..e811198
--- /dev/null
+++ b/decoder/ihevcd_mv_pred.c
@@ -0,0 +1,874 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_mv_pred.c
+ *
+ * @brief
+ * Contains functions for motion vector prediction
+ *
+ * @author
+ * Ittiam
+ *
+ * @par List of Functions:
+ * - ihevcd_scale_mv()
+ * - ihevcd_scale_collocated_mv()
+ * - ihevcd_mv_pred()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_fmt_conv.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_mv_merge.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief Function to scale a motion vector
+ *
+ *
+ * @par Description:
+ * Scales mv by the ratio of the POC distance between the current picture and
+ * its reference to the POC distance between the current picture and the
+ * neighbour's reference
+ *
+ * @param[inout] ps_mv
+ * motion vector to be scaled
+ *
+ * @param[in] cur_ref_poc
+ * Current PU reference pic poc
+ *
+ * @param[in] nbr_ref_poc
+ * Neighbor PU reference pic poc
+ *
+ * @param[in] cur_poc
+ * Picture order count of current pic
+ *
+ * @returns
+ * None
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+void ihevcd_scale_mv(mv_t *ps_mv,
+ WORD32 cur_ref_poc,
+ WORD32 nbr_ref_poc,
+ WORD32 cur_poc)
+{
+ WORD32 td, tb, tx;
+ WORD32 dist_scale_factor;
+ WORD32 mvx, mvy;
+
+ td = CLIP_S8(cur_poc - nbr_ref_poc);
+ tb = CLIP_S8(cur_poc - cur_ref_poc);
+
+ if(0 != td)
+ {
+ tx = (16384 + (abs(td) >> 1)) / td;
+
+ dist_scale_factor = (tb * tx + 32) >> 6;
+ dist_scale_factor = CLIP3(dist_scale_factor, -4096, 4095);
+
+ mvx = ps_mv->i2_mvx;
+ mvy = ps_mv->i2_mvy;
+
+ mvx = SIGN(dist_scale_factor * mvx)
+ * ((abs(dist_scale_factor * mvx) + 127) >> 8);
+ mvy = SIGN(dist_scale_factor * mvy)
+ * ((abs(dist_scale_factor * mvy) + 127) >> 8);
+
+ ps_mv->i2_mvx = CLIP_S16(mvx);
+ ps_mv->i2_mvy = CLIP_S16(mvy);
+ }
+}
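+
+/* Worked example (illustrative numbers only): with cur_poc = 8,
+ * cur_ref_poc = 4 and nbr_ref_poc = 0, td = 8 and tb = 4, so
+ * tx = (16384 + 4) / 8 = 2048 and
+ * dist_scale_factor = (4 * 2048 + 32) >> 6 = 128, i.e. 0.5 in Q8 format.
+ * A neighbour MV of (10, -6) therefore scales to (5, -3). */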
+
+/**
+ *******************************************************************************
+ *
+ * @brief Function to scale a collocated (temporal) motion vector
+ *
+ *
+ * @par Description:
+ * Scales mv by the ratio of the POC distance between the current picture and
+ * its reference to the POC distance between the collocated picture and its
+ * reference
+ *
+ * @param[inout] ps_mv
+ * motion vector to be scaled
+ *
+ * @param[in] cur_ref_poc
+ * Current PU reference pic poc
+ *
+ * @param[in] col_ref_poc
+ * Collocated PU reference pic poc
+ *
+ * @param[in] col_poc
+ * Picture order count of collocated pic
+ *
+ * @param[in] cur_poc
+ * Picture order count of current pic
+ *
+ * @returns
+ * None
+ *
+ * @remarks
+ *
+ *******************************************************************************
+ */
+void ihevcd_scale_collocated_mv(mv_t *ps_mv,
+ WORD32 cur_ref_poc,
+ WORD32 col_ref_poc,
+ WORD32 col_poc,
+ WORD32 cur_poc)
+{
+ WORD32 td, tb, tx;
+ WORD32 dist_scale_factor;
+ WORD32 mvx, mvy;
+
+ td = CLIP_S8(col_poc - col_ref_poc);
+ tb = CLIP_S8(cur_poc - cur_ref_poc);
+
+ tx = (16384 + (abs(td) >> 1)) / td;
+
+ dist_scale_factor = (tb * tx + 32) >> 6;
+ dist_scale_factor = CLIP3(dist_scale_factor, -4096, 4095);
+
+ mvx = ps_mv->i2_mvx;
+ mvy = ps_mv->i2_mvy;
+
+ mvx = SIGN(dist_scale_factor * mvx)
+ * ((abs(dist_scale_factor * mvx) + 127) >> 8);
+ mvy = SIGN(dist_scale_factor * mvy)
+ * ((abs(dist_scale_factor * mvy) + 127) >> 8);
+
+ ps_mv->i2_mvx = CLIP_S16(mvx);
+ ps_mv->i2_mvy = CLIP_S16(mvy);
+}
+
+#if 1
+#define CHECK_NBR_MV_ST(pi4_avail_flag, cur_ref_poc, u1_nbr_pred_flag, nbr_ref_poc, \
+ ps_mv, ps_nbr_mv ) \
+{ \
+ if((u1_nbr_pred_flag) && (cur_ref_poc == nbr_ref_poc)) \
+ { \
+ *pi4_avail_flag = 1; \
+ *ps_mv = *ps_nbr_mv; \
+ break ; \
+ } \
+}
+#define CHECK_NBR_MV_LT(pi4_avail_flag, u1_cur_ref_lt, cur_poc, cur_ref_poc, \
+ u1_nbr_pred_flag, u1_nbr_ref_lt, nbr_ref_poc, \
+ ps_mv, ps_nbr_mv ) \
+{ \
+ WORD32 cur_lt, nbr_lt; \
+ cur_lt = (LONG_TERM_REF == (u1_cur_ref_lt)); \
+ nbr_lt = (LONG_TERM_REF == (u1_nbr_ref_lt)); \
+ if((u1_nbr_pred_flag) && (cur_lt == nbr_lt)) \
+ { \
+ *pi4_avail_flag = 1; \
+ *ps_mv = *ps_nbr_mv; \
+ if(SHORT_TERM_REF == u1_nbr_ref_lt) \
+ { \
+ ihevcd_scale_mv(ps_mv, cur_ref_poc, nbr_ref_poc, \
+ cur_poc); \
+ } \
+ break ; \
+ } \
+}
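+
+/* Note: the macro forms above end with a break so that the neighbour scan in
+ * GET_MV_NBR_ST/LT stops at the first matching predictor; the function forms
+ * in the #else branch are kept for reference and do not break out of the
+ * caller's loop. */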
+
+#else
+
+void CHECK_NBR_MV_ST(WORD32 *pi4_avail_flag, WORD32 cur_ref_poc, UWORD8 u1_nbr_pred_flag, WORD32 nbr_ref_poc,
+ mv_t *ps_mv, mv_t *ps_nbr_mv )
+{
+ if((u1_nbr_pred_flag) && (cur_ref_poc == nbr_ref_poc))
+ {
+ *pi4_avail_flag = 1;
+ *ps_mv = *ps_nbr_mv;
+ }
+}
+void CHECK_NBR_MV_LT(WORD32 *pi4_avail_flag, UWORD8 u1_cur_ref_lt, WORD32 cur_poc, WORD32 cur_ref_poc,
+ UWORD8 u1_nbr_pred_flag, UWORD8 u1_nbr_ref_lt, WORD32 nbr_ref_poc,
+ mv_t *ps_mv, mv_t *ps_nbr_mv )
+{
+ WORD32 cur_lt, nbr_lt;
+ cur_lt = (LONG_TERM_REF == u1_cur_ref_lt);
+ nbr_lt = (LONG_TERM_REF == u1_nbr_ref_lt);
+
+ if((u1_nbr_pred_flag) && (cur_lt == nbr_lt))
+ {
+ *pi4_avail_flag = 1;
+ *ps_mv = *ps_nbr_mv;
+ if(SHORT_TERM_REF == u1_nbr_ref_lt)
+ {
+ ihevcd_scale_mv(ps_mv, cur_ref_poc, nbr_ref_poc,
+ cur_poc);
+ }
+ }
+}
+#endif
+
+#if 1
+void GET_MV_NBR_ST(ref_list_t **ps_ref_pic_list, WORD32 *pi4_avail_flag, pic_buf_t *ps_cur_pic_buf_lx, pu_t **aps_nbr_pu, mv_t *ps_mv, WORD32 num_nbrs, WORD32 lx)
+{
+ WORD32 i, nbr_pred_lx;
+ pic_buf_t *ps_nbr_pic_buf_lx;
+ /* Short Term */
+ /* L0 */
+ if(0 == lx)
+ {
+ for(i = 0; i < num_nbrs; i++)
+ {
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+ ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+ ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+ }
+ }
+ /* L1 */
+ else
+ {
+ for(i = 0; i < num_nbrs; i++)
+ {
+ nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+ ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx,
+ ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+ }
+ }
+}
+
+void GET_MV_NBR_LT(ref_list_t **ps_ref_pic_list, slice_header_t *ps_slice_hdr, WORD32 *pi4_avail_flag, pic_buf_t *ps_cur_pic_buf_lx, pu_t **aps_nbr_pu, mv_t *ps_mv, WORD32 num_nbrs, WORD32 lx)
+{
+ WORD32 i, nbr_pred_lx;
+ pic_buf_t *ps_nbr_pic_buf_lx;
+ /* Long Term*/
+ /* L0 */
+ if(0 == lx)
+ {
+ for(i = 0; i < num_nbrs; i++)
+ {
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+ nbr_pred_lx,
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+ ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+
+ nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+ nbr_pred_lx,
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+ ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+ }
+ }
+ /* L1 */
+ else
+ {
+ for(i = 0; i < num_nbrs; i++)
+ {
+ nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+ nbr_pred_lx,
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+ ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv);
+
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode);
+ ps_nbr_pic_buf_lx = (pic_buf_t *)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf));
+ CHECK_NBR_MV_LT(pi4_avail_flag, ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc,
+ nbr_pred_lx,
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc,
+ ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv);
+ }
+ }
+}
+#else
+
+#define GET_MV_NBR_ST(ps_ref_pic_list, pi4_avail_flag, ps_cur_pic_buf_lx, aps_nbr_pu, ps_mv, num_nbrs, lx) \
+{ \
+ WORD32 i, nbr_pred_lx; \
+ pic_buf_t *ps_nbr_pic_buf_lx; \
+ /* Short Term */ \
+ /* L0 */ \
+ if(0 == lx) \
+ { \
+ for(i=0; i< num_nbrs; i++) \
+ { \
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf)); \
+ CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx , \
+ ps_nbr_pic_buf_lx->i4_abs_poc,ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv ); \
+            \
+            nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf)); \
+ CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx, \
+ ps_nbr_pic_buf_lx->i4_abs_poc,ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv ); \
+ } \
+ } \
+ /* L1 */ \
+ else \
+ { \
+ for(i=0; i< num_nbrs; i++) \
+ { \
+ nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf)); \
+        CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx, \
+                        ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv); \
+ \
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf)); \
+        CHECK_NBR_MV_ST(pi4_avail_flag, ps_cur_pic_buf_lx->i4_abs_poc, nbr_pred_lx, \
+                        ps_nbr_pic_buf_lx->i4_abs_poc, ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv); \
+ } \
+ } \
+}
+
+#define GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, pi4_avail_flag, ps_cur_pic_buf_lx, aps_nbr_pu, ps_mv, num_nbrs, lx) \
+{ \
+ WORD32 i, nbr_pred_lx; \
+ pic_buf_t *ps_nbr_pic_buf_lx; \
+ /* Long Term*/ \
+ /* L0 */ \
+ if(0 == lx) \
+ { \
+ for(i=0; i< num_nbrs; i++) \
+ { \
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf)); \
+ CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc, \
+ nbr_pred_lx, \
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc, \
+ ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv); \
+ \
+ nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf)); \
+ CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc, \
+ nbr_pred_lx, \
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc, \
+ ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv); \
+ } \
+ } \
+ /* L1 */ \
+ else \
+ { \
+ for(i=0; i< num_nbrs; i++) \
+ { \
+ nbr_pred_lx = (PRED_L0 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[1][aps_nbr_pu[i]->mv.i1_l1_ref_idx].pv_pic_buf)); \
+ CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc, \
+ nbr_pred_lx, \
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc, \
+ ps_mv, &aps_nbr_pu[i]->mv.s_l1_mv); \
+ \
+ nbr_pred_lx = (PRED_L1 != aps_nbr_pu[i]->b2_pred_mode); \
+ ps_nbr_pic_buf_lx = (pic_buf_t*)((ps_ref_pic_list[0][aps_nbr_pu[i]->mv.i1_l0_ref_idx].pv_pic_buf)); \
+ CHECK_NBR_MV_LT(pi4_avail_flag,ps_cur_pic_buf_lx->u1_used_as_ref, ps_slice_hdr->i4_abs_pic_order_cnt, ps_cur_pic_buf_lx->i4_abs_poc, \
+ nbr_pred_lx, \
+ ps_nbr_pic_buf_lx->u1_used_as_ref, ps_nbr_pic_buf_lx->i4_abs_poc, \
+ ps_mv, &aps_nbr_pu[i]->mv.s_l0_mv); \
+ } \
+ } \
+}
+#endif
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs motion vector prediction and returns a list of MVs
+ *
+ * @par Description:
+ *  The MV predictor list is computed using neighbor MVs and the colocated MV
+ *
+ * @param[in] ps_ctxt
+ * pointer to mv predictor context
+ *
+ * @param[in] ps_top_nbr_4x4
+ * pointer to top 4x4 nbr structure
+ *
+ * @param[in] ps_left_nbr_4x4
+ * pointer to left 4x4 nbr structure
+ *
+ * @param[in] ps_top_left_nbr_4x4
+ * pointer to top left 4x4 nbr structure
+ *
+ * @param[in] left_nbr_4x4_strd
+ * left nbr buffer stride in terms of 4x4 units
+ *
+ * @param[in] ps_avail_flags
+ * Neighbor availability flags container
+ *
+ * @param[in] ps_col_mv
+ * Colocated MV pointer
+ *
+ * @param[in] ps_pu
+ * Current partition's PU structure pointer
+ *
+ * @param[inout] ps_pred_mv
+ * pointer to store predicted MV list
+ *
+ * @returns
+ * None
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+void ihevcd_mv_pred(mv_ctxt_t *ps_mv_ctxt,
+ UWORD32 *pu4_top_pu_idx,
+ UWORD32 *pu4_left_pu_idx,
+ UWORD32 *pu4_top_left_pu_idx,
+ WORD32 left_nbr_4x4_strd,
+ pu_t *ps_pu,
+ WORD32 lb_avail,
+ WORD32 l_avail,
+ WORD32 tr_avail,
+ WORD32 t_avail,
+ WORD32 tl_avail,
+ pu_mv_t *ps_pred_mv)
+{
+ slice_header_t *ps_slice_hdr;
+ ref_list_t *ps_ref_pic_list[2];
+ pu_t *ps_pic_pu;
+ WORD32 max_l0_mvp_cand, max_l1_mvp_cand;
+ WORD32 l0_done_flag, l1_done_flag;
+ WORD32 num_l0_mvp_cand, num_l1_mvp_cand;
+    WORD32 is_scaled_flag_list; /* Indicates whether A0 or A1 is available */
+ WORD32 avail_a_flag[2];
+ mv_t as_mv_a[2];
+ WORD32 part_pos_x;
+ WORD32 part_pos_y;
+ WORD32 part_wd;
+ WORD32 part_ht;
+ pic_buf_t *ps_cur_pic_buf_l0, *ps_cur_pic_buf_l1;
+ WORD32 nbr_avail[3]; /*[A0/A1] */ /* [B0/B1/B2] */
+ pu_t *aps_nbr_pu[3]; /*[A0/A1] */ /* [B0/B1/B2] */
+ WORD32 num_nbrs = 0;
+
+ /*******************************************/
+ /* Neighbor location: Graphical indication */
+ /* */
+ /* B2 _____________B1 B0 */
+ /* | | */
+ /* | | */
+ /* | | */
+ /* | PU ht| */
+ /* | | */
+ /* | | */
+ /* A1|______wd_______| */
+ /* A0 */
+ /* */
+ /*******************************************/
+
+ ps_slice_hdr = ps_mv_ctxt->ps_slice_hdr;
+ ps_pic_pu = ps_mv_ctxt->ps_pic_pu;
+ max_l0_mvp_cand = ps_pu->b1_l0_mvp_idx + 1;
+ max_l1_mvp_cand = ps_pu->b1_l1_mvp_idx + 1;
+ num_l0_mvp_cand = 0;
+ num_l1_mvp_cand = 0;
+
+ /* Initializing reference list */
+ ps_ref_pic_list[0] = ps_slice_hdr->as_ref_pic_list0;
+ ps_ref_pic_list[1] = ps_slice_hdr->as_ref_pic_list1;
+ if(PSLICE == ps_slice_hdr->i1_slice_type)
+ ps_ref_pic_list[1] = ps_slice_hdr->as_ref_pic_list0;
+
+ ps_cur_pic_buf_l0 = (pic_buf_t *)((ps_ref_pic_list[0][ps_pu->mv.i1_l0_ref_idx].pv_pic_buf));
+ ps_cur_pic_buf_l1 = (pic_buf_t *)((ps_ref_pic_list[1][ps_pu->mv.i1_l1_ref_idx].pv_pic_buf));
+
+ is_scaled_flag_list = 0;
+
+ part_pos_x = ps_pu->b4_pos_x << 2;
+ part_pos_y = ps_pu->b4_pos_y << 2;
+ part_wd = (ps_pu->b4_wd + 1) << 2;
+ part_ht = (ps_pu->b4_ht + 1) << 2;
+
+ /************************************************************/
+    /* Calculation of motion vector A from neighbors A0 and A1  */
+ /************************************************************/
+ {
+ nbr_avail[0] = 0;
+ nbr_avail[1] = 0;
+
+ /* Pointers to A0 and A1 */
+ {
+ WORD32 y_a0, y_a1;
+ WORD32 pu_idx_a0, pu_idx_a1;
+
+            /* TODO: y_a0, y_a1 are coded assuming left nbr pointer starts at PU */
+ y_a0 = (part_ht >> 2);
+ y_a1 = ((part_ht - 1) >> 2);
+
+ pu_idx_a0 = *(pu4_left_pu_idx + (y_a0 * left_nbr_4x4_strd));
+ pu_idx_a1 = *(pu4_left_pu_idx + (y_a1 * left_nbr_4x4_strd));
+
+ if(lb_avail && (!ps_pic_pu[pu_idx_a0].b1_intra_flag))
+ {
+ aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_a0];
+ num_nbrs++;
+ nbr_avail[0] = 1;
+ }
+ if(l_avail && (!ps_pic_pu[pu_idx_a1].b1_intra_flag))
+ {
+ aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_a1];
+ num_nbrs++;
+ nbr_avail[1] = 1;
+ }
+ }
+        /* Setting is_scaled_flag_list based on availability of A0 and A1 */
+ if((nbr_avail[0] == 1) || (nbr_avail[1]))
+ {
+ is_scaled_flag_list = 1;
+ }
+
+ avail_a_flag[0] = 0;
+ avail_a_flag[1] = 0;
+
+ /* L0 */
+ GET_MV_NBR_ST(ps_ref_pic_list, &avail_a_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_a[0], num_nbrs, 0);
+ if(0 == avail_a_flag[0])
+ {
+ GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_a_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_a[0], num_nbrs, 0);
+ }
+
+ /* L1 */
+ if(PRED_L0 != ps_pu->b2_pred_mode)
+ {
+ GET_MV_NBR_ST(ps_ref_pic_list, &avail_a_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_a[1], num_nbrs, 1);
+ if(0 == avail_a_flag[1])
+ {
+ GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_a_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_a[1], num_nbrs, 1);
+ }
+ }
+
+ l0_done_flag = (PRED_L1 == ps_pu->b2_pred_mode);
+ l1_done_flag = (PRED_L0 == ps_pu->b2_pred_mode);
+
+ if(avail_a_flag[0])
+ {
+ num_l0_mvp_cand++;
+ if(max_l0_mvp_cand == num_l0_mvp_cand)
+ {
+ ps_pred_mv->s_l0_mv = as_mv_a[0];
+ l0_done_flag = 1;
+ }
+ }
+ if(avail_a_flag[1])
+ {
+ num_l1_mvp_cand++;
+ if(max_l1_mvp_cand == num_l1_mvp_cand)
+ {
+ ps_pred_mv->s_l1_mv = as_mv_a[1];
+ l1_done_flag = 1;
+ }
+ }
+ if(l0_done_flag && l1_done_flag)
+ return;
+ }
+
+ /************************************************************/
+    /* Calculation of motion vector B from neighbors B0 and B1  */
+ /************************************************************/
+ {
+ WORD32 avail_b_flag[2];
+ mv_t as_mv_b[2];
+
+ /* Pointers to B0, B1 and B2 */
+ {
+ WORD32 x_b0, x_b1, x_b2;
+ WORD32 pu_idx_b0, pu_idx_b1, pu_idx_b2;
+
+            /* Relative co-ordinate of Xp, Yp w.r.t. CTB start works   */
+ /* as long as minCTB = 16 */
+ x_b0 = (part_pos_x + part_wd);
+ x_b1 = (part_pos_x + part_wd - 1);
+ x_b2 = (part_pos_x - 1);
+ /* Getting offset back to given pointer */
+ x_b0 = x_b0 - part_pos_x;
+ x_b1 = x_b1 - part_pos_x;
+ x_b2 = x_b2 - part_pos_x;
+
+            /* Below derivations assume that the top pointer */
+            /* points to the first pixel of the PU           */
+ pu_idx_b0 = *(pu4_top_pu_idx + (x_b0 >> 2));
+ pu_idx_b0 = pu_idx_b0 * tr_avail;
+ pu_idx_b1 = *(pu4_top_pu_idx + (x_b1 >> 2));
+ pu_idx_b1 = pu_idx_b1 * t_avail;
+            /* Not at CTB top boundary: use the */
+            /* top-left index passed in         */
+            if(part_pos_y)
+            {
+                pu_idx_b2 = *pu4_top_left_pu_idx;
+            }
+            else
+            {
+                /* At CTB top boundary: use the top row and */
+                /* add a correction to go to the top-left   */
+                pu_idx_b2 = *((pu4_top_pu_idx)+(x_b2 >> 2));
+            }
+ pu_idx_b2 = pu_idx_b2 * tl_avail;
+
+ num_nbrs = 0;
+ nbr_avail[0] = 0;
+ nbr_avail[1] = 0;
+ nbr_avail[2] = 0;
+
+ if(tr_avail && (!ps_pic_pu[pu_idx_b0].b1_intra_flag))
+ {
+ aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_b0];
+ num_nbrs++;
+ nbr_avail[0] = 1;
+ }
+ if(t_avail && (!ps_pic_pu[pu_idx_b1].b1_intra_flag))
+ {
+ aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_b1];
+ num_nbrs++;
+ nbr_avail[1] = 1;
+ }
+ if(tl_avail && (!ps_pic_pu[pu_idx_b2].b1_intra_flag))
+ {
+ aps_nbr_pu[num_nbrs] = &ps_pic_pu[pu_idx_b2];
+ num_nbrs++;
+ nbr_avail[2] = 1;
+ }
+ }
+
+ /* L0 */
+ avail_b_flag[0] = 0;
+ avail_b_flag[1] = 0;
+
+ GET_MV_NBR_ST(ps_ref_pic_list, &avail_b_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_b[0], num_nbrs, 0);
+
+ /* L1 */
+ if(PRED_L0 != ps_pu->b2_pred_mode)
+ {
+            /* L1 Short Term */
+ GET_MV_NBR_ST(ps_ref_pic_list, &avail_b_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_b[1], num_nbrs, 1);
+ }
+
+ if(avail_b_flag[0])
+ {
+ if(((0 == num_l0_mvp_cand)
+ || (as_mv_a[0].i2_mvx != as_mv_b[0].i2_mvx)
+ || (as_mv_a[0].i2_mvy != as_mv_b[0].i2_mvy)))
+ {
+ num_l0_mvp_cand++;
+ if(max_l0_mvp_cand == num_l0_mvp_cand)
+ {
+ ps_pred_mv->s_l0_mv = as_mv_b[0];
+ l0_done_flag = 1;
+ }
+ }
+ }
+ if(avail_b_flag[1])
+ {
+ if(((0 == num_l1_mvp_cand)
+ || (as_mv_a[1].i2_mvx != as_mv_b[1].i2_mvx)
+ || (as_mv_a[1].i2_mvy != as_mv_b[1].i2_mvy)))
+ {
+ num_l1_mvp_cand++;
+ if(max_l1_mvp_cand == num_l1_mvp_cand)
+ {
+ ps_pred_mv->s_l1_mv = as_mv_b[1];
+ l1_done_flag = 1;
+ }
+ }
+ }
+ if(l0_done_flag && l1_done_flag)
+ return;
+
+ if((is_scaled_flag_list == 0) && (avail_b_flag[0] == 1))
+ {
+ avail_a_flag[0] = 1;
+ as_mv_a[0] = as_mv_b[0];
+ }
+ if((is_scaled_flag_list == 0) && (avail_b_flag[1] == 1))
+ {
+ avail_a_flag[1] = 1;
+ as_mv_a[1] = as_mv_b[1];
+ }
+
+ if(0 == is_scaled_flag_list)
+ {
+ avail_b_flag[0] = avail_b_flag[1] = 0;
+
+ GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_b_flag[0], ps_cur_pic_buf_l0, aps_nbr_pu, &as_mv_b[0], num_nbrs, 0);
+
+ if(PRED_L0 != ps_pu->b2_pred_mode)
+ {
+ GET_MV_NBR_LT(ps_ref_pic_list, ps_slice_hdr, &avail_b_flag[1], ps_cur_pic_buf_l1, aps_nbr_pu, &as_mv_b[1], num_nbrs, 1);
+ }
+
+ if(avail_b_flag[0])
+ {
+ if(((0 == num_l0_mvp_cand)
+ || (as_mv_a[0].i2_mvx != as_mv_b[0].i2_mvx)
+ || (as_mv_a[0].i2_mvy != as_mv_b[0].i2_mvy)))
+ {
+ num_l0_mvp_cand++;
+ if(max_l0_mvp_cand == num_l0_mvp_cand)
+ {
+ ps_pred_mv->s_l0_mv = as_mv_b[0];
+ l0_done_flag = 1;
+ }
+ }
+ }
+ if(avail_b_flag[1])
+ {
+ if(((0 == num_l1_mvp_cand)
+ || (as_mv_a[1].i2_mvx != as_mv_b[1].i2_mvx)
+ || (as_mv_a[1].i2_mvy != as_mv_b[1].i2_mvy)))
+ {
+ num_l1_mvp_cand++;
+ if(max_l1_mvp_cand == num_l1_mvp_cand)
+ {
+ ps_pred_mv->s_l1_mv = as_mv_b[1];
+ l1_done_flag = 1;
+ }
+ }
+ }
+ if(l0_done_flag && l1_done_flag)
+ return;
+ }
+ /***********************************************************/
+ /* Collocated MV prediction */
+ /***********************************************************/
+#if 1
+ if((2 != num_l0_mvp_cand) || (2 != num_l1_mvp_cand))
+ {
+ mv_t as_mv_col[2], s_mv_col_l0, s_mv_col_l1;
+ WORD32 avail_col_flag[2] = { 0 };
+ WORD32 x_col, y_col, avail_col_l0, avail_col_l1;
+ x_col = part_pos_x + part_wd;
+ y_col = part_pos_y + part_ht;
+ ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 1, x_col, y_col);
+
+ avail_col_l0 = avail_col_flag[0];
+ avail_col_l1 = avail_col_flag[1];
+ if(avail_col_l0 || avail_col_l1)
+ {
+ s_mv_col_l0 = as_mv_col[0];
+ s_mv_col_l1 = as_mv_col[1];
+ }
+
+ if(avail_col_l0 == 0 || avail_col_l1 == 0)
+ {
+ /* Checking Collocated MV availability at Center of PU */
+ x_col = part_pos_x + (part_wd >> 1);
+ y_col = part_pos_y + (part_ht >> 1);
+ ihevcd_collocated_mvp(ps_mv_ctxt, ps_pu, as_mv_col, avail_col_flag, 1, x_col, y_col);
+
+ if(avail_col_l0 == 0)
+ {
+ s_mv_col_l0 = as_mv_col[0];
+ }
+ if(avail_col_l1 == 0)
+ {
+ s_mv_col_l1 = as_mv_col[1];
+ }
+
+ avail_col_l0 |= avail_col_flag[0];
+ avail_col_l1 |= avail_col_flag[1];
+ }
+
+        /* Add collocated MV as a candidate and check if the mvp index selects it */
+ if(avail_col_l0)
+ {
+ if(2 != num_l0_mvp_cand)
+ {
+ num_l0_mvp_cand++;
+ if(max_l0_mvp_cand == num_l0_mvp_cand)
+ {
+ ps_pred_mv->s_l0_mv = s_mv_col_l0;
+ l0_done_flag = 1;
+ }
+ }
+ }
+ if(avail_col_l1)
+ {
+ if(2 != num_l1_mvp_cand)
+ {
+ num_l1_mvp_cand++;
+ if(max_l1_mvp_cand == num_l1_mvp_cand)
+ {
+ ps_pred_mv->s_l1_mv = s_mv_col_l1;
+ l1_done_flag = 1;
+ }
+ }
+ }
+ if(l0_done_flag && l1_done_flag)
+ return;
+ }
+#endif
+
+ if(0 == l0_done_flag)
+ {
+ ps_pred_mv->s_l0_mv.i2_mvx = 0;
+ ps_pred_mv->s_l0_mv.i2_mvy = 0;
+ }
+ if(0 == l1_done_flag)
+ {
+ ps_pred_mv->s_l1_mv.i2_mvx = 0;
+ ps_pred_mv->s_l1_mv.i2_mvy = 0;
+ }
+ }
+}
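The neighbor coordinate arithmetic above can be checked in isolation. A minimal standalone sketch follows (illustrative only: plain int replaces the codec's WORD32 typedefs, and the PU geometry is an assumed example):

/* Illustrative sketch (not part of the decoder source): 4x4-grid offsets of
 * the spatial AMVP neighbors A0/A1/B0/B1/B2, mirroring the coordinate
 * arithmetic in ihevcd_mv_pred() above. */
#include <stdio.h>

int main(void)
{
    int part_wd = 16, part_ht = 8;  /* assumed example PU size in pels  */

    int y_a0 = part_ht >> 2;        /* A0: just below the bottom-left   */
    int y_a1 = (part_ht - 1) >> 2;  /* A1: at the bottom-left corner    */

    int x_b0 = part_wd >> 2;        /* B0: just right of the top-right  */
    int x_b1 = (part_wd - 1) >> 2;  /* B1: at the top-right corner      */
    int x_b2 = -1 >> 2;             /* B2: one pel left of the PU; the  */
                                    /* arithmetic shift keeps this -1   */

    printf("A0 row %d, A1 row %d, B0 col %d, B1 col %d, B2 col %d\n",
           y_a0, y_a1, x_b0, x_b1, x_b2); /* A0 2, A1 1, B0 4, B1 3, B2 -1 */
    return 0;
}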
diff --git a/decoder/ihevcd_mv_pred.h b/decoder/ihevcd_mv_pred.h
new file mode 100644
index 0000000..b349e58
--- /dev/null
+++ b/decoder/ihevcd_mv_pred.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ *  ihevcd_mv_pred.h
+ *
+ * @brief
+ *  Declarations of motion vector prediction functions
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef IHEVCD_MV_PRED_H_
+#define IHEVCD_MV_PRED_H_
+void ihevcd_mv_pred(mv_ctxt_t *ps_mv_ctxt,
+ UWORD32 *pu4_top_pu_idx,
+ UWORD32 *pu4_left_pu_idx,
+ UWORD32 *pu4_top_left_pu_idx,
+ WORD32 left_nbr_4x4_strd,
+ pu_t *ps_pu,
+ WORD32 lb_avail,
+ WORD32 l_avail,
+ WORD32 tr_avail,
+ WORD32 t_avail,
+ WORD32 tl_avail,
+ pu_mv_t *ps_pred_mv);
+void ihevcd_scale_mv(mv_t *ps_mv,
+ WORD32 cur_ref_poc,
+ WORD32 nbr_ref_poc,
+ WORD32 cur_poc);
+
+
+#endif /* IHEVCD_MV_PRED_H_ */
diff --git a/decoder/ihevcd_nal.c b/decoder/ihevcd_nal.c
new file mode 100644
index 0000000..cf2208f
--- /dev/null
+++ b/decoder/ihevcd_nal.c
@@ -0,0 +1,458 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_nal.c
+*
+* @brief
+*  Contains NAL-level functions such as start code search
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_parse_headers.h"
+#include "ihevcd_parse_slice.h"
+#include "ihevcd_debug.h"
+/*****************************************************************************/
+/* Function Prototypes */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief
+* Search start code from the given buffer pointer
+*
+* @par Description:
+*  Search for a start code and return its offset if one is found. If no
+*  start code is found till the end of the given bitstream, treat it as an
+*  invalid NAL and return the end of the buffer as the offset
+*
+* @param[in] pu1_buf
+* Pointer to bitstream
+*
+* @param[in] bytes_remaining
+* Number of bytes remaining in the buffer
+*
+* @returns Offset to the first byte in NAL after start code
+*
+* @remarks
+*  Incomplete start code at the end of the input bitstream is not handled.
+*  This has to be taken care of outside this function
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_nal_search_start_code(UWORD8 *pu1_buf, WORD32 bytes_remaining)
+{
+ WORD32 ofst;
+
+ WORD32 zero_byte_cnt;
+ WORD32 start_code_found;
+
+ ofst = -1;
+
+ zero_byte_cnt = 0;
+ start_code_found = 0;
+ while(ofst < (bytes_remaining - 1))
+ {
+ ofst++;
+ if(pu1_buf[ofst] != 0)
+ {
+ zero_byte_cnt = 0;
+ continue;
+ }
+
+ zero_byte_cnt++;
+ if((pu1_buf[ofst + 1] == START_CODE_PREFIX_BYTE) &&
+ (zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE))
+ {
+ /* Found the start code */
+ ofst++;
+ start_code_found = 1;
+ break;
+ }
+ }
+ if(0 == start_code_found)
+ {
+ if((START_CODE_PREFIX_BYTE == pu1_buf[ofst]) &&
+ (zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE))
+ {
+ /* Found a start code at the end*/
+ ofst++;
+ }
+ }
+ /* Since ofst started at -1, increment it by 1 */
+ ofst++;
+
+ return ofst;
+}
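The scanning rule above (at least NUM_ZEROS_BEFORE_START_CODE zero bytes followed by START_CODE_PREFIX_BYTE) can be exercised standalone. A minimal sketch, with the two constants inlined as literals and plain C types in place of the codec typedefs:

/* Illustrative sketch (not part of the decoder source): the start code
 * scanning rule of ihevcd_nal_search_start_code() above. */
#include <stdio.h>

static int find_start_code(const unsigned char *buf, int len)
{
    int zeros = 0;
    for(int i = 0; i < len - 1; i++)
    {
        if(buf[i] != 0)
        {
            zeros = 0;
            continue;
        }
        zeros++;
        /* Two or more zeros followed by 0x01 form a start code */
        if(zeros >= 2 && buf[i + 1] == 1)
            return i + 2;  /* first byte of the NAL after the start code */
    }
    return len;  /* no start code: treat the whole buffer as one NAL */
}

int main(void)
{
    const unsigned char bs[] = { 0x12, 0x00, 0x00, 0x00, 0x01, 0x40, 0x01 };
    printf("payload at offset %d\n", find_start_code(bs, (int)sizeof(bs)));
    return 0;  /* prints: payload at offset 5 */
}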
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Removes emulation prevention bytes present in the bitstream till the next
+*  start code is found. The data with emulation prevention bytes removed is
+*  stored in a different buffer
+*
+* @par Description:
+*  Assumption is that the first start code is already found and pu1_src is
+*  pointing to a byte after the start code. Search for the next NAL's start
+*  code and return once it is found. Remove any emulation prevention bytes
+*  present and copy the data to a new buffer. If no start code is found,
+*  treat the complete buffer as one NAL.
+*
+* @param[in] pu1_src
+*  Pointer to bitstream (excludes the initial start code)
+*
+* @param[in] pu1_dst
+* Pointer to destination buffer
+*
+* @param[in] bytes_remaining
+* Number of bytes remaining
+*
+* @param[out] pi4_nal_len
+* NAL length (length of bitstream parsed)
+*
+* @param[out] pi4_dst_len
+* Destination bitstream size (length of bitstream parsed with emulation bytes
+* removed)
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*  Incomplete start code at the end of the input bitstream is not handled.
+*  This has to be taken care of outside this function
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_nal_remv_emuln_bytes(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 bytes_remaining,
+ WORD32 *pi4_nal_len,
+ WORD32 *pi4_dst_len)
+{
+ WORD32 src_cnt;
+ WORD32 dst_cnt;
+ WORD32 zero_byte_cnt;
+ WORD32 start_code_found;
+ UWORD8 u1_src;
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+ src_cnt = 0;
+ dst_cnt = 0;
+ zero_byte_cnt = 0;
+ start_code_found = 0;
+ while(src_cnt < (bytes_remaining - 1))
+ {
+ u1_src = pu1_src[src_cnt++];
+
+ pu1_dst[dst_cnt++] = u1_src;
+ if(u1_src != 0)
+ {
+ zero_byte_cnt = 0;
+ continue;
+ }
+
+ zero_byte_cnt++;
+ if(zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE)
+ {
+ u1_src = pu1_src[src_cnt];
+ if(START_CODE_PREFIX_BYTE == u1_src)
+ {
+ /* Found the start code */
+ src_cnt -= zero_byte_cnt;
+ dst_cnt -= zero_byte_cnt;
+ start_code_found = 1;
+ break;
+ }
+ else if(EMULATION_PREVENT_BYTE == u1_src)
+ {
+ /* Found the emulation prevention byte */
+ src_cnt++;
+ zero_byte_cnt = 0;
+
+                /* The emulation prevention byte is only peeked at and
+                 * skipped via src_cnt++ above; it was never copied to dst,
+                 * so dst_cnt needs no adjustment here
+                 */
+ }
+ }
+
+ }
+
+ if(0 == start_code_found)
+ {
+ u1_src = pu1_src[src_cnt++];
+ if(zero_byte_cnt >= NUM_ZEROS_BEFORE_START_CODE)
+ {
+
+ if(START_CODE_PREFIX_BYTE == u1_src)
+ {
+ /* Found a start code at the end*/
+ src_cnt -= zero_byte_cnt;
+ }
+ else if(EMULATION_PREVENT_BYTE == u1_src)
+ {
+ /* Found the emulation prevention byte at the end*/
+ src_cnt++;
+ /* Decrement dst_cnt so that the next byte overwrites
+ * the emulation prevention byte already copied to dst above
+ */
+ dst_cnt--;
+ }
+ }
+ else
+ {
+ pu1_dst[dst_cnt++] = u1_src;
+ }
+
+
+ }
+ *pi4_nal_len = src_cnt;
+ *pi4_dst_len = dst_cnt;
+ return ret;
+}
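A minimal standalone sketch of the escape removal rule above; it is simplified in that it processes the whole buffer and does not stop at the next start code as ihevcd_nal_remv_emuln_bytes() does:

/* Illustrative sketch (not part of the decoder source): emulation prevention
 * byte removal; a 0x03 byte following two zero bytes is dropped. */
#include <stdio.h>

static int remove_epb(const unsigned char *src, int len, unsigned char *dst)
{
    int zeros = 0, d = 0;
    for(int s = 0; s < len; s++)
    {
        if(zeros >= 2 && src[s] == 3)
        {
            /* Drop the escape byte; the zero run restarts after it */
            zeros = 0;
            continue;
        }
        zeros = (src[s] == 0) ? zeros + 1 : 0;
        dst[d++] = src[s];
    }
    return d;
}

int main(void)
{
    /* 00 00 03 01 -> 00 00 01 after removal */
    const unsigned char in[] = { 0x40, 0x00, 0x00, 0x03, 0x01, 0x7F };
    unsigned char out[sizeof(in)];
    int n = remove_epb(in, (int)sizeof(in), out);
    for(int i = 0; i < n; i++)
        printf("%02X ", out[i]);  /* prints: 40 00 00 01 7F */
    printf("\n");
    return 0;
}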
+/**
+*******************************************************************************
+*
+* @brief
+* Decode given NAL unit's header
+*
+* @par Description:
+*  Parse the NAL unit header as per Section 7.3.1.2
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream context
+*
+* @param[out] ps_nal
+* Pointer to NAL header
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_nal_unit_header(bitstrm_t *ps_bitstrm, nal_header_t *ps_nal)
+{
+ WORD32 unused;
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ UNUSED(unused);
+ /* Syntax : forbidden_zero_bit */
+ unused = ihevcd_bits_get(ps_bitstrm, 1);
+
+ /* Syntax : nal_unit_type */
+ ps_nal->i1_nal_unit_type = ihevcd_bits_get(ps_bitstrm, 6);
+
+ /* Syntax : nuh_reserved_zero_6bits */
+ unused = ihevcd_bits_get(ps_bitstrm, 6);
+
+ /* Syntax : nuh_temporal_id_plus1 */
+ ps_nal->i1_nuh_temporal_id = ihevcd_bits_get(ps_bitstrm, 3) - 1;
+
+ return ret;
+
+}
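The two-byte header layout parsed above can be shown with plain shifts. A minimal sketch, assuming the example bytes 0x40 0x01 (a typical VPS header):

/* Illustrative sketch (not part of the decoder source): the 16-bit HEVC NAL
 * unit header layout parsed by ihevcd_nal_unit_header() above. */
#include <stdio.h>

int main(void)
{
    unsigned char b0 = 0x40, b1 = 0x01;  /* assumed example: a VPS header */

    int forbidden_zero_bit = (b0 >> 7) & 0x1;
    int nal_unit_type      = (b0 >> 1) & 0x3F;              /* 6 bits */
    int nuh_reserved_6bits = ((b0 & 0x1) << 5) | (b1 >> 3); /* 6 bits */
    int nuh_temporal_id    = (b1 & 0x7) - 1;  /* temporal_id_plus1 - 1 */

    printf("type %d, reserved %d, tid %d, forbidden %d\n",
           nal_unit_type, nuh_reserved_6bits, nuh_temporal_id,
           forbidden_zero_bit); /* prints: type 32, reserved 0, tid 0, forbidden 0 */
    return 0;
}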
+
+/**
+*******************************************************************************
+*
+* @brief
+* Decode given NAL
+*
+* @par Description:
+*  Based on the NAL type, call the appropriate decode function as per
+*  Section 7.3.1.1
+*
+*
+* @param[in,out] ps_codec
+* Pointer to codec context (Functions called within will modify contents of
+* ps_codec)
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_nal_unit(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+ /* NAL Header */
+ nal_header_t s_nal;
+
+ ret = ihevcd_nal_unit_header(&ps_codec->s_parse.s_bitstrm, &s_nal);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ if(ps_codec->i4_slice_error)
+ s_nal.i1_nal_unit_type = ps_codec->s_parse.ps_slice_hdr->i1_nal_unit_type;
+
+ /* Setting RASL Output flag */
+ switch(s_nal.i1_nal_unit_type)
+ {
+ case NAL_BLA_W_LP :
+ case NAL_BLA_W_DLP :
+ case NAL_BLA_N_LP :
+ ps_codec->i4_rasl_output_flag = 0;
+ break;
+
+ //TODO: After IDR, there is no case of open GOP
+ //To be fixed appropriately by ignoring RASL only if the
+ // required references are not found
+ case NAL_IDR_W_LP :
+ case NAL_IDR_N_LP :
+ ps_codec->i4_rasl_output_flag = 1;
+ break;
+
+ case NAL_CRA :
+ ps_codec->i4_rasl_output_flag = (0 == ps_codec->u4_pic_cnt) ? 0 : 1;
+ break;
+
+ default:
+ break;
+ }
+
+ switch(s_nal.i1_nal_unit_type)
+ {
+ case NAL_BLA_W_LP :
+ case NAL_BLA_W_DLP :
+ case NAL_BLA_N_LP :
+ case NAL_IDR_W_LP :
+ case NAL_IDR_N_LP :
+ case NAL_CRA :
+ case NAL_TRAIL_N :
+ case NAL_TRAIL_R :
+ case NAL_TSA_N :
+ case NAL_TSA_R :
+ case NAL_STSA_N :
+ case NAL_STSA_R :
+ case NAL_RADL_N :
+ case NAL_RADL_R :
+ case NAL_RASL_N :
+ case NAL_RASL_R :
+ if(ps_codec->i4_header_mode)
+ return IHEVCD_SLICE_IN_HEADER_MODE;
+
+ if((0 == ps_codec->i4_sps_done) ||
+ (0 == ps_codec->i4_pps_done))
+ {
+ return IHEVCD_INVALID_HEADER;
+ }
+
+ ps_codec->i4_header_in_slice_mode = 0;
+
+ ret = ihevcd_parse_slice_header(ps_codec, &s_nal);
+ DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+ if(ret == (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ {
+ if((s_nal.i1_nal_unit_type != NAL_RASL_N && s_nal.i1_nal_unit_type != NAL_RASL_R) ||
+ ps_codec->i4_rasl_output_flag ||
+ ps_codec->i4_slice_error)
+ ret = ihevcd_parse_slice_data(ps_codec);
+ }
+ break;
+
+ case NAL_VPS :
+ // ret = ihevcd_parse_vps(ps_codec);
+ DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+ break;
+
+ case NAL_SPS :
+ if(0 == ps_codec->i4_header_mode)
+ {
+ ps_codec->i4_header_in_slice_mode = 1;
+ if(ps_codec->i4_sps_done &&
+ ps_codec->i4_pic_present)
+ break;
+ }
+
+ ret = ihevcd_parse_sps(ps_codec);
+ if(ret == (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ {
+ sps_t *ps_sps = ps_codec->ps_sps_base + MAX_SPS_CNT - 1;
+ ihevcd_copy_sps(ps_codec, ps_sps->i1_sps_id, MAX_SPS_CNT - 1);
+ }
+
+ DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+ break;
+
+ case NAL_PPS :
+ if(0 == ps_codec->i4_header_mode)
+ {
+ ps_codec->i4_header_in_slice_mode = 1;
+ if(ps_codec->i4_pps_done &&
+ ps_codec->i4_pic_present)
+ break;
+ }
+
+ ret = ihevcd_parse_pps(ps_codec);
+ if(ret == (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ {
+ pps_t *ps_pps = ps_codec->ps_pps_base + MAX_PPS_CNT - 1;
+ ihevcd_copy_pps(ps_codec, ps_pps->i1_pps_id, MAX_PPS_CNT - 1);
+ }
+
+ DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+ break;
+
+ default:
+ DEBUG_PRINT_NAL_INFO(ps_codec, s_nal.i1_nal_unit_type);
+ break;
+ }
+
+ return ret;
+}
+
diff --git a/decoder/ihevcd_nal.h b/decoder/ihevcd_nal.h
new file mode 100644
index 0000000..b7d09d0
--- /dev/null
+++ b/decoder/ihevcd_nal.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_nal.h
+*
+* @brief
+*  Header for NAL related functions
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_NAL_H_
+#define _IHEVCD_NAL_H_
+/**
+ * Minimum size of start code including NAL type
+ */
+
+#define MIN_START_CODE_LEN 4
+/**
+ * Start code prefix byte - 1
+ */
+#define START_CODE_PREFIX_BYTE 1
+
+/**
+ * Emulation prevention byte - 3
+ */
+
+#define EMULATION_PREVENT_BYTE 3
+/**
+ * Minimum number of zeros before start code
+ */
+#define NUM_ZEROS_BEFORE_START_CODE 2
+
+
+WORD32 ihevcd_nal_search_start_code(UWORD8 *pu1_buf, WORD32 bytes_remaining);
+
+IHEVCD_ERROR_T ihevcd_nal_remv_emuln_bytes(UWORD8 *pu1_src,
+ UWORD8 *pu1_dst,
+ WORD32 bytes_remaining,
+ WORD32 *pi4_nal_len,
+ WORD32 *pi4_dst_len);
+
+IHEVCD_ERROR_T ihevcd_nal_unit(codec_t *ps_codec);
+#endif /* _IHEVCD_NAL_H_ */
diff --git a/decoder/ihevcd_parse_headers.c b/decoder/ihevcd_parse_headers.c
new file mode 100644
index 0000000..76240f9
--- /dev/null
+++ b/decoder/ihevcd_parse_headers.c
@@ -0,0 +1,2267 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_headers.c
+*
+* @brief
+* Contains functions for parsing headers
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_quant_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_parse_headers.h"
+#include "ihevcd_ref_list.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+
+#define COPY_DEFAULT_SCALING_LIST(pi2_scaling_mat) \
+{ \
+ WORD32 scaling_mat_offset[]={0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992, 1248, 1504, 1760, 2016, 3040}; \
+ \
+ /* scaling matrix for 4x4 */ \
+ memcpy(pi2_scaling_mat, gi2_flat_scale_mat_32x32, 6*16*sizeof(WORD16)); \
+    /* scaling matrix for 8x8 */ \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[6], gi2_intra_default_scale_mat_8x8, 64*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[7], gi2_intra_default_scale_mat_8x8, 64*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[8], gi2_intra_default_scale_mat_8x8, 64*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[9], gi2_inter_default_scale_mat_8x8, 64*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[10], gi2_inter_default_scale_mat_8x8, 64*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[11], gi2_inter_default_scale_mat_8x8, 64*sizeof(WORD16)); \
+ /* scaling matrix for 16x16 */ \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[12], gi2_intra_default_scale_mat_16x16, 256*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[13], gi2_intra_default_scale_mat_16x16, 256*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[14], gi2_intra_default_scale_mat_16x16, 256*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[15], gi2_inter_default_scale_mat_16x16, 256*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[16], gi2_inter_default_scale_mat_16x16, 256*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[17], gi2_inter_default_scale_mat_16x16, 256*sizeof(WORD16)); \
+ /* scaling matrix for 32x32 */ \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[18], gi2_intra_default_scale_mat_32x32, 1024*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[19], gi2_inter_default_scale_mat_32x32, 1024*sizeof(WORD16)); \
+}
+
+#define COPY_FLAT_SCALING_LIST(pi2_scaling_mat) \
+{ \
+ WORD32 scaling_mat_offset[]={0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992, 1248, 1504, 1760, 2016, 3040}; \
+ \
+ /* scaling matrix for 4x4 */ \
+ memcpy(pi2_scaling_mat, gi2_flat_scale_mat_32x32, 6*16*sizeof(WORD16)); \
+ /* scaling matrix for 8x8 */ \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[6], gi2_flat_scale_mat_32x32, 6*64*sizeof(WORD16)); \
+ /* scaling matrix for 16x16 */ \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[12], gi2_flat_scale_mat_32x32, 3*256*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[15], gi2_flat_scale_mat_32x32, 3*256*sizeof(WORD16)); \
+ /* scaling matrix for 32x32 */ \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[18], gi2_flat_scale_mat_32x32, 1024*sizeof(WORD16)); \
+ memcpy(pi2_scaling_mat + scaling_mat_offset[19], gi2_flat_scale_mat_32x32, 1024*sizeof(WORD16)); \
+}
+
+/* Function declarations */
+
+#if 0
+/**
+*******************************************************************************
+*
+* @brief
+* Parses VPS operation point
+*
+* @par Description
+* Parses VPS operation point as per section 7.3.5
+*
+* @param[out] ps_vps
+* Pointer to VPS structure
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] ops_idx
+* Operating point index
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_operation_point_set( vps_t *ps_vps, bitstrm_t *ps_bitstrm, WORD32 ops_idx)
+{
+ WORD32 i;
+ WORD32 value;
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ for( i = 0; i <= ps_vps->i1_vps_max_nuh_reserved_zero_layer_id; i++ )
+ {
+ BITS_PARSE("list_entry_l0[ i ]", value, ps_bitstrm, 1);
+ //ps_vps->ai1_layer_id_included_flag[ops_idx][i] = value;
+
+ }
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Parses pic_lismod_t (picture list modification syntax) as per
+*  Section 7.3.8.3: Reference picture list modification syntax
+*
+* @par Description:
+*  Parse picture list modification syntax and update the pic_lismod_t struct
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_ref_pic_list_modification(bitstrm_t *ps_bitstrm,
+ slice_header_t *ps_slice_hdr,
+ WORD32 num_poc_total_curr)
+{
+ WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ WORD32 i;
+ rplm_t *ps_rplm;
+ WORD32 num_bits_list_entry;
+
+ ps_rplm = &(ps_slice_hdr->s_rplm);
+
+ /* Calculate Ceil(Log2(num_poc_total_curr)) */
+ {
+ num_bits_list_entry = 32 - CLZ(num_poc_total_curr);
+ /* Check if num_poc_total_curr is power of 2 */
+ if(0 == (num_poc_total_curr & (num_poc_total_curr - 1)))
+ {
+ num_bits_list_entry--;
+ }
+ }
+
+ if(ps_slice_hdr->i1_slice_type == PSLICE || ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+ BITS_PARSE("ref_pic_list_modification_flag_l0", value, ps_bitstrm, 1);
+ ps_rplm->i1_ref_pic_list_modification_flag_l0 = value;
+
+ if(ps_rplm->i1_ref_pic_list_modification_flag_l0)
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ BITS_PARSE("list_entry_l0", value, ps_bitstrm, num_bits_list_entry);
+ ps_rplm->i1_list_entry_l0[i] = value;
+
+ ps_rplm->i1_list_entry_l0[i] = CLIP3(ps_rplm->i1_list_entry_l0[i], 0, num_poc_total_curr - 1);
+ }
+ }
+
+ if(ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+ BITS_PARSE("ref_pic_list_modification_flag_l1", value, ps_bitstrm, 1);
+ ps_rplm->i1_ref_pic_list_modification_flag_l1 = value;
+
+ if(ps_rplm->i1_ref_pic_list_modification_flag_l1)
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ BITS_PARSE("list_entry_l1", value, ps_bitstrm, num_bits_list_entry);
+ ps_rplm->i1_list_entry_l1[i] = value;
+
+ ps_rplm->i1_list_entry_l1[i] = CLIP3(ps_rplm->i1_list_entry_l1[i], 0, num_poc_total_curr - 1);
+ }
+
+ }
+
+ return ret;
+}
+#endif
+/**
+*******************************************************************************
+*
+* @brief
+* Parses Prediction weight table syntax
+*
+* @par Description:
+* Parse Prediction weight table syntax as per Section: 7.3.8.4
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream context
+*
+* @param[in] ps_sps
+* Current SPS
+*
+* @param[in] ps_pps
+* Current PPS
+*
+* @param[in] ps_slice_hdr
+* Current Slice header
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_parse_pred_wt_ofst(bitstrm_t *ps_bitstrm,
+ sps_t *ps_sps,
+ pps_t *ps_pps,
+ slice_header_t *ps_slice_hdr)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ WORD32 i;
+ UNUSED(ps_pps);
+ pred_wt_ofst_t *ps_wt_ofst = &ps_slice_hdr->s_wt_ofst;
+
+ UEV_PARSE("luma_log2_weight_denom", value, ps_bitstrm);
+ ps_wt_ofst->i1_luma_log2_weight_denom = value;
+
+ if(ps_sps->i1_chroma_format_idc != 0)
+ {
+ SEV_PARSE("delta_chroma_log2_weight_denom", value, ps_bitstrm);
+ ps_wt_ofst->i1_chroma_log2_weight_denom = ps_wt_ofst->i1_luma_log2_weight_denom + value;
+ }
+
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ BITS_PARSE("luma_weight_l0_flag[ i ]", value, ps_bitstrm, 1);
+ ps_wt_ofst->i1_luma_weight_l0_flag[i] = value;
+ }
+
+
+
+ if(ps_sps->i1_chroma_format_idc != 0)
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ BITS_PARSE("chroma_weight_l0_flag[ i ]", value, ps_bitstrm, 1);
+ ps_wt_ofst->i1_chroma_weight_l0_flag[i] = value;
+ }
+ }
+ else
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ ps_wt_ofst->i1_chroma_weight_l0_flag[i] = 0;
+ }
+ }
+
+
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ if(ps_wt_ofst->i1_luma_weight_l0_flag[i])
+ {
+ SEV_PARSE("delta_luma_weight_l0[ i ]", value, ps_bitstrm);
+
+
+ ps_wt_ofst->i2_luma_weight_l0[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom) + value;
+
+ SEV_PARSE("luma_offset_l0[ i ]", value, ps_bitstrm);
+ ps_wt_ofst->i2_luma_offset_l0[i] = value;
+
+ }
+ else
+ {
+ ps_wt_ofst->i2_luma_weight_l0[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom);
+ ps_wt_ofst->i2_luma_offset_l0[i] = 0;
+ }
+ if(ps_wt_ofst->i1_chroma_weight_l0_flag[i])
+ {
+ WORD32 ofst;
+ WORD32 shift = (1 << (BIT_DEPTH_CHROMA - 1));
+ SEV_PARSE("delta_chroma_weight_l0[ i ][ j ]", value, ps_bitstrm);
+ ps_wt_ofst->i2_chroma_weight_l0_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+ SEV_PARSE("delta_chroma_offset_l0[ i ][ j ]", value, ps_bitstrm);
+ ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l0_cb[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+ ofst = value - ofst + shift;
+
+ ps_wt_ofst->i2_chroma_offset_l0_cb[i] = CLIP_S8(ofst);
+
+ SEV_PARSE("delta_chroma_weight_l0[ i ][ j ]", value, ps_bitstrm);
+ ps_wt_ofst->i2_chroma_weight_l0_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+ SEV_PARSE("delta_chroma_offset_l0[ i ][ j ]", value, ps_bitstrm);
+ ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l0_cr[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+ ofst = value - ofst + shift;
+
+ ps_wt_ofst->i2_chroma_offset_l0_cr[i] = CLIP_S8(ofst);
+
+ }
+ else
+ {
+ ps_wt_ofst->i2_chroma_weight_l0_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+ ps_wt_ofst->i2_chroma_weight_l0_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+
+ ps_wt_ofst->i2_chroma_offset_l0_cb[i] = 0;
+ ps_wt_ofst->i2_chroma_offset_l0_cr[i] = 0;
+ }
+ }
+ if(BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ BITS_PARSE("luma_weight_l1_flag[ i ]", value, ps_bitstrm, 1);
+ ps_wt_ofst->i1_luma_weight_l1_flag[i] = value;
+ }
+
+ if(ps_sps->i1_chroma_format_idc != 0)
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ BITS_PARSE("chroma_weight_l1_flag[ i ]", value, ps_bitstrm, 1);
+ ps_wt_ofst->i1_chroma_weight_l1_flag[i] = value;
+ }
+ }
+ else
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ ps_wt_ofst->i1_chroma_weight_l1_flag[i] = 0;
+ }
+ }
+
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ if(ps_wt_ofst->i1_luma_weight_l1_flag[i])
+ {
+ SEV_PARSE("delta_luma_weight_l1[ i ]", value, ps_bitstrm);
+
+
+ ps_wt_ofst->i2_luma_weight_l1[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom) + value;
+
+ SEV_PARSE("luma_offset_l1[ i ]", value, ps_bitstrm);
+ ps_wt_ofst->i2_luma_offset_l1[i] = value;
+
+ }
+ else
+ {
+ ps_wt_ofst->i2_luma_weight_l1[i] = (1 << ps_wt_ofst->i1_luma_log2_weight_denom);
+ ps_wt_ofst->i2_luma_offset_l1[i] = 0;
+ }
+
+ if(ps_wt_ofst->i1_chroma_weight_l1_flag[i])
+ {
+ WORD32 ofst;
+ WORD32 shift = (1 << (BIT_DEPTH_CHROMA - 1));
+ SEV_PARSE("delta_chroma_weight_l1[ i ][ j ]", value, ps_bitstrm);
+                ps_wt_ofst->i2_chroma_weight_l1_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+ SEV_PARSE("delta_chroma_offset_l1[ i ][ j ]", value, ps_bitstrm);
+ ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l1_cb[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+ ofst = value - ofst + shift;
+
+                ps_wt_ofst->i2_chroma_offset_l1_cb[i] = CLIP_S8(ofst);
+
+ SEV_PARSE("delta_chroma_weight_l1[ i ][ j ]", value, ps_bitstrm);
+ ps_wt_ofst->i2_chroma_weight_l1_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom) + value;
+
+
+ SEV_PARSE("delta_chroma_offset_l1[ i ][ j ]", value, ps_bitstrm);
+ ofst = ((shift * ps_wt_ofst->i2_chroma_weight_l1_cr[i]) >> ps_wt_ofst->i1_chroma_log2_weight_denom);
+ ofst = value - ofst + shift;
+
+                ps_wt_ofst->i2_chroma_offset_l1_cr[i] = CLIP_S8(ofst);
+
+ }
+ else
+ {
+ ps_wt_ofst->i2_chroma_weight_l1_cb[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+ ps_wt_ofst->i2_chroma_weight_l1_cr[i] = (1 << ps_wt_ofst->i1_chroma_log2_weight_denom);
+
+ ps_wt_ofst->i2_chroma_offset_l1_cb[i] = 0;
+ ps_wt_ofst->i2_chroma_offset_l1_cr[i] = 0;
+
+ }
+ }
+ }
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses short term reference picture set
+*
+* @par Description
+* Parses short term reference picture set as per section 7.3.8.2.
+* Can be called by either SPS or Slice header parsing modules.
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[out] ps_stref_picset_base
+* Pointer to first short term ref pic set structure
+*
+* @param[in] num_short_term_ref_pic_sets
+* Number of short term reference pic sets
+*
+* @param[in] idx
+* Current short term ref pic set id
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_short_term_ref_pic_set(bitstrm_t *ps_bitstrm,
+ stref_picset_t *ps_stref_picset_base,
+ WORD32 num_short_term_ref_pic_sets,
+ WORD32 idx,
+ stref_picset_t *ps_stref_picset)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ stref_picset_t *ps_stref_picset_ref;
+ WORD32 delta_idx, delta_rps;
+ WORD32 r_idx;
+ WORD32 i;
+ WORD32 j, k, temp;
+ if(idx > 0)
+ {
+ BITS_PARSE("inter_ref_pic_set_prediction_flag", value, ps_bitstrm, 1);
+ ps_stref_picset->i1_inter_ref_pic_set_prediction_flag = value;
+ }
+ else
+ ps_stref_picset->i1_inter_ref_pic_set_prediction_flag = 0;
+
+ if(ps_stref_picset->i1_inter_ref_pic_set_prediction_flag)
+ {
+ WORD32 delta_rps_sign;
+ WORD32 abs_delta_rps;
+ WORD32 num_neg_pics = 0;
+ WORD32 num_pos_pics = 0;
+ WORD32 num_pics = 0;
+
+ if(idx == num_short_term_ref_pic_sets)
+ {
+ UEV_PARSE("delta_idx_minus1", value, ps_bitstrm);
+ delta_idx = value + 1;
+ }
+ else
+ {
+ delta_idx = 1;
+ }
+ r_idx = idx - delta_idx;
+ r_idx = CLIP3(r_idx, 0, idx - 1);
+
+ ps_stref_picset_ref = ps_stref_picset_base + r_idx;
+
+ BITS_PARSE("delta_rps_sign", value, ps_bitstrm, 1);
+ delta_rps_sign = value;
+
+ UEV_PARSE("abs_delta_rps_minus1", value, ps_bitstrm);
+ abs_delta_rps = value + 1;
+
+ delta_rps = (1 - 2 * delta_rps_sign) * (abs_delta_rps);
+
+
+
+ for(i = 0; i <= ps_stref_picset_ref->i1_num_delta_pocs; i++)
+ {
+ WORD32 ref_idc;
+
+ /*****************************************************************/
+ /* ref_idc is parsed as below */
+ /* bits "1" ref_idc 1 */
+ /* bits "01" ref_idc 2 */
+ /* bits "00" ref_idc 0 */
+ /*****************************************************************/
+ BITS_PARSE("used_by_curr_pic_flag", value, ps_bitstrm, 1);
+ ref_idc = value;
+ ps_stref_picset->ai1_used[num_pics] = value;
+ /* If ref_idc is zero check for next bit */
+ if(0 == ref_idc)
+ {
+ BITS_PARSE("use_delta_flag", value, ps_bitstrm, 1);
+ ps_stref_picset->ai1_used[i] = value;
+ ref_idc = value << 1;
+ }
+ if((ref_idc == 1) || (ref_idc == 2))
+ {
+ WORD32 delta_poc;
+ delta_poc = delta_rps;
+ delta_poc +=
+ ((i < ps_stref_picset_ref->i1_num_delta_pocs) ?
+ ps_stref_picset_ref->ai2_delta_poc[i] :
+ 0);
+
+ ps_stref_picset->ai2_delta_poc[num_pics] = delta_poc;
+
+ if(delta_poc < 0)
+ {
+ num_neg_pics++;
+ }
+ else
+ {
+ num_pos_pics++;
+ }
+ num_pics++;
+ }
+ ps_stref_picset->ai1_ref_idc[i] = ref_idc;
+ }
+
+ num_neg_pics = CLIP3(num_neg_pics, 0, MAX_DPB_SIZE - 1);
+ num_pos_pics = CLIP3(num_pos_pics, 0, (MAX_DPB_SIZE - 1 - num_neg_pics));
+ num_pics = num_neg_pics + num_pos_pics;
+
+ ps_stref_picset->i1_num_ref_idc =
+ ps_stref_picset_ref->i1_num_delta_pocs + 1;
+ ps_stref_picset->i1_num_delta_pocs = num_pics;
+ ps_stref_picset->i1_num_pos_pics = num_pos_pics;
+ ps_stref_picset->i1_num_neg_pics = num_neg_pics;
+
+
+ for(j = 1; j < num_pics; j++)
+ {
+ WORD32 delta_poc = ps_stref_picset->ai2_delta_poc[j];
+ WORD8 i1_used = ps_stref_picset->ai1_used[j];
+ for(k = j - 1; k >= 0; k--)
+ {
+ temp = ps_stref_picset->ai2_delta_poc[k];
+ if(delta_poc < temp)
+ {
+ ps_stref_picset->ai2_delta_poc[k + 1] = temp;
+ ps_stref_picset->ai1_used[k + 1] = ps_stref_picset->ai1_used[k];
+ ps_stref_picset->ai2_delta_poc[k] = delta_poc;
+ ps_stref_picset->ai1_used[k] = i1_used;
+ }
+ }
+ }
+ // flip the negative values to largest first
+ for(j = 0, k = num_neg_pics - 1; j < num_neg_pics >> 1; j++, k--)
+ {
+ WORD32 delta_poc = ps_stref_picset->ai2_delta_poc[j];
+ WORD8 i1_used = ps_stref_picset->ai1_used[j];
+ ps_stref_picset->ai2_delta_poc[j] = ps_stref_picset->ai2_delta_poc[k];
+ ps_stref_picset->ai1_used[j] = ps_stref_picset->ai1_used[k];
+ ps_stref_picset->ai2_delta_poc[k] = delta_poc;
+ ps_stref_picset->ai1_used[k] = i1_used;
+ }
+
+ }
+ else
+ {
+ WORD32 prev_poc = 0;
+ WORD32 poc;
+
+ UEV_PARSE("num_negative_pics", value, ps_bitstrm);
+ ps_stref_picset->i1_num_neg_pics = value;
+ ps_stref_picset->i1_num_neg_pics = CLIP3(ps_stref_picset->i1_num_neg_pics,
+ 0,
+ MAX_DPB_SIZE - 1);
+
+ UEV_PARSE("num_positive_pics", value, ps_bitstrm);
+ ps_stref_picset->i1_num_pos_pics = value;
+ ps_stref_picset->i1_num_pos_pics = CLIP3(ps_stref_picset->i1_num_pos_pics,
+ 0,
+ (MAX_DPB_SIZE - 1 - ps_stref_picset->i1_num_neg_pics));
+
+ ps_stref_picset->i1_num_delta_pocs =
+ ps_stref_picset->i1_num_neg_pics +
+ ps_stref_picset->i1_num_pos_pics;
+
+
+ for(i = 0; i < ps_stref_picset->i1_num_neg_pics; i++)
+ {
+ UEV_PARSE("delta_poc_s0_minus1", value, ps_bitstrm);
+ poc = prev_poc - (value + 1);
+ prev_poc = poc;
+ ps_stref_picset->ai2_delta_poc[i] = poc;
+
+ BITS_PARSE("used_by_curr_pic_s0_flag", value, ps_bitstrm, 1);
+ ps_stref_picset->ai1_used[i] = value;
+
+ }
+ prev_poc = 0;
+ for(i = ps_stref_picset->i1_num_neg_pics;
+ i < ps_stref_picset->i1_num_delta_pocs;
+ i++)
+ {
+ UEV_PARSE("delta_poc_s1_minus1", value, ps_bitstrm);
+ poc = prev_poc + (value + 1);
+ prev_poc = poc;
+ ps_stref_picset->ai2_delta_poc[i] = poc;
+
+ BITS_PARSE("used_by_curr_pic_s1_flag", value, ps_bitstrm, 1);
+ ps_stref_picset->ai1_used[i] = value;
+
+ }
+
+ }
+
+ return ret;
+}
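In the explicit (non inter-predicted) path above, negative delta POCs accumulate as poc = prev_poc - (value + 1) and positive ones as poc = prev_poc + (value + 1). A minimal sketch with assumed ue(v) values:

/* Illustrative sketch (not part of the decoder source): delta POC
 * reconstruction in the explicit path of ihevcd_short_term_ref_pic_set(). */
#include <stdio.h>

int main(void)
{
    int s0_minus1[] = { 0, 1 };  /* delta_poc_s0_minus1: two negative pics */
    int s1_minus1[] = { 3 };     /* delta_poc_s1_minus1: one positive pic  */
    int delta_poc[3], n = 0, prev = 0;

    /* Negative deltas accumulate downwards: poc = prev_poc - (value + 1) */
    for(int i = 0; i < 2; i++)
    {
        prev -= s0_minus1[i] + 1;
        delta_poc[n++] = prev;
    }

    /* Positive deltas accumulate upwards: poc = prev_poc + (value + 1) */
    prev = 0;
    for(int i = 0; i < 1; i++)
    {
        prev += s1_minus1[i] + 1;
        delta_poc[n++] = prev;
    }

    for(int i = 0; i < n; i++)
        printf("%d ", delta_poc[i]);  /* prints: -1 -3 4 */
    printf("\n");
    return 0;
}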
+
+
+static WORD32 ihevcd_parse_sub_layer_hrd_parameters(bitstrm_t *ps_bitstrm,
+ sub_lyr_hrd_params_t *ps_sub_layer_hrd_params,
+ WORD32 cpb_cnt,
+ WORD32 sub_pic_cpb_params_present_flag)
+{
+ WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 i;
+
+ for(i = 0; i <= cpb_cnt; i++)
+ {
+ UEV_PARSE("bit_rate_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_bit_rate_value_minus1[i], ps_bitstrm);
+ UEV_PARSE("cpb_size_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_cpb_size_value_minus1[i], ps_bitstrm);
+
+ if(sub_pic_cpb_params_present_flag)
+ {
+ UEV_PARSE("cpb_size_du_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_cpb_size_du_value_minus1[i], ps_bitstrm);
+ UEV_PARSE("bit_rate_du_value_minus1[ i ]", ps_sub_layer_hrd_params->au4_bit_rate_du_value_minus1[i], ps_bitstrm);
+ }
+ BITS_PARSE("cbr_flag[ i ]", ps_sub_layer_hrd_params->au1_cbr_flag[i], ps_bitstrm, 1);
+ }
+
+ return ret;
+}
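UEV_PARSE and SEV_PARSE used throughout these parsers decode Exp-Golomb codes. A minimal standalone sketch of ue(v)/se(v) decoding, using a simple in-memory bit cursor instead of the decoder's bitstream context:

/* Illustrative sketch (not part of the decoder source): ue(v)/se(v)
 * Exp-Golomb decoding over a trivial big-endian bit cursor. */
#include <stdio.h>

typedef struct { const unsigned char *buf; int pos; } bitrdr_t;

static int get_bit(bitrdr_t *r)
{
    int bit = (r->buf[r->pos >> 3] >> (7 - (r->pos & 7))) & 1;
    r->pos++;
    return bit;
}

static unsigned decode_uev(bitrdr_t *r)
{
    int zeros = 0;
    unsigned val = 1;
    while(get_bit(r) == 0)        /* count leading zeros      */
        zeros++;
    while(zeros-- > 0)            /* read that many info bits */
        val = (val << 1) | get_bit(r);
    return val - 1;
}

static int decode_sev(bitrdr_t *r)
{
    unsigned k = decode_uev(r);
    /* ue(v) 0,1,2,3,... maps to se(v) 0,+1,-1,+2,... */
    return (k & 1) ? (int)((k + 1) >> 1) : -(int)(k >> 1);
}

int main(void)
{
    const unsigned char bits[] = { 0xA6 };  /* 1 010 011 . -> ue: 0, 1, 2 */
    bitrdr_t r = { bits, 0 };
    unsigned a = decode_uev(&r);
    unsigned b = decode_uev(&r);
    unsigned c = decode_uev(&r);

    const unsigned char bits2[] = { 0x60 }; /* 011 ..... -> ue 2 -> se -1 */
    bitrdr_t r2 = { bits2, 0 };
    printf("ue: %u %u %u, se: %d\n", a, b, c, decode_sev(&r2));
    return 0;
}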
+
+
+static WORD32 ihevcd_parse_hrd_parameters(bitstrm_t *ps_bitstrm,
+ hrd_params_t *ps_hrd,
+ WORD32 common_info_present_flag,
+ WORD32 max_num_sub_layers_minus1)
+{
+ WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 i;
+
+ ps_hrd->u1_nal_hrd_parameters_present_flag = 0;
+ ps_hrd->u1_vcl_hrd_parameters_present_flag = 0;
+
+ ps_hrd->u1_sub_pic_cpb_params_present_flag = 0;
+
+ ps_hrd->u1_tick_divisor_minus2 = 0;
+ ps_hrd->u1_du_cpb_removal_delay_increment_length_minus1 = 0;
+ ps_hrd->u1_sub_pic_cpb_params_in_pic_timing_sei_flag = 0;
+ ps_hrd->u1_dpb_output_delay_du_length_minus1 = 0;
+
+ ps_hrd->u4_bit_rate_scale = 0;
+ ps_hrd->u4_cpb_size_scale = 0;
+ ps_hrd->u4_cpb_size_du_scale = 0;
+
+ ps_hrd->u1_initial_cpb_removal_delay_length_minus1 = 23;
+ ps_hrd->u1_au_cpb_removal_delay_length_minus1 = 23;
+ ps_hrd->u1_dpb_output_delay_length_minus1 = 23;
+
+ if(common_info_present_flag)
+ {
+ BITS_PARSE("nal_hrd_parameters_present_flag", ps_hrd->u1_nal_hrd_parameters_present_flag, ps_bitstrm, 1);
+ BITS_PARSE("vcl_hrd_parameters_present_flag", ps_hrd->u1_vcl_hrd_parameters_present_flag, ps_bitstrm, 1);
+
+ if(ps_hrd->u1_nal_hrd_parameters_present_flag || ps_hrd->u1_vcl_hrd_parameters_present_flag)
+ {
+ BITS_PARSE("sub_pic_cpb_params_present_flag", ps_hrd->u1_sub_pic_cpb_params_present_flag, ps_bitstrm, 1);
+ if(ps_hrd->u1_sub_pic_cpb_params_present_flag)
+ {
+ BITS_PARSE("tick_divisor_minus2", ps_hrd->u1_tick_divisor_minus2, ps_bitstrm, 8);
+ BITS_PARSE("du_cpb_removal_delay_increment_length_minus1", ps_hrd->u1_du_cpb_removal_delay_increment_length_minus1, ps_bitstrm, 5);
+ BITS_PARSE("sub_pic_cpb_params_in_pic_timing_sei_flag", ps_hrd->u1_sub_pic_cpb_params_in_pic_timing_sei_flag, ps_bitstrm, 1);
+ BITS_PARSE("dpb_output_delay_du_length_minus1", ps_hrd->u1_dpb_output_delay_du_length_minus1, ps_bitstrm, 5);
+ }
+
+ BITS_PARSE("bit_rate_scale", ps_hrd->u4_bit_rate_scale, ps_bitstrm, 4);
+ BITS_PARSE("cpb_size_scale", ps_hrd->u4_cpb_size_scale, ps_bitstrm, 4);
+ if(ps_hrd->u1_sub_pic_cpb_params_present_flag)
+ BITS_PARSE("cpb_size_du_scale", ps_hrd->u4_cpb_size_du_scale, ps_bitstrm, 4);
+
+ BITS_PARSE("initial_cpb_removal_delay_length_minus1", ps_hrd->u1_initial_cpb_removal_delay_length_minus1, ps_bitstrm, 5);
+ BITS_PARSE("au_cpb_removal_delay_length_minus1", ps_hrd->u1_au_cpb_removal_delay_length_minus1, ps_bitstrm, 5);
+ BITS_PARSE("dpb_output_delay_length_minus1", ps_hrd->u1_dpb_output_delay_length_minus1, ps_bitstrm, 5);
+ }
+ }
+
+
+ for(i = 0; i <= max_num_sub_layers_minus1; i++)
+ {
+ BITS_PARSE("fixed_pic_rate_general_flag[ i ]", ps_hrd->au1_fixed_pic_rate_general_flag[i], ps_bitstrm, 1);
+
+ ps_hrd->au1_fixed_pic_rate_within_cvs_flag[i] = 1;
+ ps_hrd->au1_elemental_duration_in_tc_minus1[i] = 0;
+ ps_hrd->au1_low_delay_hrd_flag[i] = 0;
+ ps_hrd->au1_cpb_cnt_minus1[i] = 0;
+
+ if(!ps_hrd->au1_fixed_pic_rate_general_flag[i])
+ BITS_PARSE("fixed_pic_rate_within_cvs_flag[ i ]", ps_hrd->au1_fixed_pic_rate_within_cvs_flag[i], ps_bitstrm, 1);
+
+ if(ps_hrd->au1_fixed_pic_rate_within_cvs_flag[i])
+ {
+ UEV_PARSE("elemental_duration_in_tc_minus1[ i ]", ps_hrd->au1_elemental_duration_in_tc_minus1[i], ps_bitstrm);
+ }
+ else
+ {
+ BITS_PARSE("low_delay_hrd_flag[ i ]", ps_hrd->au1_low_delay_hrd_flag[i], ps_bitstrm, 1);
+ }
+
+ if(!ps_hrd->au1_low_delay_hrd_flag[i])
+ UEV_PARSE("cpb_cnt_minus1[ i ]", ps_hrd->au1_cpb_cnt_minus1[i], ps_bitstrm);
+
+ if(ps_hrd->u1_nal_hrd_parameters_present_flag)
+ ihevcd_parse_sub_layer_hrd_parameters(ps_bitstrm,
+ &ps_hrd->as_sub_layer_hrd_params[i],
+ ps_hrd->au1_cpb_cnt_minus1[i],
+ ps_hrd->u1_sub_pic_cpb_params_present_flag);
+
+ if(ps_hrd->u1_vcl_hrd_parameters_present_flag)
+ ihevcd_parse_sub_layer_hrd_parameters(ps_bitstrm,
+ &ps_hrd->as_sub_layer_hrd_params[i],
+ ps_hrd->au1_cpb_cnt_minus1[i],
+ ps_hrd->u1_sub_pic_cpb_params_present_flag);
+ }
+
+ return ret;
+}
+
+
+static WORD32 ihevcd_parse_vui_parameters(bitstrm_t *ps_bitstrm,
+ vui_t *ps_vui,
+ WORD32 sps_max_sub_layers_minus1)
+{
+ WORD32 ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+ BITS_PARSE("aspect_ratio_info_present_flag", ps_vui->u1_aspect_ratio_info_present_flag, ps_bitstrm, 1);
+
+ ps_vui->u1_aspect_ratio_idc = SAR_UNUSED;
+ ps_vui->u2_sar_width = 0;
+ ps_vui->u2_sar_height = 0;
+ if(ps_vui->u1_aspect_ratio_info_present_flag)
+ {
+ BITS_PARSE("aspect_ratio_idc", ps_vui->u1_aspect_ratio_idc, ps_bitstrm, 8);
+ if(ps_vui->u1_aspect_ratio_idc == EXTENDED_SAR)
+ {
+ BITS_PARSE("sar_width", ps_vui->u2_sar_width, ps_bitstrm, 16);
+ BITS_PARSE("sar_height", ps_vui->u2_sar_height, ps_bitstrm, 16);
+ }
+ }
+
+ BITS_PARSE("overscan_info_present_flag", ps_vui->u1_overscan_info_present_flag, ps_bitstrm, 1);
+ ps_vui->u1_overscan_appropriate_flag = 0;
+ if(ps_vui->u1_overscan_info_present_flag)
+ BITS_PARSE("overscan_appropriate_flag", ps_vui->u1_overscan_appropriate_flag, ps_bitstrm, 1);
+
+ BITS_PARSE("video_signal_type_present_flag", ps_vui->u1_video_signal_type_present_flag, ps_bitstrm, 1);
+ ps_vui->u1_video_format = VID_FMT_UNSPECIFIED;
+ ps_vui->u1_video_full_range_flag = 0;
+ ps_vui->u1_colour_description_present_flag = 0;
+ if(ps_vui->u1_video_signal_type_present_flag)
+ {
+ BITS_PARSE("video_format", ps_vui->u1_video_format, ps_bitstrm, 3);
+ BITS_PARSE("video_full_range_flag", ps_vui->u1_video_full_range_flag, ps_bitstrm, 1);
+ BITS_PARSE("colour_description_present_flag", ps_vui->u1_colour_description_present_flag, ps_bitstrm, 1);
+ ps_vui->u1_colour_primaries = 2;
+ ps_vui->u1_transfer_characteristics = 2;
+ if(ps_vui->u1_colour_description_present_flag)
+ {
+ BITS_PARSE("colour_primaries", ps_vui->u1_colour_primaries, ps_bitstrm, 8);
+ BITS_PARSE("transfer_characteristics", ps_vui->u1_transfer_characteristics, ps_bitstrm, 8);
+ BITS_PARSE("matrix_coeffs", ps_vui->u1_matrix_coefficients, ps_bitstrm, 8);
+ }
+ }
+
+ BITS_PARSE("chroma_loc_info_present_flag", ps_vui->u1_chroma_loc_info_present_flag, ps_bitstrm, 1);
+ ps_vui->u1_chroma_sample_loc_type_top_field = 0;
+ ps_vui->u1_chroma_sample_loc_type_bottom_field = 0;
+ if(ps_vui->u1_chroma_loc_info_present_flag)
+ {
+ UEV_PARSE("chroma_sample_loc_type_top_field", ps_vui->u1_chroma_sample_loc_type_top_field, ps_bitstrm);
+ UEV_PARSE("chroma_sample_loc_type_bottom_field", ps_vui->u1_chroma_sample_loc_type_bottom_field, ps_bitstrm);
+ }
+
+ BITS_PARSE("neutral_chroma_indication_flag", ps_vui->u1_neutral_chroma_indication_flag, ps_bitstrm, 1);
+ BITS_PARSE("field_seq_flag", ps_vui->u1_field_seq_flag, ps_bitstrm, 1);
+ BITS_PARSE("frame_field_info_present_flag", ps_vui->u1_frame_field_info_present_flag, ps_bitstrm, 1);
+ BITS_PARSE("default_display_window_flag", ps_vui->u1_default_display_window_flag, ps_bitstrm, 1);
+ ps_vui->u4_def_disp_win_left_offset = 0;
+ ps_vui->u4_def_disp_win_right_offset = 0;
+ ps_vui->u4_def_disp_win_top_offset = 0;
+ ps_vui->u4_def_disp_win_bottom_offset = 0;
+ if(ps_vui->u1_default_display_window_flag)
+ {
+ UEV_PARSE("def_disp_win_left_offset", ps_vui->u4_def_disp_win_left_offset, ps_bitstrm);
+ UEV_PARSE("def_disp_win_right_offset", ps_vui->u4_def_disp_win_right_offset, ps_bitstrm);
+ UEV_PARSE("def_disp_win_top_offset", ps_vui->u4_def_disp_win_top_offset, ps_bitstrm);
+ UEV_PARSE("def_disp_win_bottom_offset", ps_vui->u4_def_disp_win_bottom_offset, ps_bitstrm);
+ }
+
+ BITS_PARSE("vui_timing_info_present_flag", ps_vui->u1_vui_timing_info_present_flag, ps_bitstrm, 1);
+ if(ps_vui->u1_vui_timing_info_present_flag)
+ {
+ BITS_PARSE("vui_num_units_in_tick", ps_vui->u4_vui_num_units_in_tick, ps_bitstrm, 32);
+ BITS_PARSE("vui_time_scale", ps_vui->u4_vui_time_scale, ps_bitstrm, 32);
+ BITS_PARSE("vui_poc_proportional_to_timing_flag", ps_vui->u1_poc_proportional_to_timing_flag, ps_bitstrm, 1);
+ if(ps_vui->u1_poc_proportional_to_timing_flag)
+ UEV_PARSE("vui_num_ticks_poc_diff_one_minus1", ps_vui->u1_num_ticks_poc_diff_one_minus1, ps_bitstrm);
+
+ BITS_PARSE("vui_hrd_parameters_present_flag", ps_vui->u1_vui_hrd_parameters_present_flag, ps_bitstrm, 1);
+ if(ps_vui->u1_vui_hrd_parameters_present_flag)
+ ihevcd_parse_hrd_parameters(ps_bitstrm, &ps_vui->s_vui_hrd_parameters, 1, sps_max_sub_layers_minus1);
+ }
+
+ BITS_PARSE("bitstream_restriction_flag", ps_vui->u1_bitstream_restriction_flag, ps_bitstrm, 1);
+ ps_vui->u1_tiles_fixed_structure_flag = 0;
+ ps_vui->u1_motion_vectors_over_pic_boundaries_flag = 1;
+ ps_vui->u1_restricted_ref_pic_lists_flag = 0;
+ ps_vui->u4_min_spatial_segmentation_idc = 0;
+ ps_vui->u1_max_bytes_per_pic_denom = 2;
+ ps_vui->u1_max_bits_per_mincu_denom = 1;
+ ps_vui->u1_log2_max_mv_length_horizontal = 15;
+ ps_vui->u1_log2_max_mv_length_vertical = 15;
+ if(ps_vui->u1_bitstream_restriction_flag)
+ {
+ BITS_PARSE("tiles_fixed_structure_flag", ps_vui->u1_tiles_fixed_structure_flag, ps_bitstrm, 1);
+ BITS_PARSE("motion_vectors_over_pic_boundaries_flag", ps_vui->u1_motion_vectors_over_pic_boundaries_flag, ps_bitstrm, 1);
+ BITS_PARSE("restricted_ref_pic_lists_flag", ps_vui->u1_restricted_ref_pic_lists_flag, ps_bitstrm, 1);
+
+ UEV_PARSE("min_spatial_segmentation_idc", ps_vui->u4_min_spatial_segmentation_idc, ps_bitstrm);
+ UEV_PARSE("max_bytes_per_pic_denom", ps_vui->u1_max_bytes_per_pic_denom, ps_bitstrm);
+ UEV_PARSE("max_bits_per_min_cu_denom", ps_vui->u1_max_bits_per_mincu_denom, ps_bitstrm);
+ UEV_PARSE("log2_max_mv_length_horizontal", ps_vui->u1_log2_max_mv_length_horizontal, ps_bitstrm);
+ UEV_PARSE("log2_max_mv_length_vertical", ps_vui->u1_log2_max_mv_length_vertical, ps_bitstrm);
+ }
+
+ return ret;
+}
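+
+/* A minimal sketch of the ue(v) decode performed by the UEV_PARSE macro
+ * above (pseudocode; read_bit()/read_bits() are hypothetical helpers):
+ *
+ *     lead_zeros = 0;
+ *     while(0 == read_bit())
+ *         lead_zeros++;
+ *     code_num = (1 << lead_zeros) - 1 + read_bits(lead_zeros);
+ *
+ * e.g. the bit string 0 0 1 0 1 has lead_zeros = 2 and decodes to
+ * (1 << 2) - 1 + 1 = 4. SEV_PARSE maps code_num k to (-1)^(k+1) * ceil(k/2),
+ * so ue(v) values 0, 1, 2, 3, 4 become se(v) values 0, 1, -1, 2, -2.
+ */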
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses profile tier and level info for either general layer or sub_layer
+*
+* @par Description
+* Parses profile tier and level info for either general layer or sub_layer
+* as per section 7.3.3
+*
+* Since the same function is called for parsing general_profile and
+* sub_layer_profile etc, variables do not specify whether the syntax is
+* for general or sub_layer. Similarly trace functions also do not differentiate
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[out] ps_ptl
+* Pointer to profile, tier level structure
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+static IHEVCD_ERROR_T ihevcd_parse_profile_tier_level_layer(bitstrm_t *ps_bitstrm,
+ profile_tier_lvl_t *ps_ptl)
+{
+ WORD32 value;
+ WORD32 i;
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+
+ BITS_PARSE("XXX_profile_space[]", value, ps_bitstrm, 2);
+ ps_ptl->i1_profile_space = value;
+
+ BITS_PARSE("XXX_tier_flag[]", value, ps_bitstrm, 1);
+ ps_ptl->i1_tier_flag = value;
+
+ BITS_PARSE("XXX_profile_idc[]", value, ps_bitstrm, 5);
+ ps_ptl->i1_profile_idc = value;
+
+ for(i = 0; i < MAX_PROFILE_COMPATBLTY; i++)
+ {
+ BITS_PARSE("XXX_profile_compatibility_flag[][j]", value, ps_bitstrm, 1);
+ ps_ptl->ai1_profile_compatibility_flag[i] = value;
+ }
+
+ BITS_PARSE("general_progressive_source_flag", value, ps_bitstrm, 1);
+ ps_ptl->i1_general_progressive_source_flag = value;
+
+    BITS_PARSE("general_interlaced_source_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_interlaced_source_flag = value;
+
+    BITS_PARSE("general_non_packed_constraint_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_non_packed_constraint_flag = value;
+
+    BITS_PARSE("general_frame_only_constraint_flag", value, ps_bitstrm, 1);
+    ps_ptl->i1_general_frame_only_constraint_flag = value;
+
+ BITS_PARSE("XXX_reserved_zero_44bits[0..15]", value, ps_bitstrm, 16);
+
+ BITS_PARSE("XXX_reserved_zero_44bits[16..31]", value, ps_bitstrm, 16);
+
+ BITS_PARSE("XXX_reserved_zero_44bits[32..43]", value, ps_bitstrm, 12);
+ return ret;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses profile tier and level info
+*
+* @par Description
+* Parses profile tier and level info as per section 7.3.3
+* Called during VPS and SPS parsing
+* calls ihevcd_parse_profile_tier_level_layer() for the general layer and each sub_layer
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[out] ps_ptl
+* Pointer to structure that contains profile, tier and level for each layer
+*
+* @param[in] profile_present
+* Flag to indicate if profile data is present
+*
+* @param[in] max_num_sub_layers
+* Number of sub layers present
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+static IHEVCD_ERROR_T ihevcd_profile_tier_level(bitstrm_t *ps_bitstrm,
+ profile_tier_lvl_info_t *ps_ptl,
+ WORD32 profile_present,
+ WORD32 max_num_sub_layers)
+{
+ WORD32 value;
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 i;
+
+ if(profile_present)
+ {
+ ret = ihevcd_parse_profile_tier_level_layer(ps_bitstrm, &ps_ptl->s_ptl_gen);
+ }
+
+ BITS_PARSE("general_level_idc", value, ps_bitstrm, 8);
+ ps_ptl->s_ptl_gen.u1_level_idc = value;
+
+
+ for(i = 0; i < max_num_sub_layers; i++)
+ {
+ BITS_PARSE("sub_layer_profile_present_flag[i]", value, ps_bitstrm, 1);
+ ps_ptl->ai1_sub_layer_profile_present_flag[i] = value;
+
+ BITS_PARSE("sub_layer_level_present_flag[i]", value, ps_bitstrm, 1);
+ ps_ptl->ai1_sub_layer_level_present_flag[i] = value;
+ }
+
+ if(max_num_sub_layers > 0)
+ {
+ for(i = max_num_sub_layers; i < 8; i++)
+ {
+ BITS_PARSE("reserved_zero_2bits", value, ps_bitstrm, 2);
+ }
+ }
+
+ for(i = 0; i < max_num_sub_layers; i++)
+ {
+ if(ps_ptl->ai1_sub_layer_profile_present_flag[i])
+ {
+ ret = ihevcd_parse_profile_tier_level_layer(ps_bitstrm,
+ &ps_ptl->as_ptl_sub[i]);
+ }
+ if(ps_ptl->ai1_sub_layer_level_present_flag[i])
+ {
+ BITS_PARSE("sub_layer_level_idc[i]", value, ps_bitstrm, 8);
+ ps_ptl->as_ptl_sub[i].u1_level_idc = value;
+
+ }
+ }
+
+
+
+ return ret;
+}
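+
+/* Note on the loops above: the per-sub-layer present flags (2 bits each)
+ * plus the reserved_zero_2bits always total 16 bits whenever
+ * max_num_sub_layers > 0; e.g. max_num_sub_layers = 1 gives one pair of
+ * present flags followed by 14 reserved bits (i = 1..7). */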
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses Scaling List Data syntax
+*
+* @par Description:
+* Parses Scaling List Data syntax as per Section: 7.3.6
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_scaling_list_data(codec_t *ps_codec, WORD16 *pi2_scaling_mat)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 size_id;
+ WORD32 matrix_id;
+ WORD32 value, dc_value = 0;
+ WORD32 next_coef;
+ WORD32 coef_num;
+ WORD32 i, j, offset;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD16 *pi2_scaling_mat_offset;
+ WORD32 scaling_mat_offset[] = { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992, 1248, 1504, 1760, 2016, 3040 };
+ UWORD8 *scan_table;
+
+ for(size_id = 0; size_id < 4; size_id++)
+ {
+ for(matrix_id = 0; matrix_id < ((size_id == 3) ? 2 : 6); matrix_id++)
+ {
+ WORD32 scaling_list_pred_mode_flag;
+ WORD32 scaling_list_delta_coef;
+ BITS_PARSE("scaling_list_pred_mode_flag", scaling_list_pred_mode_flag, ps_bitstrm, 1);
+
+ offset = size_id * 6 + matrix_id;
+ pi2_scaling_mat_offset = pi2_scaling_mat + scaling_mat_offset[offset];
+
+ if(!scaling_list_pred_mode_flag)
+ {
+ WORD32 num_elements;
+ UEV_PARSE("scaling_list_pred_matrix_id_delta", value,
+ ps_bitstrm);
+ value = CLIP3(value, 0, matrix_id);
+
+ num_elements = (1 << (4 + (size_id << 1)));
+ if(0 != value)
+ memcpy(pi2_scaling_mat_offset, pi2_scaling_mat_offset - value * num_elements, num_elements * sizeof(WORD16));
+ }
+ else
+ {
+ next_coef = 8;
+ coef_num = MIN(64, (1 << (4 + (size_id << 1))));
+
+ if(size_id > 1)
+ {
+ SEV_PARSE("scaling_list_dc_coef_minus8", value,
+ ps_bitstrm);
+
+ next_coef = value + 8;
+ dc_value = next_coef;
+ }
+ if(size_id < 2)
+ {
+ scan_table = (UWORD8 *)gapv_ihevc_invscan[size_id + 1];
+
+ for(i = 0; i < coef_num; i++)
+ {
+ SEV_PARSE("scaling_list_delta_coef",
+ scaling_list_delta_coef, ps_bitstrm);
+ next_coef = (next_coef + scaling_list_delta_coef + 256)
+ % 256;
+ pi2_scaling_mat_offset[scan_table[i]] = next_coef;
+ }
+ }
+ else if(size_id == 2)
+ {
+ scan_table = (UWORD8 *)gapv_ihevc_invscan[2];
+
+ for(i = 0; i < coef_num; i++)
+ {
+ SEV_PARSE("scaling_list_delta_coef",
+ scaling_list_delta_coef, ps_bitstrm);
+ next_coef = (next_coef + scaling_list_delta_coef + 256)
+ % 256;
+
+ offset = scan_table[i];
+ offset = (offset >> 3) * 16 * 2 + (offset & 0x7) * 2;
+ pi2_scaling_mat_offset[offset] = next_coef;
+ pi2_scaling_mat_offset[offset + 1] = next_coef;
+ pi2_scaling_mat_offset[offset + 16] = next_coef;
+ pi2_scaling_mat_offset[offset + 16 + 1] = next_coef;
+ }
+ pi2_scaling_mat_offset[0] = dc_value;
+ }
+ else
+ {
+ scan_table = (UWORD8 *)gapv_ihevc_invscan[2];
+
+ for(i = 0; i < coef_num; i++)
+ {
+ SEV_PARSE("scaling_list_delta_coef",
+ scaling_list_delta_coef, ps_bitstrm);
+ next_coef = (next_coef + scaling_list_delta_coef + 256)
+ % 256;
+
+ offset = scan_table[i];
+ offset = (offset >> 3) * 32 * 4 + (offset & 0x7) * 4;
+
+ for(j = 0; j < 4; j++)
+ {
+ pi2_scaling_mat_offset[offset + j * 32] = next_coef;
+ pi2_scaling_mat_offset[offset + 1 + j * 32] = next_coef;
+ pi2_scaling_mat_offset[offset + 2 + j * 32] = next_coef;
+ pi2_scaling_mat_offset[offset + 3 + j * 32] = next_coef;
+                    }
+                }
+                pi2_scaling_mat_offset[0] = dc_value;
+ }
+ }
+ }
+ }
+
+ return ret;
+}
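+
+/* Worked example for the delta coding above (illustrative values):
+ * starting from next_coef = 8, scaling_list_delta_coef values of +2 and
+ * -3 give coefficients (8 + 2 + 256) % 256 = 10 and (10 - 3 + 256) % 256 = 7.
+ * For a 16x16 list (size_id == 2) only 64 coefficients are coded and each
+ * fills a 2x2 patch of the matrix; for 32x32 (size_id == 3) each coded
+ * coefficient fills a 4x4 patch, with the DC entry then overwritten by
+ * dc_value. */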
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses VPS (Video Parameter Set)
+*
+* @par Description:
+* Parse Video Parameter Set as per Section 7.3.2.1
+* Updates the VPS structure corresponding to the VPS ID
+* Until the VPS ID is parsed, the elements are stored in local variables
+* and copied later
+*
+* @param[in] ps_codec
+* Pointer to codec context.
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_vps(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 i;
+ WORD32 value;
+ WORD32 vps_id;
+ vps_t *ps_vps;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ BITS_PARSE("vps_video_parameter_set_id", value, ps_bitstrm, 4);
+ vps_id = value;
+
+ if(vps_id >= MAX_VPS_CNT)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_UNSUPPORTED_VPS_ID;
+ return IHEVCD_UNSUPPORTED_VPS_ID;
+ }
+
+
+ ps_vps = (ps_codec->s_parse.ps_vps_base + vps_id);
+
+ ps_vps->i1_vps_id = vps_id;
+
+ BITS_PARSE("vps_reserved_three_2bits", value, ps_bitstrm, 2);
+ ASSERT(value == 3);
+
+ BITS_PARSE("vps_max_layers_minus1", value, ps_bitstrm, 6);
+ //ps_vps->i1_vps_max_layers = value + 1;
+
+
+
+ BITS_PARSE("vps_max_sub_layers_minus1", value, ps_bitstrm, 3);
+ ps_vps->i1_vps_max_sub_layers = value + 1;
+
+ ASSERT(ps_vps->i1_vps_max_sub_layers < VPS_MAX_SUB_LAYERS);
+
+ BITS_PARSE("vps_temporal_id_nesting_flag", value, ps_bitstrm, 1);
+ ps_vps->i1_vps_temporal_id_nesting_flag = value;
+
+ BITS_PARSE("vps_reserved_ffff_16bits", value, ps_bitstrm, 16);
+ ASSERT(value == 0xFFFF);
+ // profile_and_level( 1, vps_max_sub_layers_minus1 )
+ ret = ihevcd_profile_tier_level(ps_bitstrm, &(ps_vps->s_ptl),
+ 1, (ps_vps->i1_vps_max_sub_layers - 1));
+
+ BITS_PARSE("vps_sub_layer_ordering_info_present_flag", value, ps_bitstrm, 1);
+ ps_vps->i1_sub_layer_ordering_info_present_flag = value;
+ i = (ps_vps->i1_sub_layer_ordering_info_present_flag ?
+ 0 : (ps_vps->i1_vps_max_sub_layers - 1));
+ for(; i < ps_vps->i1_vps_max_sub_layers; i++)
+ {
+ UEV_PARSE("vps_max_dec_pic_buffering[i]", value, ps_bitstrm);
+ ps_vps->ai1_vps_max_dec_pic_buffering[i] = value;
+
+ /* vps_num_reorder_pics (no max) used in print in order to match with HM */
+ UEV_PARSE("vps_num_reorder_pics[i]", value, ps_bitstrm);
+ ps_vps->ai1_vps_max_num_reorder_pics[i] = value;
+
+ UEV_PARSE("vps_max_latency_increase[i]", value, ps_bitstrm);
+ ps_vps->ai1_vps_max_latency_increase[i] = value;
+ }
+
+
+
+ BITS_PARSE("vps_max_layer_id", value, ps_bitstrm, 6);
+ //ps_vps->i1_vps_max_layer_id = value;
+
+ UEV_PARSE("vps_num_layer_sets_minus1", value, ps_bitstrm);
+ //ps_vps->i1_vps_num_layer_sets = value + 1;
+
+ BITS_PARSE("vps_timing_info_present_flag", value, ps_bitstrm, 1);
+ //ps_vps->i1_vps_timing_info_present_flag = value;
+
+
+
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses SPS (Sequence Parameter Set)
+* sequence_parameter_set_rbsp()
+*
+* @par Description:
+* Parse Sequence Parameter Set as per Section: 7.3.2.2
+* The sps is written to a temporary buffer and copied later to the
+* appropriate location
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_sps(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+
+ WORD32 i;
+ WORD32 vps_id;
+ WORD32 sps_max_sub_layers;
+ WORD32 sps_id;
+ WORD32 sps_temporal_id_nesting_flag;
+ sps_t *ps_sps;
+ profile_tier_lvl_info_t s_ptl;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+
+
+ BITS_PARSE("video_parameter_set_id", value, ps_bitstrm, 4);
+ vps_id = value;
+ vps_id = CLIP3(vps_id, 0, MAX_VPS_CNT - 1);
+
+ BITS_PARSE("sps_max_sub_layers_minus1", value, ps_bitstrm, 3);
+ sps_max_sub_layers = value + 1;
+ sps_max_sub_layers = CLIP3(sps_max_sub_layers, 1, 7);
+
+ BITS_PARSE("sps_temporal_id_nesting_flag", value, ps_bitstrm, 1);
+ sps_temporal_id_nesting_flag = value;
+
+ //profile_and_level( 1, sps_max_sub_layers_minus1 )
+ ret = ihevcd_profile_tier_level(ps_bitstrm, &(s_ptl), 1,
+ (sps_max_sub_layers - 1));
+
+ UEV_PARSE("seq_parameter_set_id", value, ps_bitstrm);
+ sps_id = value;
+
+ if((sps_id >= MAX_SPS_CNT) || (sps_id < 0))
+ {
+ if(ps_codec->i4_sps_done)
+ return IHEVCD_UNSUPPORTED_SPS_ID;
+ else
+ sps_id = 0;
+ }
+
+
+ ps_sps = (ps_codec->s_parse.ps_sps_base + MAX_SPS_CNT - 1);
+ ps_sps->i1_sps_id = sps_id;
+ ps_sps->i1_vps_id = vps_id;
+ ps_sps->i1_sps_max_sub_layers = sps_max_sub_layers;
+ ps_sps->i1_sps_temporal_id_nesting_flag = sps_temporal_id_nesting_flag;
+ /* This is used only during initialization to get reorder count etc */
+ ps_codec->i4_sps_id = sps_id;
+ memcpy(&ps_sps->s_ptl, &s_ptl, sizeof(profile_tier_lvl_info_t));
+
+ UEV_PARSE("chroma_format_idc", value, ps_bitstrm);
+ ps_sps->i1_chroma_format_idc = value;
+
+ if(ps_sps->i1_chroma_format_idc != CHROMA_FMT_IDC_YUV420)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC;
+ return (IHEVCD_ERROR_T)IHEVCD_UNSUPPORTED_CHROMA_FMT_IDC;
+ }
+
+ if(CHROMA_FMT_IDC_YUV444_PLANES == ps_sps->i1_chroma_format_idc)
+ {
+ BITS_PARSE("separate_colour_plane_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_separate_colour_plane_flag = value;
+ }
+ else
+ {
+ ps_sps->i1_separate_colour_plane_flag = 0;
+ }
+
+ UEV_PARSE("pic_width_in_luma_samples", value, ps_bitstrm);
+ ps_sps->i2_pic_width_in_luma_samples = value;
+
+ UEV_PARSE("pic_height_in_luma_samples", value, ps_bitstrm);
+ ps_sps->i2_pic_height_in_luma_samples = value;
+
+ if((0 >= ps_sps->i2_pic_width_in_luma_samples) || (0 >= ps_sps->i2_pic_height_in_luma_samples))
+ return IHEVCD_INVALID_PARAMETER;
+
+ if((ps_sps->i2_pic_width_in_luma_samples > ps_codec->i4_max_wd) ||
+ (ps_sps->i2_pic_width_in_luma_samples * ps_sps->i2_pic_height_in_luma_samples >
+ ps_codec->i4_max_wd * ps_codec->i4_max_ht) ||
+ (ps_sps->i2_pic_height_in_luma_samples > MAX(ps_codec->i4_max_wd, ps_codec->i4_max_ht)))
+ {
+ return (IHEVCD_ERROR_T)IHEVCD_UNSUPPORTED_DIMENSIONS;
+ }
+
+ BITS_PARSE("pic_cropping_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_pic_cropping_flag = value;
+
+ if(ps_sps->i1_pic_cropping_flag)
+ {
+
+ UEV_PARSE("pic_crop_left_offset", value, ps_bitstrm);
+ ps_sps->i2_pic_crop_left_offset = value;
+
+ UEV_PARSE("pic_crop_right_offset", value, ps_bitstrm);
+ ps_sps->i2_pic_crop_right_offset = value;
+
+ UEV_PARSE("pic_crop_top_offset", value, ps_bitstrm);
+ ps_sps->i2_pic_crop_top_offset = value;
+
+ UEV_PARSE("pic_crop_bottom_offset", value, ps_bitstrm);
+ ps_sps->i2_pic_crop_bottom_offset = value;
+ }
+ else
+ {
+ ps_sps->i2_pic_crop_left_offset = 0;
+ ps_sps->i2_pic_crop_right_offset = 0;
+ ps_sps->i2_pic_crop_top_offset = 0;
+ ps_sps->i2_pic_crop_bottom_offset = 0;
+ }
+
+
+ UEV_PARSE("bit_depth_luma_minus8", value, ps_bitstrm);
+ if(0 != value)
+ return IHEVCD_UNSUPPORTED_BIT_DEPTH;
+
+ UEV_PARSE("bit_depth_chroma_minus8", value, ps_bitstrm);
+ if(0 != value)
+ return IHEVCD_UNSUPPORTED_BIT_DEPTH;
+
+ UEV_PARSE("log2_max_pic_order_cnt_lsb_minus4", value, ps_bitstrm);
+ ps_sps->i1_log2_max_pic_order_cnt_lsb = value + 4;
+
+ BITS_PARSE("sps_sub_layer_ordering_info_present_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_sps_sub_layer_ordering_info_present_flag = value;
+
+
+ i = (ps_sps->i1_sps_sub_layer_ordering_info_present_flag ? 0 : (ps_sps->i1_sps_max_sub_layers - 1));
+ for(; i < ps_sps->i1_sps_max_sub_layers; i++)
+ {
+ UEV_PARSE("max_dec_pic_buffering", value, ps_bitstrm);
+ ps_sps->ai1_sps_max_dec_pic_buffering[i] = value + 1;
+
+ UEV_PARSE("num_reorder_pics", value, ps_bitstrm);
+ ps_sps->ai1_sps_max_num_reorder_pics[i] = value;
+
+ UEV_PARSE("max_latency_increase", value, ps_bitstrm);
+ ps_sps->ai1_sps_max_latency_increase[i] = value;
+ }
+ UEV_PARSE("log2_min_coding_block_size_minus3", value, ps_bitstrm);
+ ps_sps->i1_log2_min_coding_block_size = value + 3;
+
+ UEV_PARSE("log2_diff_max_min_coding_block_size", value, ps_bitstrm);
+ ps_sps->i1_log2_diff_max_min_coding_block_size = value;
+
+ UEV_PARSE("log2_min_transform_block_size_minus2", value, ps_bitstrm);
+ ps_sps->i1_log2_min_transform_block_size = value + 2;
+
+ UEV_PARSE("log2_diff_max_min_transform_block_size", value, ps_bitstrm);
+ ps_sps->i1_log2_diff_max_min_transform_block_size = value;
+
+ ps_sps->i1_log2_max_transform_block_size = ps_sps->i1_log2_min_transform_block_size +
+ ps_sps->i1_log2_diff_max_min_transform_block_size;
+
+ ps_sps->i1_log2_ctb_size = ps_sps->i1_log2_min_coding_block_size +
+ ps_sps->i1_log2_diff_max_min_coding_block_size;
+
+ if((ps_sps->i1_log2_min_coding_block_size < 3) ||
+ (ps_sps->i1_log2_min_transform_block_size < 2) ||
+ (ps_sps->i1_log2_diff_max_min_transform_block_size < 0) ||
+ (ps_sps->i1_log2_max_transform_block_size > ps_sps->i1_log2_ctb_size) ||
+ (ps_sps->i1_log2_ctb_size < 4) ||
+ (ps_sps->i1_log2_ctb_size > 6))
+ {
+ return IHEVCD_INVALID_PARAMETER;
+ }
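+
+    /* e.g. log2_min_coding_block_size_minus3 = 0 with
+     * log2_diff_max_min_coding_block_size = 3 gives i1_log2_ctb_size = 6,
+     * i.e. 64x64 CTBs, which satisfies the 4..6 range checked above. */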
+
+ ps_sps->i1_log2_min_pcm_coding_block_size = 0;
+ ps_sps->i1_log2_diff_max_min_pcm_coding_block_size = 0;
+
+ UEV_PARSE("max_transform_hierarchy_depth_inter", value, ps_bitstrm);
+ ps_sps->i1_max_transform_hierarchy_depth_inter = value;
+
+ UEV_PARSE("max_transform_hierarchy_depth_intra", value, ps_bitstrm);
+ ps_sps->i1_max_transform_hierarchy_depth_intra = value;
+
+ /* String has a d (enabled) in order to match with HM */
+ BITS_PARSE("scaling_list_enabled_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_scaling_list_enable_flag = value;
+
+ if(ps_sps->i1_scaling_list_enable_flag)
+ {
+ COPY_DEFAULT_SCALING_LIST(ps_sps->pi2_scaling_mat);
+ BITS_PARSE("sps_scaling_list_data_present_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_sps_scaling_list_data_present_flag = value;
+
+ if(ps_sps->i1_sps_scaling_list_data_present_flag)
+ ihevcd_scaling_list_data(ps_codec, ps_sps->pi2_scaling_mat);
+ }
+ else
+ {
+ COPY_FLAT_SCALING_LIST(ps_sps->pi2_scaling_mat);
+ }
+ /* String is asymmetric_motion_partitions_enabled_flag instead of amp_enabled_flag in order to match with HM */
+ BITS_PARSE("asymmetric_motion_partitions_enabled_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_amp_enabled_flag = value;
+
+ BITS_PARSE("sample_adaptive_offset_enabled_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_sample_adaptive_offset_enabled_flag = value;
+
+ BITS_PARSE("pcm_enabled_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_pcm_enabled_flag = value;
+
+ if(ps_sps->i1_pcm_enabled_flag)
+ {
+ BITS_PARSE("pcm_sample_bit_depth_luma", value, ps_bitstrm, 4);
+ ps_sps->i1_pcm_sample_bit_depth_luma = value + 1;
+
+ BITS_PARSE("pcm_sample_bit_depth_chroma", value, ps_bitstrm, 4);
+ ps_sps->i1_pcm_sample_bit_depth_chroma = value + 1;
+
+ UEV_PARSE("log2_min_pcm_coding_block_size_minus3", value, ps_bitstrm);
+ ps_sps->i1_log2_min_pcm_coding_block_size = value + 3;
+
+ UEV_PARSE("log2_diff_max_min_pcm_coding_block_size", value, ps_bitstrm);
+ ps_sps->i1_log2_diff_max_min_pcm_coding_block_size = value;
+ BITS_PARSE("pcm_loop_filter_disable_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_pcm_loop_filter_disable_flag = value;
+
+ }
+ UEV_PARSE("num_short_term_ref_pic_sets", value, ps_bitstrm);
+ ps_sps->i1_num_short_term_ref_pic_sets = value;
+
+ ps_sps->i1_num_short_term_ref_pic_sets = CLIP3(ps_sps->i1_num_short_term_ref_pic_sets, 0, MAX_STREF_PICS_SPS);
+
+ for(i = 0; i < ps_sps->i1_num_short_term_ref_pic_sets; i++)
+ ihevcd_short_term_ref_pic_set(ps_bitstrm, &ps_sps->as_stref_picset[0], ps_sps->i1_num_short_term_ref_pic_sets, i, &ps_sps->as_stref_picset[i]);
+
+ BITS_PARSE("long_term_ref_pics_present_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_long_term_ref_pics_present_flag = value;
+
+ if(ps_sps->i1_long_term_ref_pics_present_flag)
+ {
+ UEV_PARSE("num_long_term_ref_pics_sps", value, ps_bitstrm);
+ ps_sps->i1_num_long_term_ref_pics_sps = value;
+
+ for(i = 0; i < ps_sps->i1_num_long_term_ref_pics_sps; i++)
+ {
+ BITS_PARSE("lt_ref_pic_poc_lsb_sps[ i ]", value, ps_bitstrm, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+ ps_sps->ai1_lt_ref_pic_poc_lsb_sps[i] = value;
+
+ BITS_PARSE("used_by_curr_pic_lt_sps_flag[ i ]", value, ps_bitstrm, 1);
+ ps_sps->ai1_used_by_curr_pic_lt_sps_flag[i] = value;
+ }
+ }
+
+ BITS_PARSE("sps_temporal_mvp_enable_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_sps_temporal_mvp_enable_flag = value;
+
+ /* Print matches HM 8-2 */
+ BITS_PARSE("sps_strong_intra_smoothing_enable_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_strong_intra_smoothing_enable_flag = value;
+
+ BITS_PARSE("vui_parameters_present_flag", value, ps_bitstrm, 1);
+ ps_sps->i1_vui_parameters_present_flag = value;
+
+ if(ps_sps->i1_vui_parameters_present_flag)
+ ihevcd_parse_vui_parameters(ps_bitstrm,
+ &ps_sps->s_vui_parameters,
+ ps_sps->i1_sps_max_sub_layers - 1);
+
+ BITS_PARSE("sps_extension_flag", value, ps_bitstrm, 1);
+
+
+ {
+ WORD32 numerator;
+ WORD32 ceil_offset;
+
+ ceil_offset = (1 << ps_sps->i1_log2_ctb_size) - 1;
+ numerator = ps_sps->i2_pic_width_in_luma_samples;
+
+ ps_sps->i2_pic_wd_in_ctb = ((numerator + ceil_offset) /
+ (1 << ps_sps->i1_log2_ctb_size));
+
+ numerator = ps_sps->i2_pic_height_in_luma_samples;
+ ps_sps->i2_pic_ht_in_ctb = ((numerator + ceil_offset) /
+ (1 << ps_sps->i1_log2_ctb_size));
+
+ ps_sps->i4_pic_size_in_ctb = ps_sps->i2_pic_ht_in_ctb *
+ ps_sps->i2_pic_wd_in_ctb;
+
+ if(0 == ps_codec->i4_sps_done)
+ ps_codec->s_parse.i4_next_ctb_indx = ps_sps->i4_pic_size_in_ctb;
+
+ numerator = ps_sps->i2_pic_width_in_luma_samples;
+ ps_sps->i2_pic_wd_in_min_cb = numerator /
+ (1 << ps_sps->i1_log2_min_coding_block_size);
+
+ numerator = ps_sps->i2_pic_height_in_luma_samples;
+ ps_sps->i2_pic_ht_in_min_cb = numerator /
+ (1 << ps_sps->i1_log2_min_coding_block_size);
+ }
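+
+    /* e.g. a 1920x1080 stream with 64x64 CTBs gives
+     * i2_pic_wd_in_ctb = (1920 + 63) / 64 = 30 and
+     * i2_pic_ht_in_ctb = (1080 + 63) / 64 = 17, i.e. 510 CTBs per picture. */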
+ if((0 != ps_codec->i4_first_pic_done) &&
+ ((ps_codec->i4_wd != ps_sps->i2_pic_width_in_luma_samples) ||
+ (ps_codec->i4_ht != ps_sps->i2_pic_height_in_luma_samples)))
+ {
+ ps_codec->i4_reset_flag = 1;
+ ps_codec->i4_error_code = IVD_RES_CHANGED;
+ return (IHEVCD_ERROR_T)IHEVCD_FAIL;
+ }
+
+ /* Update display width and display height */
+ {
+ WORD32 disp_wd, disp_ht;
+ WORD32 crop_unit_x, crop_unit_y;
+ crop_unit_x = 1;
+ crop_unit_y = 1;
+
+ if(CHROMA_FMT_IDC_YUV420 == ps_sps->i1_chroma_format_idc)
+ {
+ crop_unit_x = 2;
+ crop_unit_y = 2;
+ }
+
+ disp_wd = ps_sps->i2_pic_width_in_luma_samples;
+ disp_wd -= ps_sps->i2_pic_crop_left_offset * crop_unit_x;
+ disp_wd -= ps_sps->i2_pic_crop_right_offset * crop_unit_x;
+
+
+ disp_ht = ps_sps->i2_pic_height_in_luma_samples;
+ disp_ht -= ps_sps->i2_pic_crop_top_offset * crop_unit_y;
+ disp_ht -= ps_sps->i2_pic_crop_bottom_offset * crop_unit_y;
+
+ if((0 >= disp_wd) || (0 >= disp_ht))
+ return IHEVCD_INVALID_PARAMETER;
+
+ ps_codec->i4_disp_wd = disp_wd;
+ ps_codec->i4_disp_ht = disp_ht;
+
+
+ ps_codec->i4_wd = ps_sps->i2_pic_width_in_luma_samples;
+ ps_codec->i4_ht = ps_sps->i2_pic_height_in_luma_samples;
+
+ {
+ WORD32 ref_strd;
+ ref_strd = ALIGN32(ps_sps->i2_pic_width_in_luma_samples + PAD_WD);
+ if(ps_codec->i4_strd < ref_strd)
+ {
+ ps_codec->i4_strd = ref_strd;
+ }
+ }
+
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ if(ps_codec->i4_disp_strd < ps_codec->i4_disp_wd)
+ {
+ ps_codec->i4_disp_strd = ps_codec->i4_disp_wd;
+ }
+ }
+ else
+ {
+ if(ps_codec->i4_disp_strd < ps_codec->i4_strd)
+ {
+ ps_codec->i4_disp_strd = ps_codec->i4_strd;
+ }
+ }
+ }
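+
+    /* e.g. 1080p content is commonly coded as 1920x1088 with
+     * pic_crop_bottom_offset = 4; with 4:2:0 crop units of 2 this gives
+     * disp_ht = 1088 - 4 * 2 = 1080 while i4_ht remains 1088. */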
+
+ ps_codec->i4_sps_done = 1;
+ return ret;
+}
+
+
+void ihevcd_unmark_pps(codec_t *ps_codec, WORD32 sps_id)
+{
+ WORD32 pps_id = 0;
+ pps_t *ps_pps = ps_codec->ps_pps_base;
+
+ for(pps_id = 0; pps_id < MAX_PPS_CNT - 1; pps_id++, ps_pps++)
+ {
+ if((ps_pps->i1_pps_valid) &&
+ (ps_pps->i1_sps_id == sps_id))
+ ps_pps->i1_pps_valid = 0;
+ }
+}
+
+
+void ihevcd_copy_sps(codec_t *ps_codec, WORD32 sps_id, WORD32 sps_id_ref)
+{
+ sps_t *ps_sps, *ps_sps_ref;
+ WORD16 *pi2_scaling_mat_backup;
+ WORD32 scaling_mat_size;
+
+ SCALING_MAT_SIZE(scaling_mat_size);
+ ps_sps_ref = ps_codec->ps_sps_base + sps_id_ref;
+ ps_sps = ps_codec->ps_sps_base + sps_id;
+
+ if(ps_sps->i1_sps_valid)
+ {
+ if((ps_sps->i1_log2_ctb_size != ps_sps_ref->i1_log2_ctb_size) ||
+ (ps_sps->i2_pic_wd_in_ctb != ps_sps_ref->i2_pic_wd_in_ctb) ||
+ (ps_sps->i2_pic_ht_in_ctb != ps_sps_ref->i2_pic_ht_in_ctb))
+ {
+ ihevcd_unmark_pps(ps_codec, sps_id);
+ }
+ }
+
+ pi2_scaling_mat_backup = ps_sps->pi2_scaling_mat;
+
+ memcpy(ps_sps, ps_sps_ref, sizeof(sps_t));
+ ps_sps->pi2_scaling_mat = pi2_scaling_mat_backup;
+ memcpy(ps_sps->pi2_scaling_mat, ps_sps_ref->pi2_scaling_mat, scaling_mat_size * sizeof(WORD16));
+ ps_sps->i1_sps_valid = 1;
+
+ ps_codec->s_parse.ps_sps = ps_sps;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses PPS (Picture Parameter Set)
+*
+* @par Description:
+* Parse Picture Parameter Set as per Section: 7.3.2.3
+* The pps is written to a temporary buffer and copied later to the
+* appropriate location
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_pps(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ WORD32 pps_id;
+
+ pps_t *ps_pps;
+ sps_t *ps_sps;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+
+
+ if(0 == ps_codec->i4_sps_done)
+ return IHEVCD_INVALID_HEADER;
+
+ UEV_PARSE("pic_parameter_set_id", value, ps_bitstrm);
+
+ pps_id = value;
+ if((pps_id >= MAX_PPS_CNT) || (pps_id < 0))
+ {
+ if(ps_codec->i4_pps_done)
+ return IHEVCD_UNSUPPORTED_PPS_ID;
+ else
+ pps_id = 0;
+ }
+
+
+ ps_pps = (ps_codec->s_parse.ps_pps_base + MAX_PPS_CNT - 1);
+
+ ps_pps->i1_pps_id = pps_id;
+
+ UEV_PARSE("seq_parameter_set_id", value, ps_bitstrm);
+ ps_pps->i1_sps_id = value;
+ ps_pps->i1_sps_id = CLIP3(ps_pps->i1_sps_id, 0, MAX_SPS_CNT - 2);
+
+ ps_sps = (ps_codec->s_parse.ps_sps_base + ps_pps->i1_sps_id);
+
+    /* If the SPS being referred to has not been parsed, return an error
+     * (the fallback of copying an existing SPS, commented out below, is
+     * disabled) */
+ if(0 == ps_sps->i1_sps_valid)
+ {
+ return IHEVCD_INVALID_HEADER;
+
+/*
+ sps_t *ps_sps_ref = ps_codec->ps_sps_base;
+ while(0 == ps_sps_ref->i1_sps_valid)
+ ps_sps_ref++;
+ ihevcd_copy_sps(ps_codec, ps_pps->i1_sps_id, ps_sps_ref->i1_sps_id);
+*/
+ }
+
+ BITS_PARSE("dependent_slices_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_dependent_slice_enabled_flag = value;
+
+ BITS_PARSE("output_flag_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_output_flag_present_flag = value;
+
+ BITS_PARSE("num_extra_slice_header_bits", value, ps_bitstrm, 3);
+ ps_pps->i1_num_extra_slice_header_bits = value;
+
+
+ BITS_PARSE("sign_data_hiding_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_sign_data_hiding_flag = value;
+
+ BITS_PARSE("cabac_init_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_cabac_init_present_flag = value;
+
+ UEV_PARSE("num_ref_idx_l0_default_active_minus1", value, ps_bitstrm);
+ ps_pps->i1_num_ref_idx_l0_default_active = value + 1;
+
+ UEV_PARSE("num_ref_idx_l1_default_active_minus1", value, ps_bitstrm);
+ ps_pps->i1_num_ref_idx_l1_default_active = value + 1;
+
+ SEV_PARSE("pic_init_qp_minus26", value, ps_bitstrm);
+ ps_pps->i1_pic_init_qp = value + 26;
+
+ BITS_PARSE("constrained_intra_pred_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_constrained_intra_pred_flag = value;
+
+ BITS_PARSE("transform_skip_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_transform_skip_enabled_flag = value;
+
+ BITS_PARSE("cu_qp_delta_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_cu_qp_delta_enabled_flag = value;
+
+ if(ps_pps->i1_cu_qp_delta_enabled_flag)
+ {
+ UEV_PARSE("diff_cu_qp_delta_depth", value, ps_bitstrm);
+ ps_pps->i1_diff_cu_qp_delta_depth = value;
+ }
+ else
+ {
+ ps_pps->i1_diff_cu_qp_delta_depth = 0;
+ }
+ ps_pps->i1_log2_min_cu_qp_delta_size = ps_sps->i1_log2_ctb_size - ps_pps->i1_diff_cu_qp_delta_depth;
+ /* Print different */
+ SEV_PARSE("cb_qp_offset", value, ps_bitstrm);
+ ps_pps->i1_pic_cb_qp_offset = value;
+
+ /* Print different */
+ SEV_PARSE("cr_qp_offset", value, ps_bitstrm);
+ ps_pps->i1_pic_cr_qp_offset = value;
+
+ /* Print different */
+ BITS_PARSE("slicelevel_chroma_qp_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_pic_slice_level_chroma_qp_offsets_present_flag = value;
+
+ BITS_PARSE("weighted_pred_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_weighted_pred_flag = value;
+
+ BITS_PARSE("weighted_bipred_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_weighted_bipred_flag = value;
+
+ BITS_PARSE("transquant_bypass_enable_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_transquant_bypass_enable_flag = value;
+
+ BITS_PARSE("tiles_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_tiles_enabled_flag = value;
+
+ BITS_PARSE("entropy_coding_sync_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_entropy_coding_sync_enabled_flag = value;
+
+ ps_pps->i1_loop_filter_across_tiles_enabled_flag = 0;
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ UEV_PARSE("num_tile_columns_minus1", value, ps_bitstrm);
+ ps_pps->i1_num_tile_columns = value + 1;
+
+ UEV_PARSE("num_tile_rows_minus1", value, ps_bitstrm);
+ ps_pps->i1_num_tile_rows = value + 1;
+
+ if((ps_pps->i1_num_tile_columns < 1) ||
+ (ps_pps->i1_num_tile_columns > ps_sps->i2_pic_wd_in_ctb) ||
+ (ps_pps->i1_num_tile_rows < 1) ||
+ (ps_pps->i1_num_tile_rows > ps_sps->i2_pic_ht_in_ctb))
+ return IHEVCD_INVALID_HEADER;
+
+ BITS_PARSE("uniform_spacing_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_uniform_spacing_flag = value;
+
+
+ {
+
+ WORD32 start;
+ WORD32 i, j;
+
+
+ start = 0;
+ for(i = 0; i < ps_pps->i1_num_tile_columns; i++)
+ {
+ tile_t *ps_tile;
+ if(!ps_pps->i1_uniform_spacing_flag)
+ {
+ if(i < (ps_pps->i1_num_tile_columns - 1))
+ {
+ UEV_PARSE("column_width_minus1[ i ]", value, ps_bitstrm);
+ value += 1;
+ }
+ else
+ {
+ value = ps_sps->i2_pic_wd_in_ctb - start;
+ }
+ }
+ else
+ {
+ value = ((i + 1) * ps_sps->i2_pic_wd_in_ctb) / ps_pps->i1_num_tile_columns -
+ (i * ps_sps->i2_pic_wd_in_ctb) / ps_pps->i1_num_tile_columns;
+ }
+
+ for(j = 0; j < ps_pps->i1_num_tile_rows; j++)
+ {
+ ps_tile = ps_pps->ps_tile + j * ps_pps->i1_num_tile_columns + i;
+ ps_tile->u1_pos_x = start;
+ ps_tile->u2_wd = value;
+ }
+ start += value;
+
+ if((start > ps_sps->i2_pic_wd_in_ctb) ||
+ (value <= 0))
+ return IHEVCD_INVALID_HEADER;
+ }
+
+ start = 0;
+ for(i = 0; i < (ps_pps->i1_num_tile_rows); i++)
+ {
+ tile_t *ps_tile;
+ if(!ps_pps->i1_uniform_spacing_flag)
+ {
+ if(i < (ps_pps->i1_num_tile_rows - 1))
+ {
+
+ UEV_PARSE("row_height_minus1[ i ]", value, ps_bitstrm);
+ value += 1;
+ }
+ else
+ {
+ value = ps_sps->i2_pic_ht_in_ctb - start;
+ }
+ }
+ else
+ {
+ value = ((i + 1) * ps_sps->i2_pic_ht_in_ctb) / ps_pps->i1_num_tile_rows -
+ (i * ps_sps->i2_pic_ht_in_ctb) / ps_pps->i1_num_tile_rows;
+ }
+
+ for(j = 0; j < ps_pps->i1_num_tile_columns; j++)
+ {
+ ps_tile = ps_pps->ps_tile + i * ps_pps->i1_num_tile_columns + j;
+ ps_tile->u1_pos_y = start;
+ ps_tile->u2_ht = value;
+ }
+ start += value;
+
+ if((start > ps_sps->i2_pic_ht_in_ctb) ||
+ (value <= 0))
+ return IHEVCD_INVALID_HEADER;
+ }
+ }
+
+
+ BITS_PARSE("loop_filter_across_tiles_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_loop_filter_across_tiles_enabled_flag = value;
+
+ }
+ else
+ {
+ /* If tiles are not present, set first tile in each PPS to have tile
+ width and height equal to picture width and height */
+ ps_pps->i1_num_tile_columns = 1;
+ ps_pps->i1_num_tile_rows = 1;
+ ps_pps->i1_uniform_spacing_flag = 1;
+
+ ps_pps->ps_tile->u1_pos_x = 0;
+ ps_pps->ps_tile->u1_pos_y = 0;
+ ps_pps->ps_tile->u2_wd = ps_sps->i2_pic_wd_in_ctb;
+ ps_pps->ps_tile->u2_ht = ps_sps->i2_pic_ht_in_ctb;
+ }
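+
+    /* With uniform_spacing_flag = 1 the widths above follow
+     * ((i + 1) * W) / N - (i * W) / N; e.g. W = 30 CTB columns split into
+     * N = 4 tile columns gives widths 7, 8, 7, 8, and row heights are
+     * derived the same way. */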
+
+ BITS_PARSE("loop_filter_across_slices_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_loop_filter_across_slices_enabled_flag = value;
+
+ BITS_PARSE("deblocking_filter_control_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_deblocking_filter_control_present_flag = value;
+
+ /* Default values */
+ ps_pps->i1_pic_disable_deblocking_filter_flag = 0;
+ ps_pps->i1_deblocking_filter_override_enabled_flag = 0;
+ ps_pps->i1_beta_offset_div2 = 0;
+ ps_pps->i1_tc_offset_div2 = 0;
+
+ if(ps_pps->i1_deblocking_filter_control_present_flag)
+ {
+
+ BITS_PARSE("deblocking_filter_override_enabled_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_deblocking_filter_override_enabled_flag = value;
+
+ BITS_PARSE("pic_disable_deblocking_filter_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_pic_disable_deblocking_filter_flag = value;
+
+ if(!ps_pps->i1_pic_disable_deblocking_filter_flag)
+ {
+
+ SEV_PARSE("pps_beta_offset_div2", value, ps_bitstrm);
+ ps_pps->i1_beta_offset_div2 = value;
+
+ SEV_PARSE("pps_tc_offset_div2", value, ps_bitstrm);
+ ps_pps->i1_tc_offset_div2 = value;
+
+ }
+ }
+
+ BITS_PARSE("pps_scaling_list_data_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_pps_scaling_list_data_present_flag = value;
+
+ if(ps_pps->i1_pps_scaling_list_data_present_flag)
+ {
+ COPY_DEFAULT_SCALING_LIST(ps_pps->pi2_scaling_mat);
+ ihevcd_scaling_list_data(ps_codec, ps_pps->pi2_scaling_mat);
+ }
+
+ BITS_PARSE("lists_modification_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_lists_modification_present_flag = value;
+ UEV_PARSE("log2_parallel_merge_level_minus2", value, ps_bitstrm);
+ ps_pps->i1_log2_parallel_merge_level = value + 2;
+
+ BITS_PARSE("slice_header_extension_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_slice_header_extension_present_flag = value;
+ /* Not present in HM */
+#if 0
+ BITS_PARSE("slice_extension_present_flag", value, ps_bitstrm, 1);
+ ps_pps->i1_slice_extension_present_flag = value;
+#endif
+ BITS_PARSE("pps_extension_flag", value, ps_bitstrm, 1);
+
+ ps_codec->i4_pps_done = 1;
+ return ret;
+}
+
+
+void ihevcd_copy_pps(codec_t *ps_codec, WORD32 pps_id, WORD32 pps_id_ref)
+{
+ pps_t *ps_pps, *ps_pps_ref;
+ WORD16 *pi2_scaling_mat_backup;
+ WORD32 scaling_mat_size;
+ tile_t *ps_tile_backup;
+ WORD32 max_tile_cols, max_tile_rows;
+
+ SCALING_MAT_SIZE(scaling_mat_size);
+ max_tile_cols = (ps_codec->i4_max_wd + MIN_TILE_WD - 1) / MIN_TILE_WD;
+ max_tile_rows = (ps_codec->i4_max_ht + MIN_TILE_HT - 1) / MIN_TILE_HT;
+
+ ps_pps_ref = ps_codec->ps_pps_base + pps_id_ref;
+ ps_pps = ps_codec->ps_pps_base + pps_id;
+
+ pi2_scaling_mat_backup = ps_pps->pi2_scaling_mat;
+ ps_tile_backup = ps_pps->ps_tile;
+
+ memcpy(ps_pps, ps_pps_ref, sizeof(pps_t));
+ ps_pps->pi2_scaling_mat = pi2_scaling_mat_backup;
+ ps_pps->ps_tile = ps_tile_backup;
+ memcpy(ps_pps->pi2_scaling_mat, ps_pps_ref->pi2_scaling_mat, scaling_mat_size * sizeof(WORD16));
+ memcpy(ps_pps->ps_tile, ps_pps_ref->ps_tile, max_tile_cols * max_tile_rows * sizeof(tile_t));
+
+ ps_pps->i1_pps_valid = 1;
+
+ ps_codec->s_parse.ps_pps = ps_pps;
+}
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses SEI (Supplemental Enhancement Information)
+*
+* @par Description:
+* Parses SEI (Supplemental Enhancement Information) as per Section: 7.3.7
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_sei(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ UNUSED(ps_codec);
+#if 0
+
+ sei_message( )
+ {
+ payloadType = 0
+ while( next_bits(8) == 0xFF )
+ {
+ ff_byte /* equal to 0xFF */
+ payloadType += 255
+ }
+
+        BITS_PARSE("last_payload_type_byte", value, ps_bitstrm, 8);
+ ps_sei->i1_last_payload_type_byte = value;
+
+ payloadType += last_payload_type_byte
+ payloadSize = 0
+ while(next_bits(8) == 0xFF)
+ {
+ ff_byte /* equal to 0xFF */
+ payloadSize += 255
+ }
+
+        BITS_PARSE("last_payload_size_byte", value, ps_bitstrm, 8);
+ ps_sei->i1_last_payload_size_byte = value;
+
+ payloadSize += last_payload_size_byte
+ sei_payload( payloadType, payloadSize )
+ }
+
+#endif
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses Access unit delimiter
+*
+* @par Description:
+* Parses Access unit delimiter as per Section: 7.3.2.5
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_parse_aud(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ UNUSED(ps_codec);
+#if 0
+
+ access_unit_delimiter_rbsp( )
+ {
+
+ BITS_PARSE("pic_type", value, ps_bitstrm, 3);
+ ps_sei->i1_pic_type = value;
+
+ rbsp_trailing_bits( )
+ }
+
+
+#endif
+ return ret;
+}
+
+WORD32 ihevcd_extend_sign_bit(WORD32 value, WORD32 num_bits)
+{
+ WORD32 ret_value = value;
+ if(value >> (num_bits - 1))
+ {
+ ret_value |= (0xFFFFFFFF << num_bits);
+ }
+ return ret_value;
+}
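+
+/* Usage sketch (illustrative values): with num_bits = 5,
+ * ihevcd_extend_sign_bit(0x1F, 5) sees bit 4 set and returns
+ * 0x1F | 0xFFFFFFE0 = -1, while ihevcd_extend_sign_bit(0x0F, 5)
+ * returns 0x0F unchanged. */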
+
+/**
+*******************************************************************************
+*
+* @brief
+* Calculate POC of the current slice
+*
+* @par Description:
+* Calculates the current POC using the previous POC lsb and previous POC msb
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @param[in] i1_pic_order_cnt_lsb
+* Current POC lsb
+*
+* @returns Current absolute POC
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_calc_poc(codec_t *ps_codec, nal_header_t *ps_nal, WORD8 i1_log2_max_poc_lsb, WORD32 i2_poc_lsb)
+{
+ WORD32 i4_abs_poc, i4_poc_msb;
+ WORD32 max_poc_lsb;
+ WORD8 i1_nal_unit_type = ps_nal->i1_nal_unit_type;
+ max_poc_lsb = (1 << i1_log2_max_poc_lsb);
+
+ if((!ps_codec->i4_first_pic_done) && (!ps_codec->i4_pic_present))
+ ps_codec->i4_prev_poc_msb = -2 * max_poc_lsb;
+
+ if(NAL_IDR_N_LP == i1_nal_unit_type
+ || NAL_IDR_W_LP == i1_nal_unit_type
+ || NAL_BLA_N_LP == i1_nal_unit_type
+ || NAL_BLA_W_DLP == i1_nal_unit_type
+ || NAL_BLA_W_LP == i1_nal_unit_type
+ || (NAL_CRA == i1_nal_unit_type && !ps_codec->i4_first_pic_done))
+ {
+ i4_poc_msb = ps_codec->i4_prev_poc_msb + 2 * max_poc_lsb;
+ ps_codec->i4_prev_poc_lsb = 0;
+ ps_codec->i4_max_prev_poc_lsb = 0;
+// ps_codec->i4_prev_poc_msb = 0;
+ }
+ else
+ {
+
+ if((i2_poc_lsb < ps_codec->i4_prev_poc_lsb)
+ && ((ps_codec->i4_prev_poc_lsb - i2_poc_lsb) >= max_poc_lsb / 2))
+ {
+ i4_poc_msb = ps_codec->i4_prev_poc_msb + max_poc_lsb;
+ }
+ else if((i2_poc_lsb > ps_codec->i4_prev_poc_lsb)
+ && ((i2_poc_lsb - ps_codec->i4_prev_poc_lsb) > max_poc_lsb / 2))
+ {
+ i4_poc_msb = ps_codec->i4_prev_poc_msb - max_poc_lsb;
+ }
+ else
+ {
+ i4_poc_msb = ps_codec->i4_prev_poc_msb;
+ }
+
+
+ }
+
+ i4_abs_poc = i4_poc_msb + i2_poc_lsb;
+ ps_codec->i4_max_prev_poc_lsb = MAX(ps_codec->i4_max_prev_poc_lsb, i2_poc_lsb);
+
+ {
+ WORD32 is_reference_nal = ((i1_nal_unit_type <= NAL_RSV_VCL_R15) && (i1_nal_unit_type % 2 != 0)) || ((i1_nal_unit_type >= NAL_BLA_W_LP) && (i1_nal_unit_type <= NAL_RSV_RAP_VCL23));
+ WORD32 update_prev_poc = ((is_reference_nal) && ((i1_nal_unit_type < NAL_RADL_N) || (i1_nal_unit_type > NAL_RASL_R)));
+
+ if((0 == ps_nal->i1_nuh_temporal_id) &&
+ (update_prev_poc))
+ {
+ ps_codec->i4_prev_poc_lsb = i2_poc_lsb;
+ ps_codec->i4_prev_poc_msb = i4_poc_msb;
+ }
+ }
+
+ return i4_abs_poc;
+}
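+
+/* Worked example of the POC derivation above (illustrative values): with
+ * i1_log2_max_poc_lsb = 8, max_poc_lsb = 256. If the previous picture had
+ * poc_lsb = 250 with poc_msb = 0 and the current slice has i2_poc_lsb = 2,
+ * (250 - 2) >= 128 detects an lsb wrap-around, so i4_poc_msb becomes
+ * 0 + 256 and the absolute POC is 256 + 2 = 258; an lsb jump from 2 back
+ * to 250 would instead subtract max_poc_lsb. */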
+
+
+void ihevcd_copy_slice_hdr(codec_t *ps_codec, WORD32 slice_idx, WORD32 slice_idx_ref)
+{
+ slice_header_t *ps_slice_hdr, *ps_slice_hdr_ref;
+ WORD32 *pu4_entry_offset_backup;
+
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + slice_idx;
+ ps_slice_hdr_ref = ps_codec->s_parse.ps_slice_hdr_base + slice_idx_ref;
+
+ pu4_entry_offset_backup = ps_slice_hdr->pu4_entry_point_offset;
+ memcpy(ps_slice_hdr, ps_slice_hdr_ref, sizeof(slice_header_t));
+ ps_slice_hdr->pu4_entry_point_offset = pu4_entry_offset_backup;
+}
+
+
+
diff --git a/decoder/ihevcd_parse_headers.h b/decoder/ihevcd_parse_headers.h
new file mode 100644
index 0000000..2139f64
--- /dev/null
+++ b/decoder/ihevcd_parse_headers.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_headers.h
+*
+* @brief
+* Parsing of various headers like VPS, SPS, PPS etc
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_HEADERS_H_
+#define _IHEVCD_PARSE_HEADERS_H_
+
+void ihevcd_copy_sps(codec_t *ps_codec, WORD32 sps_id, WORD32 sps_id_ref);
+void ihevcd_copy_pps(codec_t *ps_codec, WORD32 pps_id, WORD32 pps_id_ref);
+void ihevcd_copy_slice_hdr(codec_t *ps_codec, WORD32 slice_idx, WORD32 slice_idx_ref);
+
+IHEVCD_ERROR_T ihevcd_parse_vps(codec_t *ps_codec);
+IHEVCD_ERROR_T ihevcd_parse_sps(codec_t *ps_codec);
+IHEVCD_ERROR_T ihevcd_parse_pps(codec_t *ps_codec);
+IHEVCD_ERROR_T ihevcd_parse_slice_header(codec_t *ps_codec,
+ nal_header_t *ps_nal);
+
+#endif /* _IHEVCD_PARSE_HEADERS_H_ */
diff --git a/decoder/ihevcd_parse_residual.c b/decoder/ihevcd_parse_residual.c
new file mode 100644
index 0000000..fc84fa3
--- /dev/null
+++ b/decoder/ihevcd_parse_residual.c
@@ -0,0 +1,905 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_residual.c
+*
+* @brief
+* Contains functions for parsing residual data at TU level
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_error.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_parse_residual.h"
+#include "ihevcd_cabac.h"
+
+/**
+ *****************************************************************************
+ * @brief returns context increment for sig coeff based on csbf neighbour
+ * flags (bottom and right) and current coeff position in 4x4 block
+ * See section 9.3.3.1.4 for details on this context increment
+ *
+ * input : neighbour csbf flags (bit0: right csbf, bit1: bottom csbf)
+ * coeff idx in raster order (0-15)
+ *
+ * output : context increment for sig coeff flag
+ *
+ *****************************************************************************
+ */
+const UWORD8 gau1_ihevcd_sigcoeff_ctxtinc[3][4][16] =
+{
+
+ {
+ /* nbr csbf = 0: sigCtx = (xP+yP == 0) ? 2 : (xP+yP < 3) ? 1: 0 */
+ { 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ /* nbr csbf = 1: sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1: 0 */
+ { 2, 1, 2, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0 },
+ /* nbr csbf = 2: sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1: 0 */
+ { 2, 2, 1, 2, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0 },
+ /* nbr csbf = 3: sigCtx = 2 */
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ {
+ /* nbr csbf = 0: sigCtx = (xP+yP == 0) ? 2 : (xP+yP < 3) ? 1: 0 */
+ { 2, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+ /* nbr csbf = 1: sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1: 0 */
+ { 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+ /* nbr csbf = 2: sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1: 0 */
+ { 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0 },
+ /* nbr csbf = 3: sigCtx = 2 */
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ {
+ /* nbr csbf = 0: sigCtx = (xP+yP == 0) ? 2 : (xP+yP < 3) ? 1: 0 */
+ { 2, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
+ /* nbr csbf = 1: sigCtx = (yP == 0) ? 2 : (yP == 1) ? 1: 0 */
+ { 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0 },
+ /* nbr csbf = 2: sigCtx = (xP == 0) ? 2 : (xP == 1) ? 1: 0 */
+ { 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+ /* nbr csbf = 3: sigCtx = 2 */
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+
+
+};
+
+
+
+/**
+ *****************************************************************************
+ * @brief returns context increment for sig coeff for 4x4 transform size as
+ * per Table 9-39 in section 9.3.3.1.4
+ *
+ * input : coeff idx in raster order (0-15)
+ *
+ * output : context increment for sig coeff flag
+ *
+ *****************************************************************************
+ */
+const UWORD8 gau1_ihevcd_sigcoeff_ctxtinc_tr4[3][16] =
+{
+ /* Upright diagonal scan */
+ {
+ 0, 2, 1, 6,
+ 3, 4, 7, 6,
+ 4, 5, 7, 8,
+ 5, 8, 8, 8,
+ },
+ /* Horizontal scan */
+ {
+ 0, 1, 4, 5,
+ 2, 3, 4, 5,
+ 6, 6, 8, 8,
+ 7, 7, 8, 8,
+ },
+ /* Vertical scan */
+ {
+ 0, 2, 6, 7,
+ 1, 3, 6, 7,
+ 4, 4, 8, 8,
+ 5, 5, 8, 8,
+ },
+};
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses Residual coding
+*
+* @par Description:
+* Parses Residual coding as per Section: 7.3.13
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_parse_residual_coding(codec_t *ps_codec,
+ WORD32 x0, WORD32 y0,
+ WORD32 log2_trafo_size,
+ WORD32 c_idx,
+ WORD32 intra_pred_mode)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 transform_skip_flag;
+ WORD32 value;
+ pps_t *ps_pps;
+ WORD32 last_scan_pos, last_sub_blk;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 last_significant_coeff_x_prefix, last_significant_coeff_y_prefix;
+ WORD32 last_significant_coeff_x, last_significant_coeff_y;
+ const UWORD8 *pu1_scan_blk, *pu1_scan_coeff;
+ WORD32 scan_idx;
+ WORD32 i;
+ WORD32 sign_data_hiding_flag;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+ WORD32 gt1_ctxt = 1;
+ WORD32 c_max;
+ UWORD16 au2_csbf[9];
+ tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+ WORD8 *pi1_num_coded_subblks;
+ WORD32 num_subblks;
+ WORD32 sig_coeff_base_ctxt, abs_gt1_base_ctxt;
+ UNUSED(x0);
+ UNUSED(y0);
+ ps_pps = ps_codec->s_parse.ps_pps;
+
+ sign_data_hiding_flag = ps_pps->i1_sign_data_hiding_flag;
+ transform_skip_flag = 0;
+ if(ps_pps->i1_transform_skip_enabled_flag &&
+ !ps_codec->s_parse.s_cu.i4_cu_transquant_bypass &&
+ (log2_trafo_size == 2))
+ {
+ WORD32 ctxt_idx;
+
+ if(!c_idx)
+ {
+ ctxt_idx = IHEVC_CAB_TFM_SKIP0;
+ }
+ else
+ {
+ ctxt_idx = IHEVC_CAB_TFM_SKIP12;
+ }
+ TRACE_CABAC_CTXT("transform_skip_flag", ps_cabac->u4_range, ctxt_idx);
+ value = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ ctxt_idx);
+ AEV_TRACE("transform_skip_flag", value, ps_cabac->u4_range);
+ transform_skip_flag = value;
+ }
+
+    /* decode the last_coeff_x_prefix as a truncated unary (tunary) binarized code */
+ {
+ WORD32 ctxt_idx_x, ctxt_idx_y, ctx_shift;
+ WORD32 ctx_offset;
+ c_max = (log2_trafo_size << 1) - 1;
+
+ if(!c_idx)
+ {
+ ctx_offset = (3 * (log2_trafo_size - 2)) + ((log2_trafo_size - 1) >> 2);
+ ctxt_idx_x = IHEVC_CAB_COEFFX_PREFIX + ctx_offset;
+ ctxt_idx_y = IHEVC_CAB_COEFFY_PREFIX + ctx_offset;
+ ctx_shift = (log2_trafo_size + 1) >> 2;
+ }
+ else
+ {
+ ctxt_idx_x = IHEVC_CAB_COEFFX_PREFIX + 15;
+ ctxt_idx_y = IHEVC_CAB_COEFFY_PREFIX + 15;
+ ctx_shift = log2_trafo_size - 2;
+ }
+
+ TRACE_CABAC_CTXT("last_coeff_x_prefix", ps_cabac->u4_range, ctxt_idx_x);
+ last_significant_coeff_x_prefix = ihevcd_cabac_decode_bins_tunary(ps_cabac,
+ ps_bitstrm,
+ c_max,
+ ctxt_idx_x,
+ ctx_shift,
+ c_max);
+
+ AEV_TRACE("last_coeff_x_prefix", last_significant_coeff_x_prefix, ps_cabac->u4_range);
+
+ TRACE_CABAC_CTXT("last_coeff_y_prefix", ps_cabac->u4_range, ctxt_idx_y);
+ last_significant_coeff_y_prefix = ihevcd_cabac_decode_bins_tunary(ps_cabac,
+ ps_bitstrm,
+ c_max,
+ ctxt_idx_y,
+ ctx_shift,
+ c_max);
+
+ AEV_TRACE("last_coeff_y_prefix", last_significant_coeff_y_prefix, ps_cabac->u4_range);
+
+
+ last_significant_coeff_x = last_significant_coeff_x_prefix;
+ if(last_significant_coeff_x_prefix > 3)
+ {
+ WORD32 suf_length = ((last_significant_coeff_x_prefix - 2) >> 1);
+
+ value = ihevcd_cabac_decode_bypass_bins(ps_cabac,
+ ps_bitstrm,
+ suf_length);
+
+ AEV_TRACE("last_coeff_x_suffix", value, ps_cabac->u4_range);
+
+
+ last_significant_coeff_x =
+ (1 << ((last_significant_coeff_x_prefix >> 1) - 1)) *
+ (2 + (last_significant_coeff_x_prefix & 1)) + value;
+ }
+
+
+ last_significant_coeff_y = last_significant_coeff_y_prefix;
+ if(last_significant_coeff_y_prefix > 3)
+ {
+ WORD32 suf_length = ((last_significant_coeff_y_prefix - 2) >> 1);
+ value = ihevcd_cabac_decode_bypass_bins(ps_cabac,
+ ps_bitstrm,
+ suf_length);
+
+ AEV_TRACE("last_coeff_y_suffix", value, ps_cabac->u4_range);
+ last_significant_coeff_y =
+ (1 << ((last_significant_coeff_y_prefix >> 1) - 1)) *
+ (2 + (last_significant_coeff_y_prefix & 1)) + value;
+ }
+
+ }
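+
+    /* e.g. a last_significant_coeff_x_prefix of 7 implies a 2-bit suffix
+     * ((7 - 2) >> 1) and reconstructs to (1 << ((7 >> 1) - 1)) * (2 + 1) +
+     * suffix = 12 + suffix, i.e. an x position in 12..15. */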
+
+ /* Choose a scan matrix based on intra flag, intra pred mode, transform size
+ and luma/chroma */
+ scan_idx = SCAN_DIAG_UPRIGHT;
+ if(PRED_MODE_INTRA == ps_codec->s_parse.s_cu.i4_pred_mode)
+ {
+ if((2 == log2_trafo_size) || ((3 == log2_trafo_size) && (0 == c_idx)))
+ {
+ if((6 <= intra_pred_mode) &&
+ (14 >= intra_pred_mode))
+ {
+ scan_idx = SCAN_VERT;
+ }
+ else if((22 <= intra_pred_mode) &&
+ (30 >= intra_pred_mode))
+ {
+ scan_idx = SCAN_HORZ;
+ }
+ }
+ }
+
+ /* In case the scan is vertical, then swap X and Y positions */
+ if(SCAN_VERT == scan_idx)
+ {
+ SWAP(last_significant_coeff_x, last_significant_coeff_y);
+ }
+
+ {
+ WORD8 *pi1_scan_idx;
+ WORD8 *pi1_buf = (WORD8 *)ps_codec->s_parse.pv_tu_coeff_data;
+
+ /* First WORD8 gives number of coded subblocks */
+ pi1_num_coded_subblks = pi1_buf++;
+
+ /* Set number of coded subblocks in the current TU to zero */
+ /* This will be updated later */
+ *pi1_num_coded_subblks = 0;
+
+ /* Second WORD8 gives (scan idx << 1) | trans_skip */
+ pi1_scan_idx = pi1_buf++;
+ *pi1_scan_idx = (scan_idx << 1) | transform_skip_flag;
+
+ /* Store the incremented pointer in pv_tu_coeff_data */
+ ps_codec->s_parse.pv_tu_coeff_data = pi1_buf;
+
+ }
+ /**
+ * Given last_significant_coeff_y and last_significant_coeff_x find last sub block
+ * This is done by ignoring lower two bits of last_significant_coeff_y and last_significant_coeff_x
+ * and using scan matrix for lookup
+ */
+
+ /* If transform is 4x4, last_sub_blk is zero */
+ last_sub_blk = 0;
+
+ /* If transform is larger than 4x4, then based on scan_idx and transform size, choose a scan table */
+
+ if(log2_trafo_size > 2)
+ {
+ WORD32 scan_pos;
+ WORD32 scan_mat_size;
+ pu1_scan_blk = (UWORD8 *)gapv_ihevc_scan[scan_idx * 3 + (log2_trafo_size - 2 - 1)];
+
+
+        /* Divide the current transform into 4x4 subblocks and count the number of 4x4 blocks in the first row */
+        /* This will be the size of the scan matrix used for subblock scanning */
+ scan_mat_size = 1 << (log2_trafo_size - 2);
+ scan_pos = ((last_significant_coeff_y >> 2) * scan_mat_size) +
+ (last_significant_coeff_x >> 2);
+
+ last_sub_blk = pu1_scan_blk[scan_pos];
+ }
+ pu1_scan_coeff = &gau1_ihevc_scan4x4[scan_idx][0];
+
+ {
+ WORD32 scan_pos;
+
+ scan_pos = ((last_significant_coeff_y & 3) << 2) +
+ (last_significant_coeff_x & 3);
+
+ last_scan_pos = pu1_scan_coeff[scan_pos];
+ }
+ pu1_scan_blk = (UWORD8 *)gapv_ihevc_invscan[scan_idx * 3 + (log2_trafo_size - 2 - 1)];
+ pu1_scan_coeff = &gau1_ihevc_invscan4x4[scan_idx][0];
+
+ /* Set CSBF array to zero */
+ {
+ UWORD32 *pu4_csbf;
+ pu4_csbf = (void *)au2_csbf;
+ *pu4_csbf++ = 0;
+ *pu4_csbf++ = 0;
+ *pu4_csbf++ = 0;
+ *pu4_csbf = 0;
+ /* To avoid a check for y pos, 9th WORD16 in the array is set to zero */
+ au2_csbf[8] = 0;
+ }
+
+ /*************************************************************************/
+ /* derive base context index for sig coeff as per section 9.3.3.1.4 */
+ /* TODO; convert to look up based on luma/chroma, scan type and tfr size */
+ /*************************************************************************/
+ if(!c_idx)
+ {
+ sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG;
+ abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG;
+
+ if(3 == log2_trafo_size)
+ {
+ /* 8x8 transform size */
+ sig_coeff_base_ctxt += (scan_idx == SCAN_DIAG_UPRIGHT) ? 9 : 15;
+ }
+ else if(3 < log2_trafo_size)
+ {
+ /* larger transform sizes */
+ sig_coeff_base_ctxt += 21;
+ }
+ }
+ else
+ {
+ /* chroma context initializations */
+ sig_coeff_base_ctxt = IHEVC_CAB_COEFF_FLAG + 27;
+ abs_gt1_base_ctxt = IHEVC_CAB_COEFABS_GRTR1_FLAG + 16;
+
+ if(3 == log2_trafo_size)
+ {
+ /* 8x8 transform size */
+ sig_coeff_base_ctxt += 9;
+ }
+ else if(3 < log2_trafo_size)
+ {
+ /* larger transform sizes */
+ sig_coeff_base_ctxt += 12;
+ }
+ }
+ num_subblks = 0;
+ /* Parse each 4x4 subblocks */
+ for(i = last_sub_blk; i >= 0; i--)
+ {
+ WORD32 sub_blk_pos;
+ WORD32 infer_sig_coeff_flag;
+ WORD32 cur_csbf;
+
+ WORD32 n;
+ WORD32 num_coeff;
+        /* Sig coeff map for 16 entries in raster scan order. Upper 16 bits are used.
+         * MSB gives sig coeff flag for 0th coeff and so on
+         * UWORD16 would have been enough, but it is kept as UWORD32 for code
+         * optimizations: on ARM, unnecessary masking operations are saved
+         */
+ UWORD32 u4_sig_coeff_map_raster;
+ WORD32 sign_hidden;
+
+ /* Sig coeff map in scan order */
+ UWORD32 u4_sig_coeff_map;
+ WORD32 coeff_abs_level_greater2_flag;
+ UWORD32 u4_coeff_abs_level_greater1_map;
+ UWORD32 u4_coeff_abs_level_greater2_map;
+ UWORD32 u4_coeff_sign_map;
+ WORD32 first_sig_scan_pos, last_sig_scan_pos, num_greater1_flag, first_greater1_scan_pos;
+ WORD32 num_sig_coeff, sum_abs_level;
+ WORD32 nbr_csbf;
+
+
+ WORD32 ctxt_set;
+ WORD32 rice_param;
+ WORD32 xs, ys;
+
+
+ sub_blk_pos = 0;
+ if(i && (log2_trafo_size > 2))
+ sub_blk_pos = pu1_scan_blk[i];
+
+ /* Get xs and ys from scan position */
+ /* This is needed for context modelling of significant coeff flag */
+ xs = sub_blk_pos & ((1 << (log2_trafo_size - 2)) - 1);
+ ys = sub_blk_pos >> (log2_trafo_size - 2);
+
+
+ /* Check if neighbor subblocks are coded */
+ {
+
+ nbr_csbf = 0;
+
+ /* Get Bottom sub blocks CSBF */
+ nbr_csbf |= (au2_csbf[ys + 1] >> xs) & 1;
+ nbr_csbf <<= 1;
+
+ /* Get Right sub blocks CSBF */
+            /* Even if xs is equal to (1 << (log2_trafo_size - 2)) - 1,
+               the neighbour's csbf will be read as 0, since au2_csbf
+               was set to zero at the beginning */
+
+ nbr_csbf |= (au2_csbf[ys] >> (xs + 1)) & 1;
+
+
+ }
+ cur_csbf = 0;
+
+        /* DC coeff is inferred only if coded_sub_block_flag is explicitly parsed as 1 */
+        /* i.e. it is not inferred for the first and last subblocks */
+ infer_sig_coeff_flag = 0;
+ if((i < last_sub_blk) && (i > 0))
+ {
+ WORD32 ctxt_idx = IHEVC_CAB_CODED_SUBLK_IDX;
+
+ /* ctxt based on right / bottom avail csbf, section 9.3.3.1.3 */
+ ctxt_idx += (nbr_csbf) ? 1 : 0;
+
+ /* Ctxt based on luma or chroma */
+ ctxt_idx += c_idx ? 2 : 0;
+ TRACE_CABAC_CTXT("coded_sub_block_flag", ps_cabac->u4_range, ctxt_idx);
+ IHEVCD_CABAC_DECODE_BIN(cur_csbf, ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("coded_sub_block_flag", cur_csbf, ps_cabac->u4_range);
+
+ infer_sig_coeff_flag = 1;
+ }
+ else /* if((i == last_sub_blk) || (sub_blk_pos == 0)) */
+ {
+ /* CSBF is set to 1 for first and last subblock */
+ /* Note for these subblocks sig_coeff_map is not inferred but instead parsed */
+ cur_csbf = 1;
+ }
+
+ /* Set current sub blocks CSBF */
+ {
+ UWORD32 u4_mask = 1 << xs;
+ if(cur_csbf)
+ au2_csbf[ys] |= u4_mask;
+ else
+ au2_csbf[ys] &= ~u4_mask;
+
+ }
+
+ /* If current subblock is not coded, proceed to the next subblock */
+ if(0 == cur_csbf)
+ continue;
+
+ n = 15;
+ u4_sig_coeff_map_raster = 0;
+ u4_sig_coeff_map = 0;
+ num_coeff = 0;
+ if(i == last_sub_blk)
+ {
+ WORD32 pos = ((last_significant_coeff_y & 3) << 2) +
+ (last_significant_coeff_x & 3);
+ n = (last_scan_pos - 1);
+ /* Set Significant coeff map for last significant coeff flag as 1 */
+ u4_sig_coeff_map_raster = 1 << pos;
+ u4_sig_coeff_map = 1 << last_scan_pos;
+ num_coeff = 1;
+ }
+
+ for(; n >= 0; n--)
+ {
+ WORD32 significant_coeff_flag;
+
+ if((n > 0 || !infer_sig_coeff_flag))
+ {
+ //WORD32 coeff_pos;
+ WORD32 sig_ctxinc;
+ WORD32 ctxt_idx;
+
+ /* Coefficient position is needed for deriving context index for significant_coeff_flag */
+ //coeff_pos = pu1_scan_coeff[n];
+ /* derive the context inc as per section 9.3.3.1.4 */
+ sig_ctxinc = 0;
+ if(2 == log2_trafo_size)
+ {
+
+ /* 4x4 transform size increment uses lookup */
+ sig_ctxinc = gau1_ihevcd_sigcoeff_ctxtinc_tr4[scan_idx][n];
+ }
+ else if(n || i)
+ {
+                    /* ctxt for AC coeff depends on the current position and neighbour csbf */
+ sig_ctxinc = gau1_ihevcd_sigcoeff_ctxtinc[scan_idx][nbr_csbf][n];
+
+ /* based on luma subblock pos */
+ sig_ctxinc += (i && (!c_idx)) ? 3 : 0;
+
+ }
+ else
+ {
+ /* DC coeff has fixed context for luma and chroma */
+ sig_coeff_base_ctxt = (0 == c_idx) ? IHEVC_CAB_COEFF_FLAG :
+ (IHEVC_CAB_COEFF_FLAG + 27);
+ }
+
+ ctxt_idx = sig_ctxinc + sig_coeff_base_ctxt;
+ TRACE_CABAC_CTXT("significant_coeff_flag", ps_cabac->u4_range, ctxt_idx);
+ IHEVCD_CABAC_DECODE_BIN(significant_coeff_flag, ps_cabac,
+ ps_bitstrm,
+ ctxt_idx);
+ AEV_TRACE("significant_coeff_flag", significant_coeff_flag, ps_cabac->u4_range);
+
+
+ /* If at least one non-zero coeff is signalled then do not infer sig coeff map */
+ /* for (0,0) coeff in the current sub block */
+ if(significant_coeff_flag)
+ infer_sig_coeff_flag = 0;
+
+// u4_sig_coeff_map_raster |= significant_coeff_flag
+// << coeff_pos;
+ u4_sig_coeff_map |= significant_coeff_flag << n;
+ num_coeff += significant_coeff_flag;
+ }
+
+
+ }
+ /*********************************************************************/
+ /* If infer_sig_coeff_flag is 1 then treat the 0th coeff as non zero */
+ /* If infer_sig_coeff_flag is zero, then last significant_coeff_flag */
+ /* is parsed in the above loop */
+ /*********************************************************************/
+ if(infer_sig_coeff_flag)
+ {
+ u4_sig_coeff_map_raster |= 1;
+ u4_sig_coeff_map |= 1;
+ num_coeff++;
+ }
+
+ /*********************************************************************/
+ /* First subblock does not get an explicit csbf. It is assumed to */
+ /* be 1. For this subblock there is chance of getting all */
+ /* sig_coeff_flags to be zero. In such a case proceed to the next */
+ /* subblock(which is end of parsing for the current transform block) */
+ /*********************************************************************/
+
+ if(0 == num_coeff)
+ continue;
+
+ /* Increment number of coded subblocks for the current TU */
+ num_subblks++;
+
+ /* Set sig coeff map and subblock position */
+ ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)ps_codec->s_parse.pv_tu_coeff_data;
+ ps_tu_sblk_coeff_data->u2_sig_coeff_map = u4_sig_coeff_map;
+ ps_tu_sblk_coeff_data->u2_subblk_pos = (ys << 8) | xs;
+
+ first_sig_scan_pos = 16;
+ last_sig_scan_pos = -1;
+ num_greater1_flag = 0;
+ first_greater1_scan_pos = -1;
+ u4_coeff_abs_level_greater1_map = 0;
+
+
+ /* context set based on luma subblock pos */
+ ctxt_set = (i && (!c_idx)) ? 2 : 0;
+
+ /* See section 9.3.3.1.5 */
+ ctxt_set += (0 == gt1_ctxt) ? 1 : 0;
+
+ gt1_ctxt = 1;
+ /* Instead of initializing n to 15, set it to 31-CLZ(sig coeff map) */
+ {
+ UWORD32 u4_sig_coeff_map_shift;
+ UWORD32 clz;
+ clz = CLZ(u4_sig_coeff_map);
+ n = 31 - clz;
+ u4_sig_coeff_map_shift = u4_sig_coeff_map << clz;
+            /* The for loop over n is changed to a do-while to break early when sig_coeff_map_shift becomes zero */
+ do
+ {
+ //WORD32 coeff_pos;
+ WORD32 ctxt_idx;
+
+ //TODO: Scan lookup will be removed later and instead u4_sig_coeff_map will be used
+ //coeff_pos = pu1_scan_coeff[n];
+
+ if((u4_sig_coeff_map_shift >> 31) & 1)
+ {
+
+                    /* abs_level_greater1_flag is sent only for the first 8 non-zero levels in a subblock */
+ if(num_greater1_flag < 8)
+ {
+ WORD32 coeff_abs_level_greater1_flag;
+
+ ctxt_idx = (ctxt_set * 4) + abs_gt1_base_ctxt + gt1_ctxt;
+
+ TRACE_CABAC_CTXT("coeff_abs_level_greater1_flag", ps_cabac->u4_range, ctxt_idx);
+ IHEVCD_CABAC_DECODE_BIN(coeff_abs_level_greater1_flag, ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("coeff_abs_level_greater1_flag", coeff_abs_level_greater1_flag, ps_cabac->u4_range);
+
+ u4_coeff_abs_level_greater1_map |= coeff_abs_level_greater1_flag << n;
+ num_greater1_flag++;
+
+ /* first_greater1_scan_pos is obtained using CLZ on u4_coeff_abs_level_greater1_map*/
+ /* outside the loop instead of the following check inside the loop */
+ /* if( coeff_abs_level_greater1_flag && first_greater1_scan_pos == -1) */
+ /* first_greater1_scan_pos = n; */
+
+ if(coeff_abs_level_greater1_flag)
+ {
+ gt1_ctxt = 0;
+ }
+ else if(gt1_ctxt && (gt1_ctxt < 3))
+ {
+ gt1_ctxt++;
+ }
+
+ }
+ else
+ break;
+
+                    /* Instead of computing the last and first significant scan positions using the checks below, */
+                    /* they are computed outside the loop using CLZ and CTZ on sig_coeff_map */
+ /* if(last_sig_scan_pos == -1) */
+ /* last_sig_scan_pos = n; */
+ /* first_sig_scan_pos = n; */
+ }
+ u4_sig_coeff_map_shift <<= 1;
+ n--;
+ /* If there are zero coeffs, then shift by as many zero coeffs and decrement n */
+ clz = CLZ(u4_sig_coeff_map_shift);
+ u4_sig_coeff_map_shift <<= clz;
+ n -= clz;
+ }while(u4_sig_coeff_map_shift);
+ }
+        /* At this point u4_sig_coeff_map is non-zero, i.e. it has at least one non-zero coeff */
+ last_sig_scan_pos = (31 - CLZ(u4_sig_coeff_map));
+ first_sig_scan_pos = CTZ(u4_sig_coeff_map);
+ sign_hidden = (((last_sig_scan_pos - first_sig_scan_pos) > 3) && !ps_codec->s_parse.s_cu.i4_cu_transquant_bypass);
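+        /* Sign data hiding: when the first and last significant coeffs of */
+        /* the subblock are more than 3 scan positions apart and transquant */
+        /* bypass is off, the sign of the coeff at first_sig_scan_pos is not */
+        /* coded; it is recovered later from the parity of the sum of levels */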
+
+ u4_coeff_abs_level_greater2_map = 0;
+
+ if(u4_coeff_abs_level_greater1_map)
+ {
+ /* Check if the first level > 1 is greater than 2 */
+ WORD32 ctxt_idx;
+ first_greater1_scan_pos = (31 - CLZ(u4_coeff_abs_level_greater1_map));
+
+
+ ctxt_idx = IHEVC_CAB_COEFABS_GRTR2_FLAG;
+
+ ctxt_idx += (!c_idx) ? ctxt_set : (ctxt_set + 4);
+ TRACE_CABAC_CTXT("coeff_abs_level_greater2_flag", ps_cabac->u4_range, ctxt_idx);
+ IHEVCD_CABAC_DECODE_BIN(coeff_abs_level_greater2_flag, ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("coeff_abs_level_greater2_flag", coeff_abs_level_greater2_flag, ps_cabac->u4_range);
+ u4_coeff_abs_level_greater2_map = coeff_abs_level_greater2_flag << first_greater1_scan_pos;
+ }
+
+
+ u4_coeff_sign_map = 0;
+
+ /* Parse sign flags */
+ if(!sign_data_hiding_flag || !sign_hidden)
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BINS(value, ps_cabac, ps_bitstrm, num_coeff);
+ AEV_TRACE("sign_flags", value, ps_cabac->u4_range);
+ u4_coeff_sign_map = value << (32 - num_coeff);
+ }
+ else
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BINS(value, ps_cabac, ps_bitstrm, (num_coeff - 1));
+ AEV_TRACE("sign_flags", value, ps_cabac->u4_range);
+ u4_coeff_sign_map = value << (32 - (num_coeff - 1));
+ }
+
+ num_sig_coeff = 0;
+ sum_abs_level = 0;
+ rice_param = 0;
+ {
+ UWORD32 clz;
+ UWORD32 u4_sig_coeff_map_shift;
+ clz = CLZ(u4_sig_coeff_map);
+ n = 31 - clz;
+ u4_sig_coeff_map_shift = u4_sig_coeff_map << clz;
+        /* The for loop over n is changed to a do-while to break early when sig_coeff_map_shift becomes zero */
+ do
+ {
+
+ if((u4_sig_coeff_map_shift >> 31) & 1)
+ {
+ WORD32 base_lvl;
+ WORD32 coeff_abs_level_remaining;
+ WORD32 level;
+ base_lvl = 1;
+
+ /* Update base_lvl if it is greater than 1 */
+ if((u4_coeff_abs_level_greater1_map >> n) & 1)
+ base_lvl++;
+
+ /* Update base_lvl if it is greater than 2 */
+ if((u4_coeff_abs_level_greater2_map >> n) & 1)
+ base_lvl++;
+
+                    /* If the level is greater than 3/2/1 (based on the greater1 and greater2 maps),
+                     * the remaining level (level - base_lvl) is signalled as bypass bins
+                     */
+ coeff_abs_level_remaining = 0;
+ if(base_lvl == ((num_sig_coeff < 8) ? ((n == first_greater1_scan_pos) ? 3 : 2) : 1))
+ {
+ UWORD32 u4_prefix;
+ WORD32 bin;
+
+ u4_prefix = 0;
+
+ do
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BIN(bin, ps_cabac, ps_bitstrm);
+ u4_prefix++;
+
+ if((WORD32)u4_prefix == 19 - rice_param)
+ {
+ bin = 1;
+ break;
+ }
+
+ }while(bin);
+
+ u4_prefix = u4_prefix - 1;
+ if(u4_prefix < 3)
+ {
+ UWORD32 u4_suffix;
+
+ coeff_abs_level_remaining = (u4_prefix << rice_param);
+ if(rice_param)
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, rice_param);
+
+ coeff_abs_level_remaining |= u4_suffix;
+ }
+ }
+ else
+ {
+ UWORD32 u4_suffix;
+ UWORD32 u4_numbins;
+
+ //u4_prefix = CLIP3(u4_prefix, 0, 19 - rice_param);
+
+ u4_numbins = (u4_prefix - 3 + rice_param);
+ coeff_abs_level_remaining = (((1 << (u4_prefix - 3)) + 3 - 1) << rice_param);
+ if(u4_numbins)
+ {
+ IHEVCD_CABAC_DECODE_BYPASS_BINS(u4_suffix, ps_cabac, ps_bitstrm, u4_numbins);
+ coeff_abs_level_remaining += u4_suffix;
+ }
+ }
+
+
+ AEV_TRACE("coeff_abs_level_remaining", coeff_abs_level_remaining, ps_cabac->u4_range);
+ base_lvl += coeff_abs_level_remaining;
+
+ }
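+                    /* Worked example with rice_param = 1: prefix 2 plus a */
+                    /* one-bit suffix of 1 decodes to (2 << 1) | 1 = 5; the */
+                    /* escape prefix 4 decodes to ((1 << 1) + 2) << 1 = 8 */
+                    /* plus a 2-bin suffix, covering the range 8..11 */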
+
+ /* update the rice param based on coeff level */
+ if((base_lvl > (3 << rice_param)) && (rice_param < 4))
+ {
+ rice_param++;
+ }
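+                    /* e.g. with rice_param 0, the first level above 3 bumps */
+                    /* the param to 1 so that subsequent large levels use */
+                    /* shorter prefixes; the param saturates at 4 */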
+
+ /* Compute absolute level */
+ level = base_lvl;
+
+ /* Update level with the sign */
+ if((u4_coeff_sign_map >> 31) & 1)
+ level = -level;
+
+ u4_coeff_sign_map <<= 1;
+ /* Update sign in case sign is hidden */
+ if(sign_data_hiding_flag && sign_hidden)
+ {
+ sum_abs_level += base_lvl;
+
+ if(n == first_sig_scan_pos && ((sum_abs_level % 2) == 1))
+ level = -level;
+ }
+
+ /* Store the resulting level in non-zero level array */
+ ps_tu_sblk_coeff_data->ai2_level[num_sig_coeff++] = level;
+ //AEV_TRACE("level", level, 0);
+ }
+ u4_sig_coeff_map_shift <<= 1;
+ n--;
+ /* If there are zero coeffs, then shift by as many zero coeffs and decrement n */
+ clz = CLZ(u4_sig_coeff_map_shift);
+ u4_sig_coeff_map_shift <<= clz;
+ n -= clz;
+
+
+ }while(u4_sig_coeff_map_shift);
+ }
+
+        /* Advance pv_tu_coeff_data past the subblock header and the num_coeff coded levels */
+ {
+ UWORD8 *pu1_buf = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data;
+ pu1_buf += sizeof(tu_sblk_coeff_data_t) - SUBBLK_COEFF_CNT * sizeof(WORD16);
+ pu1_buf += num_coeff * sizeof(WORD16);
+ ps_codec->s_parse.pv_tu_coeff_data = pu1_buf;
+
+ }
+
+ }
+ /* Set number of coded sub blocks in the current TU */
+ *pi1_num_coded_subblks = num_subblks;
+
+ return ret;
+}
diff --git a/decoder/ihevcd_parse_residual.h b/decoder/ihevcd_parse_residual.h
new file mode 100644
index 0000000..792a162
--- /dev/null
+++ b/decoder/ihevcd_parse_residual.h
@@ -0,0 +1,45 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_residual.h
+*
+* @brief
+* Parsing of residual data
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_RESIDUAL_H_
+#define _IHEVCD_PARSE_RESIDUAL_H_
+WORD32 ihevcd_parse_residual_coding(codec_t *ps_codec,
+ WORD32 x0, WORD32 y0,
+ WORD32 log2_trafo_size,
+ WORD32 c_idx,
+ WORD32 intra_pred_mode);
+
+#endif /* _IHEVCD_PARSE_RESIDUAL_H_ */
diff --git a/decoder/ihevcd_parse_slice.c b/decoder/ihevcd_parse_slice.c
new file mode 100644
index 0000000..8f81e64
--- /dev/null
+++ b/decoder/ihevcd_parse_slice.c
@@ -0,0 +1,3525 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_parse_slice.c
+ *
+ * @brief
+ * Contains functions for parsing slice data
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_mem_fns.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_error.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_parse_slice.h"
+#include "ihevcd_parse_residual.h"
+#include "ihevcd_cabac.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_intra_pred_mode_prediction.h"
+#include "ihevcd_common_tables.h"
+#include "ihevcd_process_slice.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#include "ihevcd_debug.h"
+#include "ihevcd_get_mv.h"
+#include "ihevcd_boundary_strength.h"
+#include "ihevcd_ilf_padding.h"
+#include "ihevcd_statistics.h"
+/* Bit stream offset threshold */
+#define BITSTRM_OFF_THRS 8
+
+/**
+ * Table used to decode part_mode if AMP is enabled and current CU is not min CU
+ */
+const UWORD8 gau1_part_mode_amp[] = { PART_nLx2N, PART_nRx2N, PART_Nx2N, 0xFF, PART_2NxnU, PART_2NxnD, PART_2NxN, 0xFF };
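+/* The 3-bit index into the above table is (bin2 << 2) | (bin3 << 1) | bypass,
+   where the bypass bin is decoded only when bin3 is 0; the 0xFF entries are
+   therefore never reached */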
+
+const UWORD32 gau4_ct_depth_mask[] = { 0x0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses Transform tree syntax
+ *
+ * @par Description:
+ * Parses Transform tree syntax as per Section:7.3.9.8
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Status
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+WORD32 ihevcd_parse_transform_tree(codec_t *ps_codec,
+ WORD32 x0, WORD32 y0,
+ WORD32 cu_x_base, WORD32 cu_y_base,
+ WORD32 log2_trafo_size,
+ WORD32 trafo_depth,
+ WORD32 blk_idx,
+ WORD32 intra_pred_mode)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ WORD32 value;
+ WORD32 x1, y1;
+ WORD32 max_trafo_depth;
+
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 intra_split_flag;
+ WORD32 split_transform_flag;
+ WORD32 ctxt_idx;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+ max_trafo_depth = ps_codec->s_parse.s_cu.i4_max_trafo_depth;
+ ps_sps = ps_codec->s_parse.ps_sps;
+ ps_pps = ps_codec->s_parse.ps_pps;
+ intra_split_flag = ps_codec->s_parse.s_cu.i4_intra_split_flag;
+
+ {
+ split_transform_flag = 0;
+ if((log2_trafo_size <= ps_sps->i1_log2_max_transform_block_size) &&
+ (log2_trafo_size > ps_sps->i1_log2_min_transform_block_size) &&
+ (trafo_depth < max_trafo_depth) &&
+ !(intra_split_flag && (trafo_depth == 0)))
+ {
+            /* decode the split transform flag; context derived as per Table 9-37 */
+ ctxt_idx = IHEVC_CAB_SPLIT_TFM + (5 - log2_trafo_size);
+
+ TRACE_CABAC_CTXT("split_transform_flag", ps_cabac->u4_range, ctxt_idx);
+ split_transform_flag = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("split_transform_flag", split_transform_flag,
+ ps_cabac->u4_range);
+
+ }
+ else
+ {
+ WORD32 inter_split_flag = 0;
+
+ if((0 == ps_sps->i1_max_transform_hierarchy_depth_inter) &&
+ (PRED_MODE_INTER == ps_codec->s_parse.s_cu.i4_pred_mode) &&
+ (PART_2Nx2N != ps_codec->s_parse.s_cu.i4_part_mode) &&
+ (0 == trafo_depth))
+ {
+ inter_split_flag = 1;
+ }
+
+ if((log2_trafo_size > ps_sps->i1_log2_max_transform_block_size) ||
+ ((1 == intra_split_flag) && (0 == trafo_depth)) ||
+ (1 == inter_split_flag))
+ {
+ split_transform_flag = 1;
+ }
+ }
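+        /* Thus, when no explicit flag is coded the split is implicit: a */
+        /* transform larger than the max transform block size must split, */
+        /* as must depth 0 of an NxN intra CU, or of a non-2Nx2N inter CU */
+        /* when max_transform_hierarchy_depth_inter is 0 */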
+
+ if(0 == trafo_depth)
+ {
+ ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth] = 0;
+ ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] = 0;
+ }
+ else
+ {
+ ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] = ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth - 1];
+ ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth] = ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth - 1];
+ }
+ if(trafo_depth == 0 || log2_trafo_size > 2)
+ {
+ ctxt_idx = IHEVC_CAB_CBCR_IDX + trafo_depth;
+ /* CBF for Cb/Cr is sent only if the parent CBF for Cb/Cr is non-zero */
+ if((trafo_depth == 0) || ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth - 1])
+ {
+ TRACE_CABAC_CTXT("cbf_cb", ps_cabac->u4_range, ctxt_idx);
+ value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("cbf_cb", value, ps_cabac->u4_range);
+ ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] = value;
+ }
+
+ if((trafo_depth == 0) || ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth - 1])
+ {
+ TRACE_CABAC_CTXT("cbf_cr", ps_cabac->u4_range, ctxt_idx);
+ value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("cbf_cr", value, ps_cabac->u4_range);
+ ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth] = value;
+ }
+ }
+ if(split_transform_flag)
+ {
+ WORD32 intra_pred_mode_tmp;
+ x1 = x0 + ((1 << log2_trafo_size) >> 1);
+ y1 = y0 + ((1 << log2_trafo_size) >> 1);
+
+            /* For transform depth of zero, the intra pred mode decoded at CU */
+            /* level is sent to the transform tree nodes */
+            /* When depth is non-zero, the intra pred mode of the parent node is sent */
+            /* This takes care of passing the correct mode to all the child nodes */
+ intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0];
+ ihevcd_parse_transform_tree(ps_codec, x0, y0, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 0, intra_pred_mode_tmp);
+
+ intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[1];
+ ihevcd_parse_transform_tree(ps_codec, x1, y0, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 1, intra_pred_mode_tmp);
+
+ intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[2];
+ ihevcd_parse_transform_tree(ps_codec, x0, y1, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 2, intra_pred_mode_tmp);
+
+ intra_pred_mode_tmp = trafo_depth ? intra_pred_mode : ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[3];
+ ihevcd_parse_transform_tree(ps_codec, x1, y1, x0, y0, log2_trafo_size - 1, trafo_depth + 1, 3, intra_pred_mode_tmp);
+
+ }
+ else
+ {
+ WORD32 ctb_x_base;
+ WORD32 ctb_y_base;
+ WORD32 cu_qp_delta_abs;
+
+
+
+ tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+ cu_qp_delta_abs = 0;
+ ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+ ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+ if((ps_codec->s_parse.s_cu.i4_pred_mode == PRED_MODE_INTRA) ||
+ (trafo_depth != 0) ||
+ (ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth]) ||
+ (ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth]))
+ {
+ ctxt_idx = IHEVC_CAB_CBF_LUMA_IDX;
+ ctxt_idx += (trafo_depth == 0) ? 1 : 0;
+
+ TRACE_CABAC_CTXT("cbf_luma", ps_cabac->u4_range, ctxt_idx);
+ value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("cbf_luma", value, ps_cabac->u4_range);
+
+ ps_codec->s_parse.s_cu.i1_cbf_luma = value;
+ }
+ else
+ {
+ ps_codec->s_parse.s_cu.i1_cbf_luma = 1;
+ }
+
+ /* Initialize ps_tu to default values */
+ /* If required change this to WORD32 packed write */
+ ps_tu->b1_cb_cbf = 0;
+ ps_tu->b1_cr_cbf = 0;
+ ps_tu->b1_y_cbf = 0;
+ ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+ ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+ ps_tu->b1_transquant_bypass = ps_codec->s_parse.s_cu.i4_cu_transquant_bypass;
+ ps_tu->b3_size = (log2_trafo_size - 2);
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+
+ ps_tu->b6_luma_intra_mode = intra_pred_mode;
+ ps_tu->b3_chroma_intra_mode_idx = ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx;
+
+ /* Section:7.3.12 Transform unit syntax inlined here */
+ if(ps_codec->s_parse.s_cu.i1_cbf_luma ||
+ ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth] ||
+ ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth])
+ {
+ WORD32 intra_pred_mode_chroma;
+ if(ps_pps->i1_cu_qp_delta_enabled_flag && !ps_codec->s_parse.i4_is_cu_qp_delta_coded)
+ {
+
+
+ WORD32 c_max = TU_MAX_QP_DELTA_ABS;
+ WORD32 ctxt_inc = IHEVC_CAB_QP_DELTA_ABS;
+ WORD32 ctxt_inc_max = CTXT_MAX_QP_DELTA_ABS;
+
+ TRACE_CABAC_CTXT("cu_qp_delta_abs", ps_cabac->u4_range, ctxt_inc);
+                    /* cu_qp_delta_abs is coded as a combination of truncated unary and EG0 codes */
+ /* See Table 9-32 and Table 9-37 for details on cu_qp_delta_abs */
+ cu_qp_delta_abs = ihevcd_cabac_decode_bins_tunary(ps_cabac,
+ ps_bitstrm,
+ c_max,
+ ctxt_inc,
+ 0,
+ ctxt_inc_max);
+ if(cu_qp_delta_abs >= c_max)
+ {
+ value = ihevcd_cabac_decode_bypass_bins_egk(ps_cabac, ps_bitstrm, 0);
+ cu_qp_delta_abs += value;
+ }
+ AEV_TRACE("cu_qp_delta_abs", cu_qp_delta_abs, ps_cabac->u4_range);
+
+
+ ps_codec->s_parse.i4_is_cu_qp_delta_coded = 1;
+
+
+ if(cu_qp_delta_abs)
+ {
+ value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ AEV_TRACE("cu_qp_delta_sign", value, ps_cabac->u4_range);
+
+ if(value)
+ cu_qp_delta_abs = -cu_qp_delta_abs;
+
+ }
+ ps_codec->s_parse.s_cu.i4_cu_qp_delta = cu_qp_delta_abs;
+
+ }
+
+ if(ps_codec->s_parse.s_cu.i1_cbf_luma)
+ {
+ ps_tu->b1_y_cbf = 1;
+ ihevcd_parse_residual_coding(ps_codec, x0, y0, log2_trafo_size, 0, intra_pred_mode);
+ }
+
+ if(4 == ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx)
+ intra_pred_mode_chroma = ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0];
+ else
+ {
+ intra_pred_mode_chroma = gau1_intra_pred_chroma_modes[ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx];
+
+ if(intra_pred_mode_chroma ==
+ ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0])
+ {
+ intra_pred_mode_chroma = INTRA_ANGULAR(34);
+ }
+
+ }
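+                /* The remap above keeps the four fixed chroma candidates */
+                /* distinct from the derived-from-luma mode: a fixed */
+                /* candidate equal to the luma mode is replaced by angular 34 */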
+ if(log2_trafo_size > 2)
+ {
+ if(ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth])
+ {
+ ps_tu->b1_cb_cbf = 1;
+ ihevcd_parse_residual_coding(ps_codec, x0, y0, log2_trafo_size - 1, 1, intra_pred_mode_chroma);
+ }
+
+ if(ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth])
+ {
+ ps_tu->b1_cr_cbf = 1;
+ ihevcd_parse_residual_coding(ps_codec, x0, y0, log2_trafo_size - 1, 2, intra_pred_mode_chroma);
+ }
+ }
+ else if(blk_idx == 3)
+ {
+ if(ps_codec->s_parse.s_cu.ai1_cbf_cb[trafo_depth])
+ {
+ ps_tu->b1_cb_cbf = 1;
+ ihevcd_parse_residual_coding(ps_codec, cu_x_base, cu_y_base, log2_trafo_size, 1, intra_pred_mode_chroma);
+ }
+
+ if(ps_codec->s_parse.s_cu.ai1_cbf_cr[trafo_depth])
+ {
+ ps_tu->b1_cr_cbf = 1;
+ ihevcd_parse_residual_coding(ps_codec, cu_x_base, cu_y_base, log2_trafo_size, 2, intra_pred_mode_chroma);
+ }
+ }
+ else
+ {
+ //ps_tu->b1_chroma_present = 0;
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ }
+ }
+ else
+ {
+ if((3 != blk_idx) && (2 == log2_trafo_size))
+ {
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ }
+ }
+
+ /* Set the first TU in CU flag */
+ {
+ if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+ (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+ {
+ ps_tu->b1_first_tu_in_cu = 1;
+ }
+ else
+ {
+ ps_tu->b1_first_tu_in_cu = 0;
+ }
+ }
+ ps_codec->s_parse.ps_tu++;
+ ps_codec->s_parse.s_cu.i4_tu_cnt++;
+ ps_codec->s_parse.i4_pic_tu_idx++;
+ }
+ }
+ return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses Motion vector difference
+ *
+ * @par Description:
+ * Parses Motion vector difference as per Section:7.3.9.9
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_mvd(codec_t *ps_codec, mv_t *ps_mv)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ WORD32 abs_mvd;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 abs_mvd_greater0_flag[2];
+ WORD32 abs_mvd_greater1_flag[2];
+ WORD32 ctxt_idx;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+
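+    /* Each MVD component is coded as abs_mvd_greater0_flag and */
+    /* abs_mvd_greater1_flag (context coded), then abs_mvd_minus2 as an EG1 */
+    /* bypass code and a bypass sign bit; e.g. a component of -7 decodes as */
+    /* greater0 = 1, greater1 = 1, abs_mvd_minus2 = 5, sign = 1 */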
+ ctxt_idx = IHEVC_CAB_MVD_GRT0;
+    /* decode absmvd_x > 0 */
+ TRACE_CABAC_CTXT("abs_mvd_greater0_flag[0]", ps_cabac->u4_range, ctxt_idx);
+ abs_mvd_greater0_flag[0] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("abs_mvd_greater0_flag[0]", abs_mvd_greater0_flag[0], ps_cabac->u4_range);
+
+    /* decode absmvd_y > 0 */
+ TRACE_CABAC_CTXT("abs_mvd_greater0_flag[1]", ps_cabac->u4_range, ctxt_idx);
+ abs_mvd_greater0_flag[1] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("abs_mvd_greater0_flag[1]", abs_mvd_greater0_flag[1], ps_cabac->u4_range);
+
+ ctxt_idx = IHEVC_CAB_MVD_GRT1;
+ abs_mvd_greater1_flag[0] = 0;
+ abs_mvd_greater1_flag[1] = 0;
+
+ if(abs_mvd_greater0_flag[0])
+ {
+ TRACE_CABAC_CTXT("abs_mvd_greater1_flag[0]", ps_cabac->u4_range, ctxt_idx);
+ abs_mvd_greater1_flag[0] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("abs_mvd_greater1_flag[0]", abs_mvd_greater1_flag[0], ps_cabac->u4_range);
+ }
+ if(abs_mvd_greater0_flag[1])
+ {
+ TRACE_CABAC_CTXT("abs_mvd_greater1_flag[1]", ps_cabac->u4_range, ctxt_idx);
+ abs_mvd_greater1_flag[1] = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("abs_mvd_greater1_flag[1]", abs_mvd_greater1_flag[1], ps_cabac->u4_range);
+ }
+ abs_mvd = 0;
+ if(abs_mvd_greater0_flag[0])
+ {
+ abs_mvd = 1;
+ if(abs_mvd_greater1_flag[0])
+ {
+ value = ihevcd_cabac_decode_bypass_bins_egk(ps_cabac, ps_bitstrm, 1);
+ AEV_TRACE("abs_mvd_minus2[0]", value, ps_cabac->u4_range);
+ abs_mvd = value + 2;
+ }
+ value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ AEV_TRACE("mvd_sign_flag[0]", value, ps_cabac->u4_range);
+ if(value)
+ {
+ abs_mvd = -abs_mvd;
+ }
+
+ }
+ ps_mv->i2_mvx = abs_mvd;
+ abs_mvd = 0;
+ if(abs_mvd_greater0_flag[1])
+ {
+ abs_mvd = 1;
+ if(abs_mvd_greater1_flag[1])
+ {
+ value = ihevcd_cabac_decode_bypass_bins_egk(ps_cabac, ps_bitstrm, 1);
+ AEV_TRACE("abs_mvd_minus2[1]", value, ps_cabac->u4_range);
+ abs_mvd = value + 2;
+
+ }
+ value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ AEV_TRACE("mvd_sign_flag[1]", value, ps_cabac->u4_range);
+
+ if(value)
+ {
+ abs_mvd = -abs_mvd;
+ }
+ }
+ ps_mv->i2_mvy = abs_mvd;
+
+ return ret;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses PCM sample
+ *
+ *
+ * @par Description:
+ * Parses PCM sample as per Section:7.3.9.7 Pcm sample syntax
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T ihevcd_parse_pcm_sample(codec_t *ps_codec,
+ WORD32 x0,
+ WORD32 y0,
+ WORD32 log2_cb_size)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+ sps_t *ps_sps;
+
+ WORD32 value;
+ WORD32 i;
+
+ WORD32 num_bits;
+ UWORD32 u4_sig_coeff_map;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+ tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
+ UWORD8 *pu1_coeff_data;
+ ps_sps = ps_codec->s_parse.ps_sps;
+
+ UNUSED(value);
+ UNUSED(ps_tu);
+ UNUSED(ps_cabac);
+ UNUSED(x0);
+ UNUSED(y0);
+
+ {
+ WORD8 *pi1_scan_idx;
+ WORD8 *pi1_buf = (WORD8 *)ps_codec->s_parse.pv_tu_coeff_data;
+ WORD8 *pi1_num_coded_subblks;
+
+ /* First WORD8 gives number of coded subblocks */
+ pi1_num_coded_subblks = pi1_buf++;
+
+        /* Set number of coded subblocks in the current TU to one */
+        /* For PCM there is only one subblock, which is the same size as the CU */
+ *pi1_num_coded_subblks = 1;
+
+ /* Second WORD8 gives (scan idx << 1) | trans_skip */
+ pi1_scan_idx = pi1_buf++;
+ *pi1_scan_idx = (0 << 1) | 1;
+
+ /* Store the incremented pointer in pv_tu_coeff_data */
+ ps_codec->s_parse.pv_tu_coeff_data = pi1_buf;
+
+ }
+
+ u4_sig_coeff_map = 0xFFFFFFFF;
+ ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)ps_codec->s_parse.pv_tu_coeff_data;
+ ps_tu_sblk_coeff_data->u2_sig_coeff_map = u4_sig_coeff_map;
+ ps_tu_sblk_coeff_data->u2_subblk_pos = 0;
+
+ pu1_coeff_data = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
+
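+    /* PCM samples are read as fixed-length codes and left-shifted up to the */
+    /* internal bit depth; with 8-bit PCM samples and an 8-bit internal */
+    /* depth the shift is zero and samples are stored as-is */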
+ num_bits = ps_sps->i1_pcm_sample_bit_depth_luma;
+
+ for(i = 0; i < 1 << (log2_cb_size << 1); i++)
+ {
+ TRACE_CABAC_CTXT("pcm_sample_luma", ps_cabac->u4_range, 0);
+ BITS_PARSE("pcm_sample_luma", value, ps_bitstrm, num_bits);
+
+ //ps_pcmsample_t->i1_pcm_sample_luma[i] = value;
+ *pu1_coeff_data++ = value << (BIT_DEPTH_LUMA - num_bits);
+ }
+
+ num_bits = ps_sps->i1_pcm_sample_bit_depth_chroma;
+
+ for(i = 0; i < (1 << (log2_cb_size << 1)) >> 1; i++)
+ {
+ TRACE_CABAC_CTXT("pcm_sample_chroma", ps_cabac->u4_range, 0);
+ BITS_PARSE("pcm_sample_chroma", value, ps_bitstrm, num_bits);
+
+ // ps_pcmsample_t->i1_pcm_sample_chroma[i] = value;
+ *pu1_coeff_data++ = value << (BIT_DEPTH_CHROMA - num_bits);
+ }
+
+ ps_codec->s_parse.pv_tu_coeff_data = pu1_coeff_data;
+
+ return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses Prediction unit
+ *
+ * @par Description:
+ * Parses Prediction unit as per Section:7.3.9.6
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T ihevcd_parse_pu_mvp(codec_t *ps_codec, pu_t *ps_pu)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ slice_header_t *ps_slice_hdr;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+ WORD32 inter_pred_idc;
+
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
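+    /* inter_pred_idc binarization: for 8x4/4x8 PUs (w + h == 12) */
+    /* bi-prediction is not allowed, so a single bin selects L0/L1; */
+    /* otherwise a first bin (context indexed by coding tree depth) */
+    /* selects bi vs uni and a second bin picks list 0 or list 1 */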
+ if(ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+ WORD32 pu_w_plus_pu_h;
+ WORD32 ctxt_idx;
+        /* needed to check for the w + h == 12 case (8x4 / 4x8 PUs) */
+ pu_w_plus_pu_h = ((ps_pu->b4_wd + 1) << 2) + ((ps_pu->b4_ht + 1) << 2);
+ if(12 == pu_w_plus_pu_h)
+ {
+ ctxt_idx = IHEVC_CAB_INTER_PRED_IDC + 4;
+ TRACE_CABAC_CTXT("inter_pred_idc", ps_cabac->u4_range, ctxt_idx);
+ inter_pred_idc = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+ ctxt_idx);
+ }
+ else
+ {
+ /* larger PUs can be encoded as bi_pred/l0/l1 inter_pred_idc */
+ WORD32 is_bipred;
+
+ ctxt_idx = IHEVC_CAB_INTER_PRED_IDC + ps_codec->s_parse.i4_ct_depth;
+ TRACE_CABAC_CTXT("inter_pred_idc", ps_cabac->u4_range, ctxt_idx);
+ is_bipred = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ inter_pred_idc = PRED_BI;
+ if(!is_bipred)
+ {
+ ctxt_idx = IHEVC_CAB_INTER_PRED_IDC + 4;
+ inter_pred_idc = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+ ctxt_idx);
+ }
+ }
+
+ AEV_TRACE("inter_pred_idc", inter_pred_idc, ps_cabac->u4_range);
+ }
+ else
+ inter_pred_idc = PRED_L0;
+ ps_pu->mv.i1_l0_ref_idx = 0;
+ ps_pu->mv.i1_l1_ref_idx = 0;
+ /* Decode MVD for L0 for PRED_L0 or PRED_BI */
+ if(inter_pred_idc != PRED_L1)
+ {
+ WORD32 active_refs = ps_slice_hdr->i1_num_ref_idx_l0_active;
+ WORD32 ref_idx = 0;
+ WORD32 ctxt_idx;
+
+ if(active_refs > 1)
+ {
+ ctxt_idx = IHEVC_CAB_INTER_REF_IDX;
+            /* decode the context modelled first bin */
+ TRACE_CABAC_CTXT("ref_idx", ps_cabac->u4_range, ctxt_idx);
+ ref_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+
+ if((active_refs > 2) && ref_idx)
+ {
+ WORD32 value;
+                /* decode the context modelled second bin */
+ ctxt_idx++;
+ value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ ref_idx += value;
+ if((active_refs > 3) && value)
+ {
+                    /* decode remaining bypass bins */
+ ref_idx = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac,
+ ps_bitstrm,
+ (active_refs - 3)
+ );
+ ref_idx += 2;
+ }
+ }
+ AEV_TRACE("ref_idx", ref_idx, ps_cabac->u4_range);
+ }
+
+ ref_idx = CLIP3(ref_idx, 0, MAX_DPB_SIZE - 1);
+ ps_pu->mv.i1_l0_ref_idx = ref_idx;
+
+ ihevcd_parse_mvd(ps_codec, &ps_pu->mv.s_l0_mv);
+
+ ctxt_idx = IHEVC_CAB_MVP_L0L1;
+ value = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ ctxt_idx);
+
+ AEV_TRACE("mvp_l0/l1_flag", value, ps_cabac->u4_range);
+
+ ps_pu->b1_l0_mvp_idx = value;
+
+ }
+ /* Decode MVD for L1 for PRED_L1 or PRED_BI */
+ if(inter_pred_idc != PRED_L0)
+ {
+ WORD32 active_refs = ps_slice_hdr->i1_num_ref_idx_l1_active;
+ WORD32 ref_idx = 0;
+ WORD32 ctxt_idx;
+
+ if(active_refs > 1)
+ {
+
+ ctxt_idx = IHEVC_CAB_INTER_REF_IDX;
+ TRACE_CABAC_CTXT("ref_idx", ps_cabac->u4_range, ctxt_idx);
+            /* decode the context modelled first bin */
+ ref_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+
+ if((active_refs > 2) && ref_idx)
+ {
+ WORD32 value;
+                /* decode the context modelled second bin */
+ ctxt_idx++;
+ value = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ ref_idx += value;
+ if((active_refs > 3) && value)
+ {
+                    /* decode remaining bypass bins */
+ ref_idx = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac,
+ ps_bitstrm,
+ (active_refs - 3)
+ );
+ ref_idx += 2;
+ }
+ }
+
+ AEV_TRACE("ref_idx", ref_idx, ps_cabac->u4_range);
+ }
+
+ ref_idx = CLIP3(ref_idx, 0, MAX_DPB_SIZE - 1);
+ ps_pu->mv.i1_l1_ref_idx = ref_idx;
+
+ if(ps_slice_hdr->i1_mvd_l1_zero_flag && inter_pred_idc == PRED_BI)
+ {
+ ps_pu->mv.s_l1_mv.i2_mvx = 0;
+ ps_pu->mv.s_l1_mv.i2_mvy = 0;
+ }
+ else
+ {
+ ihevcd_parse_mvd(ps_codec, &ps_pu->mv.s_l1_mv);
+ }
+
+ ctxt_idx = IHEVC_CAB_MVP_L0L1;
+ value = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ ctxt_idx);
+
+ AEV_TRACE("mvp_l0/l1_flag", value, ps_cabac->u4_range);
+ ps_pu->b1_l1_mvp_idx = value;
+
+ }
+
+ ps_pu->b2_pred_mode = inter_pred_idc;
+ return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses Prediction unit
+ *
+ * @par Description:
+ * Parses Prediction unit as per Section:7.3.9.6
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T ihevcd_parse_prediction_unit(codec_t *ps_codec,
+ WORD32 x0,
+ WORD32 y0,
+ WORD32 wd,
+ WORD32 ht)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ slice_header_t *ps_slice_hdr;
+ sps_t *ps_sps;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 ctb_x_base;
+ WORD32 ctb_y_base;
+
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+ /* Set PU structure to default values */
+ memset(ps_pu, 0, sizeof(pu_t));
+
+ ps_sps = ps_codec->s_parse.ps_sps;
+ ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+ ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+ ps_pu->b4_pos_x = (x0 - ctb_x_base) >> 2;
+ ps_pu->b4_pos_y = (y0 - ctb_y_base) >> 2;
+ ps_pu->b4_wd = (wd >> 2) - 1;
+ ps_pu->b4_ht = (ht >> 2) - 1;
+
+ ps_pu->b1_intra_flag = 0;
+ ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
+
+ if(PRED_MODE_SKIP == ps_codec->s_parse.s_cu.i4_pred_mode)
+ {
+ WORD32 merge_idx = 0;
+ if(ps_slice_hdr->i1_max_num_merge_cand > 1)
+ {
+ WORD32 ctxt_idx = IHEVC_CAB_MERGE_IDX_EXT;
+ WORD32 bin;
+
+ TRACE_CABAC_CTXT("merge_idx", ps_cabac->u4_range, ctxt_idx);
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ if(bin)
+ {
+ if(ps_slice_hdr->i1_max_num_merge_cand > 2)
+ {
+ merge_idx = ihevcd_cabac_decode_bypass_bins_tunary(
+ ps_cabac, ps_bitstrm,
+ (ps_slice_hdr->i1_max_num_merge_cand - 2));
+ }
+ merge_idx++;
+ }
+ AEV_TRACE("merge_idx", merge_idx, ps_cabac->u4_range);
+ }
+ ps_pu->b1_merge_flag = 1;
+ ps_pu->b3_merge_idx = merge_idx;
+
+ }
+ else
+ {
+ /* MODE_INTER */
+ WORD32 merge_flag;
+ WORD32 ctxt_idx = IHEVC_CAB_MERGE_FLAG_EXT;
+ TRACE_CABAC_CTXT("merge_flag", ps_cabac->u4_range, ctxt_idx);
+ merge_flag = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("merge_flag", merge_flag, ps_cabac->u4_range);
+
+ ps_pu->b1_merge_flag = merge_flag;
+
+ if(merge_flag)
+ {
+ WORD32 merge_idx = 0;
+ if(ps_slice_hdr->i1_max_num_merge_cand > 1)
+ {
+ WORD32 ctxt_idx = IHEVC_CAB_MERGE_IDX_EXT;
+ WORD32 bin;
+ TRACE_CABAC_CTXT("merge_idx", ps_cabac->u4_range, ctxt_idx);
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ if(bin)
+ {
+ if(ps_slice_hdr->i1_max_num_merge_cand > 2)
+ {
+ merge_idx = ihevcd_cabac_decode_bypass_bins_tunary(
+ ps_cabac, ps_bitstrm,
+ (ps_slice_hdr->i1_max_num_merge_cand - 2));
+ }
+ merge_idx++;
+ }
+ AEV_TRACE("merge_idx", merge_idx, ps_cabac->u4_range);
+ }
+
+ ps_pu->b3_merge_idx = merge_idx;
+ }
+ else
+ {
+ ihevcd_parse_pu_mvp(ps_codec, ps_pu);
+ }
+
+ }
+ STATS_UPDATE_PU_SIZE(ps_pu);
+ /* Increment PU pointer */
+ ps_codec->s_parse.ps_pu++;
+ ps_codec->s_parse.i4_pic_pu_idx++;
+ return ret;
+}
+
+
+WORD32 ihevcd_parse_part_mode_amp(cab_ctxt_t *ps_cabac, bitstrm_t *ps_bitstrm)
+{
+ WORD32 ctxt_idx = IHEVC_CAB_PART_MODE;
+ WORD32 part_mode_idx;
+ WORD32 part_mode;
+ WORD32 bin;
+
+ part_mode = 0;
+ TRACE_CABAC_CTXT("part_mode", ps_cabac->u4_range, ctxt_idx);
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx++);
+
+ if(!bin)
+ {
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx++);
+ part_mode_idx = bin;
+ part_mode_idx <<= 1;
+
+        /* The following takes care of the context increment for the 3rd bin in part_mode */
+        /* When AMP is enabled and the current CB is not the min CB, */
+        /* the context for the 3rd bin is 3 and not 2 */
+ ctxt_idx += 1;
+
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ part_mode_idx |= bin;
+
+ part_mode_idx <<= 1;
+ if(!bin)
+ {
+
+ bin = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ part_mode_idx |= bin;
+ }
+ part_mode = gau1_part_mode_amp[part_mode_idx];
+ }
+ return part_mode;
+}
+IHEVCD_ERROR_T ihevcd_parse_coding_unit_intra(codec_t *ps_codec,
+ WORD32 x0,
+ WORD32 y0,
+ WORD32 log2_cb_size)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ sps_t *ps_sps;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 pcm_flag;
+ WORD32 value;
+ WORD32 cb_size = 1 << log2_cb_size;
+ WORD32 part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
+ tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ WORD32 ctb_x_base;
+ WORD32 ctb_y_base;
+ ps_sps = ps_codec->s_parse.ps_sps;
+ ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+ ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+ memset(ps_pu, 0, sizeof(pu_t));
+ ps_pu->b1_intra_flag = 1;
+ ps_pu->b4_wd = (cb_size >> 2) - 1;
+ ps_pu->b4_ht = (cb_size >> 2) - 1;
+ ps_pu->b4_pos_x = (x0 - ctb_x_base) >> 2;
+ ps_pu->b4_pos_y = (y0 - ctb_y_base) >> 2;
+
+ pcm_flag = 0;
+ if((PART_2Nx2N == part_mode) && (ps_sps->i1_pcm_enabled_flag)
+ && (log2_cb_size
+ >= ps_sps->i1_log2_min_pcm_coding_block_size)
+ && (log2_cb_size
+ <= (ps_sps->i1_log2_min_pcm_coding_block_size + ps_sps->i1_log2_diff_max_min_pcm_coding_block_size)))
+ {
+ TRACE_CABAC_CTXT("pcm_flag", ps_cabac->u4_range, 0);
+ pcm_flag = ihevcd_cabac_decode_terminate(ps_cabac, ps_bitstrm);
+ AEV_TRACE("pcm_flag", pcm_flag, ps_cabac->u4_range);
+ }
+
+ ps_codec->s_parse.i4_cu_pcm_flag = pcm_flag;
+ if(pcm_flag)
+ {
+ UWORD8 *pu1_luma_intra_pred_mode_top, *pu1_luma_intra_pred_mode_left;
+ WORD32 i, num_pred_blocks;
+
+ if(ps_codec->s_parse.s_bitstrm.u4_bit_ofst % 8)
+ {
+ TRACE_CABAC_CTXT("pcm_alignment_zero_bit", ps_cabac->u4_range, 0);
+ ihevcd_bits_flush_to_byte_boundary(&ps_codec->s_parse.s_bitstrm);
+ AEV_TRACE("pcm_alignment_zero_bit", 0, ps_cabac->u4_range);
+ }
+
+ ihevcd_parse_pcm_sample(ps_codec, x0, y0, log2_cb_size);
+
+ ihevcd_cabac_reset(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm);
+
+ ps_tu = ps_codec->s_parse.ps_tu;
+ ps_tu->b1_cb_cbf = 1;
+ ps_tu->b1_cr_cbf = 1;
+ ps_tu->b1_y_cbf = 1;
+ ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+ ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+ ps_tu->b1_transquant_bypass = 1;
+ ps_tu->b3_size = (log2_cb_size - 2);
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ ps_tu->b6_luma_intra_mode = INTRA_PRED_NONE;
+
+ /* Set the first TU in CU flag */
+ {
+ if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+ (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+ {
+ ps_tu->b1_first_tu_in_cu = 1;
+ }
+ else
+ {
+ ps_tu->b1_first_tu_in_cu = 0;
+ }
+ }
+
+        /* Update the intra pred mode for PCM to INTRA_DC (the default mode) */
+ pu1_luma_intra_pred_mode_top = ps_codec->s_parse.pu1_luma_intra_pred_mode_top
+ + (ps_codec->s_parse.s_cu.i4_pos_x * 2);
+
+ pu1_luma_intra_pred_mode_left = ps_codec->s_parse.pu1_luma_intra_pred_mode_left
+ + (ps_codec->s_parse.s_cu.i4_pos_y * 2);
+
+ num_pred_blocks = 1; /* Because PCM part mode will be 2Nx2N */
+
+ ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_left, INTRA_DC, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+ ps_codec->s_func_selector.ihevc_memset_fptr(pu1_luma_intra_pred_mode_top, INTRA_DC, (cb_size / num_pred_blocks) / MIN_PU_SIZE);
+
+
+ /* Set no_loop_filter appropriately */
+ if(1 == ps_sps->i1_pcm_loop_filter_disable_flag)
+ {
+ UWORD8 *pu1_pic_no_loop_filter_flag;
+ WORD32 numbytes_row;
+ UWORD32 u4_mask;
+
+ pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+ numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+ pu1_pic_no_loop_filter_flag += (y0 / 8) * numbytes_row;
+ pu1_pic_no_loop_filter_flag += (x0 / 64);
+            /* Generate (cb_size / 8) number of 1s, */
+            /* i.e. (1 << (log2_cb_size - 3)) number of 1s */
+ u4_mask = LSB_ONES((cb_size >> 3));
+ for(i = 0; i < (cb_size / 8); i++)
+ {
+ *pu1_pic_no_loop_filter_flag |= (u4_mask << (((x0) / 8) % 8));
+ pu1_pic_no_loop_filter_flag += numbytes_row;
+ }
+ }
+ /* Increment ps_tu and tu_idx */
+ ps_codec->s_parse.ps_tu++;
+ ps_codec->s_parse.s_cu.i4_tu_cnt++;
+ ps_codec->s_parse.i4_pic_tu_idx++;
+
+ }
+ else
+ {
+ WORD32 cnt = 0;
+ WORD32 i;
+ WORD32 part_cnt;
+
+ part_cnt = (part_mode == PART_NxN) ? 4 : 1;
+
+ for(i = 0; i < part_cnt; i++)
+ {
+ TRACE_CABAC_CTXT("prev_intra_pred_luma_flag", ps_cabac->u4_range, IHEVC_CAB_INTRA_LUMA_PRED_FLAG);
+ value = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ IHEVC_CAB_INTRA_LUMA_PRED_FLAG);
+
+ ps_codec->s_parse.s_cu.ai4_prev_intra_luma_pred_flag[i] =
+ value;
+ AEV_TRACE("prev_intra_pred_luma_flag", value, ps_cabac->u4_range);
+ }
+
+ for(i = 0; i < part_cnt; i++)
+ {
+ if(ps_codec->s_parse.s_cu.ai4_prev_intra_luma_pred_flag[cnt])
+ {
+ value = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac, ps_bitstrm, 2);
+ AEV_TRACE("mpm_idx", value, ps_cabac->u4_range);
+ ps_codec->s_parse.s_cu.ai4_mpm_idx[cnt] = value;
+ }
+ else
+ {
+ value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 5);
+ AEV_TRACE("rem_intra_luma_pred_mode", value,
+ ps_cabac->u4_range);
+ ps_codec->s_parse.s_cu.ai4_rem_intra_luma_pred_mode[cnt] =
+ value;
+ }
+ cnt++;
+ }
+ TRACE_CABAC_CTXT("intra_chroma_pred_mode", ps_cabac->u4_range, IHEVC_CAB_CHROMA_PRED_MODE);
+ value = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ IHEVC_CAB_CHROMA_PRED_MODE);
+ ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx = 4;
+ if(value)
+ {
+ ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx =
+ ihevcd_cabac_decode_bypass_bins(ps_cabac,
+ ps_bitstrm, 2);
+ }
+ AEV_TRACE("intra_chroma_pred_mode",
+ ps_codec->s_parse.s_cu.i4_intra_chroma_pred_mode_idx,
+ ps_cabac->u4_range);
+
+
+ ihevcd_intra_pred_mode_prediction(ps_codec, log2_cb_size, x0, y0);
+ }
+ STATS_UPDATE_PU_SIZE(ps_pu);
+ /* Increment PU pointer */
+ ps_codec->s_parse.ps_pu++;
+ ps_codec->s_parse.i4_pic_pu_idx++;
+
+ return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses coding unit
+ *
+ * @par Description:
+ * Parses coding unit as per Section:7.3.9.5
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+
+IHEVCD_ERROR_T ihevcd_parse_coding_unit(codec_t *ps_codec,
+ WORD32 x0,
+ WORD32 y0,
+ WORD32 log2_cb_size)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ WORD32 cb_size;
+ slice_header_t *ps_slice_hdr;
+ WORD32 skip_flag;
+ WORD32 pcm_flag;
+ UWORD32 *pu4_skip_top = ps_codec->s_parse.pu4_skip_cu_top;
+ UWORD32 u4_skip_left = ps_codec->s_parse.u4_skip_cu_left;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+
+ WORD32 cu_pos_x;
+ WORD32 cu_pos_y;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+
+ ASSERT(0 == (x0 % 8));
+ ASSERT(0 == (y0 % 8));
+
+ ps_codec->s_parse.s_cu.i4_tu_cnt = 0;
+ ps_sps = ps_codec->s_parse.ps_sps;
+ ps_pps = ps_codec->s_parse.ps_pps;
+
+ cu_pos_x = ps_codec->s_parse.s_cu.i4_pos_x;
+ cu_pos_y = ps_codec->s_parse.s_cu.i4_pos_y;
+
+
+
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+
+ cb_size = 1 << log2_cb_size;
+
+ ps_codec->s_parse.s_cu.i4_cu_transquant_bypass = 0;
+
+ if(ps_pps->i1_transquant_bypass_enable_flag)
+ {
+ TRACE_CABAC_CTXT("cu_transquant_bypass_flag", ps_cabac->u4_range, IHEVC_CAB_CU_TQ_BYPASS_FLAG);
+ ps_codec->s_parse.s_cu.i4_cu_transquant_bypass =
+ ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+ IHEVC_CAB_CU_TQ_BYPASS_FLAG);
+ /* Update transquant_bypass in ps_tu */
+
+ AEV_TRACE("cu_transquant_bypass_flag", ps_codec->s_parse.s_cu.i4_cu_transquant_bypass,
+ ps_cabac->u4_range);
+
+ if(ps_codec->s_parse.s_cu.i4_cu_transquant_bypass)
+ {
+ UWORD8 *pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+ UWORD32 u4_mask;
+ WORD32 i;
+ WORD32 numbytes_row;
+ numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+ pu1_pic_no_loop_filter_flag += (y0 / 8) * numbytes_row;
+ pu1_pic_no_loop_filter_flag += (x0 / 64);
+
+            /* Generate (cb_size / 8) number of 1s, */
+            /* i.e. (1 << (log2_cb_size - 3)) number of 1s */
+ u4_mask = LSB_ONES((cb_size >> 3));
+ for(i = 0; i < (cb_size / 8); i++)
+ {
+ *pu1_pic_no_loop_filter_flag |= (u4_mask << (((x0) / 8) % 8));
+ pu1_pic_no_loop_filter_flag += numbytes_row;
+ }
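+            /* One bit per 8x8 block; a byte spans 64 luma pixels across, */
+            /* and there is one byte-row per 8 luma rows. e.g. a 16x16 CU */
+            /* sets a 2-bit mask in each of 2 consecutive byte-rows */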
+ }
+ }
+
+ {
+ UWORD32 u4_skip_top = 0;
+ UWORD32 u4_mask;
+ UWORD32 u4_top_mask, u4_left_mask;
+ UWORD32 u4_min_cu_x = x0 / 8;
+ UWORD32 u4_min_cu_y = y0 / 8;
+
+ pu4_skip_top += (u4_min_cu_x / 32);
+
+
+ if(ps_slice_hdr->i1_slice_type != ISLICE)
+ {
+ WORD32 ctx_idx_inc;
+ ctx_idx_inc = 0;
+
+ if((0 != cu_pos_y) ||
+ ((0 != ps_codec->s_parse.i4_ctb_slice_y) &&
+ (0 != ps_codec->s_parse.i4_ctb_tile_y)))
+ {
+ u4_skip_top = *pu4_skip_top;
+ u4_skip_top >>= (u4_min_cu_x % 32);
+ if(u4_skip_top & 1)
+ ctx_idx_inc++;
+ }
+
+ /*****************************************************************/
+ /* If cu_pos_x is non-zero then left is available */
+ /* If cu_pos_x is zero then ensure both the following are true */
+ /* Current CTB is not the first CTB in a tile row */
+ /* Current CTB is not the first CTB in a slice */
+ /*****************************************************************/
+ if((0 != cu_pos_x) ||
+ (((0 != ps_codec->s_parse.i4_ctb_slice_x) || (0 != ps_codec->s_parse.i4_ctb_slice_y)) &&
+ (0 != ps_codec->s_parse.i4_ctb_tile_x)))
+ {
+ u4_skip_left >>= (u4_min_cu_y % 32);
+ if(u4_skip_left & 1)
+ ctx_idx_inc++;
+ }
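+            /* ctx_idx_inc is thus 0, 1 or 2 depending on how many of the */
+            /* available top and left neighbouring CUs were coded as skip */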
+ TRACE_CABAC_CTXT("cu_skip_flag", ps_cabac->u4_range, (IHEVC_CAB_SKIP_FLAG + ctx_idx_inc));
+ skip_flag = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ (IHEVC_CAB_SKIP_FLAG + ctx_idx_inc));
+
+ AEV_TRACE("cu_skip_flag", skip_flag, ps_cabac->u4_range);
+ }
+ else
+ skip_flag = 0;
+
+ /* Update top skip_flag */
+ u4_skip_top = *pu4_skip_top;
+        /* Since the max cb_size is 64, at most 8 bits will be set or reset */
+        /* Also, since the coding block lies within a 64x64 grid, only 8 bits within a WORD32
+         * need to be updated. These 8 bits will not cross 8-bit boundaries
+         */
+ u4_mask = LSB_ONES(cb_size / 8);
+ u4_top_mask = u4_mask << (u4_min_cu_x % 32);
+
+
+ if(skip_flag)
+ {
+ u4_skip_top |= u4_top_mask;
+ }
+ else
+ {
+ u4_skip_top &= ~u4_top_mask;
+ }
+ *pu4_skip_top = u4_skip_top;
+
+ /* Update left skip_flag */
+ u4_skip_left = ps_codec->s_parse.u4_skip_cu_left;
+ u4_mask = LSB_ONES(cb_size / 8);
+ u4_left_mask = u4_mask << (u4_min_cu_y % 32);
+
+ if(skip_flag)
+ {
+ u4_skip_left |= u4_left_mask;
+ }
+ else
+ {
+ u4_skip_left &= ~u4_left_mask;
+ }
+ ps_codec->s_parse.u4_skip_cu_left = u4_skip_left;
+ }
+ ps_codec->s_parse.i4_cu_pcm_flag = 0;
+
+ if(skip_flag)
+ {
+ WORD32 ctb_x_base;
+ WORD32 ctb_y_base;
+
+ ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+ ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+ ps_tu->b1_cb_cbf = 0;
+ ps_tu->b1_cr_cbf = 0;
+ ps_tu->b1_y_cbf = 0;
+ ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+ ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+ ps_tu->b1_transquant_bypass = 0;
+ ps_tu->b3_size = (log2_cb_size - 2);
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ ps_tu->b6_luma_intra_mode = INTRA_PRED_NONE;
+
+ /* Set the first TU in CU flag */
+ {
+ if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+ (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+ {
+ ps_tu->b1_first_tu_in_cu = 1;
+ }
+ else
+ {
+ ps_tu->b1_first_tu_in_cu = 0;
+ }
+ }
+
+ ps_codec->s_parse.ps_tu++;
+ ps_codec->s_parse.s_cu.i4_tu_cnt++;
+ ps_codec->s_parse.i4_pic_tu_idx++;
+
+ ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
+ ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ps_pu->b2_part_idx = 0;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size);
+ STATS_UPDATE_PU_SKIP_SIZE(ps_pu);
+ }
+ }
+ else
+ {
+ WORD32 pred_mode;
+ WORD32 part_mode;
+ WORD32 intra_split_flag;
+ WORD32 is_mincb;
+ cb_size = (1 << log2_cb_size);
+ is_mincb = (cb_size == (1 << ps_sps->i1_log2_min_coding_block_size));
+ pcm_flag = 0;
+ if(ps_slice_hdr->i1_slice_type != ISLICE)
+ {
+ TRACE_CABAC_CTXT("pred_mode_flag", ps_cabac->u4_range, IHEVC_CAB_PRED_MODE);
+ pred_mode = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ IHEVC_CAB_PRED_MODE);
+
+ AEV_TRACE("pred_mode_flag", pred_mode, ps_cabac->u4_range);
+ }
+ else
+ {
+ pred_mode = PRED_MODE_INTRA;
+ }
+
+        /* If the current CU is intra then set the corresponding bits in the picture-level intra map */
+ if(PRED_MODE_INTRA == pred_mode)
+ {
+ UWORD8 *pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+ UWORD32 u4_mask;
+ WORD32 i;
+ WORD32 numbytes_row;
+ numbytes_row = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+ pu1_pic_intra_flag += (y0 / 8) * numbytes_row;
+ pu1_pic_intra_flag += (x0 / 64);
+
+            /* Generate (cb_size / 8) number of 1s, */
+            /* i.e. (1 << (log2_cb_size - 3)) number of 1s */
+ u4_mask = LSB_ONES((cb_size >> 3));
+ for(i = 0; i < (cb_size / 8); i++)
+ {
+ *pu1_pic_intra_flag |= (u4_mask << (((x0) / 8) % 8));
+ pu1_pic_intra_flag += numbytes_row;
+ }
+ }
+
+ ps_codec->s_parse.s_cu.i4_pred_mode = pred_mode;
+ intra_split_flag = 0;
+ if((PRED_MODE_INTRA != pred_mode) ||
+ is_mincb)
+ {
+ UWORD32 bin;
+ if(PRED_MODE_INTRA == pred_mode)
+ {
+ TRACE_CABAC_CTXT("part_mode", ps_cabac->u4_range, IHEVC_CAB_PART_MODE);
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, IHEVC_CAB_PART_MODE);
+ part_mode = (bin) ? PART_2Nx2N : PART_NxN;
+ }
+ else
+ {
+ WORD32 amp_enabled = ps_sps->i1_amp_enabled_flag;
+
+ UWORD32 u4_max_bin_cnt = 0;
+
+
+
+ if(amp_enabled && !is_mincb)
+ {
+ part_mode = ihevcd_parse_part_mode_amp(ps_cabac, ps_bitstrm);
+ }
+ else
+ {
+ WORD32 ctxt_inc = IHEVC_CAB_PART_MODE;
+
+ u4_max_bin_cnt = 2;
+ if((is_mincb) && (cb_size > 8))
+ {
+ u4_max_bin_cnt++;
+ }
+
+ part_mode = -1;
+ TRACE_CABAC_CTXT("part_mode", ps_cabac->u4_range, IHEVC_CAB_PART_MODE);
+ do
+ {
+ bin = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm,
+ ctxt_inc++);
+ part_mode++;
+ }while(--u4_max_bin_cnt && !bin);
+
+ /* If the last bin was zero, then increment part mode by 1 */
+ if(!bin)
+ part_mode++;
+ }
+
+
+ }
+
+ AEV_TRACE("part_mode", part_mode, ps_cabac->u4_range);
+
+ }
+ else
+ {
+ part_mode = 0;
+ intra_split_flag = 0;
+ }
+ ps_codec->s_parse.s_cu.i4_part_mode = part_mode;
+
+ if((PRED_MODE_INTRA == ps_codec->s_parse.s_cu.i4_pred_mode) &&
+ (PART_NxN == ps_codec->s_parse.s_cu.i4_part_mode))
+ {
+ intra_split_flag = 1;
+ }
+ ps_codec->s_parse.s_cu.i4_part_mode = part_mode;
+ ps_codec->s_parse.s_cu.i4_intra_split_flag = intra_split_flag;
+ if(pred_mode == PRED_MODE_INTRA)
+ {
+ ps_codec->s_parse.i4_cu_pcm_flag = 0;
+ ihevcd_parse_coding_unit_intra(ps_codec, x0, y0, log2_cb_size);
+ pcm_flag = ps_codec->s_parse.i4_cu_pcm_flag;
+
+ }
+ else
+ {
+ if(part_mode == PART_2Nx2N)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size);
+ ps_pu->b2_part_idx = 0;
+ }
+ else if(part_mode == PART_2NxN)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size / 2);
+ ps_pu->b2_part_idx = 0;
+
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size / 2), cb_size, cb_size / 2);
+
+ ps_pu->b2_part_idx = 1;
+ }
+ else if(part_mode == PART_Nx2N)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size / 2, cb_size);
+ ps_pu->b2_part_idx = 0;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 2), y0, cb_size / 2, cb_size);
+
+ ps_pu->b2_part_idx = 1;
+ }
+ else if(part_mode == PART_2NxnU)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size / 4);
+ ps_pu->b2_part_idx = 0;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size / 4), cb_size, cb_size * 3 / 4);
+
+ ps_pu->b2_part_idx = 1;
+ }
+ else if(part_mode == PART_2NxnD)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size, cb_size * 3 / 4);
+ ps_pu->b2_part_idx = 0;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size * 3 / 4), cb_size, cb_size / 4);
+
+ ps_pu->b2_part_idx = 1;
+ }
+ else if(part_mode == PART_nLx2N)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size / 4, cb_size);
+ ps_pu->b2_part_idx = 0;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 4), y0, cb_size * 3 / 4, cb_size);
+
+ ps_pu->b2_part_idx = 1;
+ }
+ else if(part_mode == PART_nRx2N)
+ {
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size * 3 / 4, cb_size);
+ ps_pu->b2_part_idx = 0;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size * 3 / 4), y0, cb_size / 4, cb_size);
+ ps_pu->b2_part_idx = 1;
+ }
+ else
+ { /* PART_NxN */
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0, cb_size / 2, cb_size / 2);
+ ps_pu->b2_part_idx = 0;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 2), y0, cb_size / 2, cb_size / 2);
+
+ ps_pu->b2_part_idx = 1;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0, y0 + (cb_size / 2), cb_size / 2, cb_size / 2);
+
+ ps_pu->b2_part_idx = 2;
+ ps_pu = ps_codec->s_parse.ps_pu;
+ ihevcd_parse_prediction_unit(ps_codec, x0 + (cb_size / 2), y0 + (cb_size / 2), cb_size / 2, cb_size / 2);
+
+ ps_pu->b2_part_idx = 3;
+ }
+ }
+
+ if(!pcm_flag)
+ {
+ WORD32 no_residual_syntax_flag = 0;
+ pu_t *ps_pu;
+ /* Since ps_pu is incremented for each PU parsed, decrement by 1 to
+ * access last decoded PU
+ */
+ ps_pu = ps_codec->s_parse.ps_pu - 1;
+ if((PRED_MODE_INTRA != pred_mode) &&
+ (!((part_mode == PART_2Nx2N) && ps_pu->b1_merge_flag)))
+ {
+
+ TRACE_CABAC_CTXT("rqt_root_cbf", ps_cabac->u4_range, IHEVC_CAB_NORES_IDX);
+ no_residual_syntax_flag = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ IHEVC_CAB_NORES_IDX);
+
+ AEV_TRACE("rqt_root_cbf", no_residual_syntax_flag,
+ ps_cabac->u4_range);
+ /* TODO: HACK FOR COMPLIANCE WITH HM REFERENCE DECODER */
+ /*********************************************************/
+ /* The HM decoder currently expects rqt_root_cbf instead */
+ /* of no_residue_flag, which has the opposite meaning. */
+ /* The inversion below will be removed once the software */
+ /* / spec is fixed. */
+ /*********************************************************/
+ no_residual_syntax_flag = 1 - no_residual_syntax_flag;
+ }
+
+ if(!no_residual_syntax_flag)
+ {
+
+ ps_codec->s_parse.s_cu.i4_max_trafo_depth = (pred_mode == PRED_MODE_INTRA) ?
+ (ps_sps->i1_max_transform_hierarchy_depth_intra + intra_split_flag) :
+ (ps_sps->i1_max_transform_hierarchy_depth_inter);
+ ihevcd_parse_transform_tree(ps_codec, x0, y0, x0, y0,
+ log2_cb_size, 0, 0,
+ ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0]);
+ }
+ else
+ {
+ WORD32 ctb_x_base;
+ WORD32 ctb_y_base;
+
+ ctb_x_base = ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size;
+ ctb_y_base = ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size;
+
+ ps_tu = ps_codec->s_parse.ps_tu;
+ ps_tu->b1_cb_cbf = 0;
+ ps_tu->b1_cr_cbf = 0;
+ ps_tu->b1_y_cbf = 0;
+ ps_tu->b4_pos_x = ((x0 - ctb_x_base) >> 2);
+ ps_tu->b4_pos_y = ((y0 - ctb_y_base) >> 2);
+ ps_tu->b1_transquant_bypass = 0;
+ ps_tu->b3_size = (log2_cb_size - 2);
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ ps_tu->b6_luma_intra_mode = ps_codec->s_parse.s_cu.ai4_intra_luma_pred_mode[0];
+
+ /* Set the first TU in CU flag */
+ {
+ if((ps_codec->s_parse.s_cu.i4_pos_x << 3) == (ps_tu->b4_pos_x << 2) &&
+ (ps_codec->s_parse.s_cu.i4_pos_y << 3) == (ps_tu->b4_pos_y << 2))
+ {
+ ps_tu->b1_first_tu_in_cu = 1;
+ }
+ else
+ {
+ ps_tu->b1_first_tu_in_cu = 0;
+ }
+ }
+ ps_codec->s_parse.ps_tu++;
+ ps_codec->s_parse.s_cu.i4_tu_cnt++;
+ ps_codec->s_parse.i4_pic_tu_idx++;
+
+ }
+ }
+
+ }
+
+
+
+
+ return ret;
+}
+
+
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses Coding Quad Tree
+ *
+ * @par Description:
+ * Parses Coding Quad Tree as per Section:7.3.9.4
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
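+ * Recursively splits the CTB in z-scan order (top-left, top-right,
+ * bottom-left, bottom-right); sub-trees that fall outside the picture
+ * are not signaled and hence not parsed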
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_coding_quadtree(codec_t *ps_codec,
+ WORD32 x0,
+ WORD32 y0,
+ WORD32 log2_cb_size,
+ WORD32 ct_depth)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ WORD32 split_cu_flag;
+ WORD32 x1, y1;
+ WORD32 cu_pos_x;
+ WORD32 cu_pos_y;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+ WORD32 cb_size = 1 << log2_cb_size;
+ ps_sps = ps_codec->s_parse.ps_sps;
+ ps_pps = ps_codec->s_parse.ps_pps;
+
+ /* Compute CU position with respect to current CTB in (8x8) units */
+ cu_pos_x = (x0 - (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size)) >> 3;
+ cu_pos_y = (y0 - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size)) >> 3;
+
+ ps_codec->s_parse.s_cu.i4_pos_x = cu_pos_x;
+ ps_codec->s_parse.s_cu.i4_pos_y = cu_pos_y;
+
+ ps_codec->s_parse.s_cu.i4_log2_cb_size = log2_cb_size;
+
+ ps_codec->s_parse.i4_ct_depth = ct_depth;
+ {
+ UWORD32 *pu4_ct_depth_top = ps_codec->s_parse.pu4_ct_depth_top;
+ UWORD32 u4_ct_depth_left = ps_codec->s_parse.u4_ct_depth_left;
+ UWORD32 u4_ct_depth_top = 0;
+ UWORD32 u4_mask;
+ UWORD32 u4_top_mask, u4_left_mask;
+ WORD32 ctxt_idx;
+ UWORD32 u4_min_cu_x = x0 / 8;
+ UWORD32 u4_min_cu_y = y0 / 8;
+
+ pu4_ct_depth_top += (u4_min_cu_x / 16);
+
+
+
+
+ if(((x0 + (1 << log2_cb_size)) <= ps_sps->i2_pic_width_in_luma_samples) &&
+ ((y0 + (1 << log2_cb_size)) <= ps_sps->i2_pic_height_in_luma_samples) &&
+ (log2_cb_size > ps_sps->i1_log2_min_coding_block_size))
+ {
+
+ ctxt_idx = IHEVC_CAB_SPLIT_CU_FLAG;
+ /* The split_cu context increment is decided based on the left and
+ * top coding tree depths, which are stored at frame level
+ */
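+ /* ctxt_idx thus takes values IHEVC_CAB_SPLIT_CU_FLAG + {0, 1, 2}:
+ * it is incremented once for each available neighbour (top, left)
+ * whose coding tree depth exceeds the current ct_depth */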
+ /* Check if the CTB is in first row in the current slice or tile */
+ if((0 != cu_pos_y) ||
+ ((0 != ps_codec->s_parse.i4_ctb_slice_y) &&
+ (0 != ps_codec->s_parse.i4_ctb_tile_y)))
+ {
+ u4_ct_depth_top = *pu4_ct_depth_top;
+ u4_ct_depth_top >>= ((u4_min_cu_x % 16) * 2);
+ u4_ct_depth_top &= 3;
+
+ if((WORD32)u4_ct_depth_top > ct_depth)
+ ctxt_idx++;
+ }
+
+ /* Check if the CTB is in first column in the current slice or tile */
+ /*****************************************************************/
+ /* If cu_pos_x is non-zero then left is available */
+ /* If cu_pos_x is zero then ensure both the following are true */
+ /* Current CTB is not the first CTB in a tile row */
+ /* Current CTB is not the first CTB in a slice */
+ /*****************************************************************/
+ if((0 != cu_pos_x) ||
+ (((0 != ps_codec->s_parse.i4_ctb_slice_x) || (0 != ps_codec->s_parse.i4_ctb_slice_y)) &&
+ (0 != ps_codec->s_parse.i4_ctb_tile_x)))
+ {
+ u4_ct_depth_left >>= ((u4_min_cu_y % 16) * 2);
+ u4_ct_depth_left &= 3;
+ if((WORD32)u4_ct_depth_left > ct_depth)
+ ctxt_idx++;
+ }
+ TRACE_CABAC_CTXT("split_cu_flag", ps_cabac->u4_range, ctxt_idx);
+ split_cu_flag = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ AEV_TRACE("split_cu_flag", split_cu_flag, ps_cabac->u4_range);
+ }
+ else
+ {
+ if(log2_cb_size > ps_sps->i1_log2_min_coding_block_size)
+ split_cu_flag = 1;
+ else
+ split_cu_flag = 0;
+ }
+
+ if(0 == split_cu_flag)
+ {
+ /* Update top ct_depth */
+ u4_ct_depth_top = *pu4_ct_depth_top;
+ /* Since the max cb_size is 64, a maximum of 8 bits will be set or reset */
+ /* Also, since the coding block will be within a 64x64 grid, only 8 bits within a WORD32
+ * need to be updated. These 8 bits will not cross 8-bit boundaries
+ */
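+ /* Example, assuming DUP_LSB_11(n) replicates the bit pair '11' n times:
+ * for cb_size = 32, u4_mask = DUP_LSB_11(4) = 0xFF, covering four 8x8
+ * units at two ct_depth bits each */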
+ u4_mask = DUP_LSB_11(cb_size / 8);
+
+ u4_top_mask = u4_mask << ((u4_min_cu_x % 16) * 2);
+ u4_ct_depth_top &= ~u4_top_mask;
+
+ if(ct_depth)
+ {
+ u4_top_mask = gau4_ct_depth_mask[ct_depth] & u4_mask;
+
+ u4_top_mask = u4_top_mask << ((u4_min_cu_x % 16) * 2);
+ u4_ct_depth_top |= u4_top_mask;
+ }
+
+ *pu4_ct_depth_top = u4_ct_depth_top;
+
+ /* Update left ct_depth */
+ u4_ct_depth_left = ps_codec->s_parse.u4_ct_depth_left;
+
+ u4_left_mask = u4_mask << ((u4_min_cu_y % 16) * 2);
+
+ u4_ct_depth_left &= ~u4_left_mask;
+ if(ct_depth)
+ {
+ u4_left_mask = gau4_ct_depth_mask[ct_depth] & u4_mask;
+
+ u4_left_mask = u4_left_mask << ((u4_min_cu_y % 16) * 2);
+ u4_ct_depth_left |= u4_left_mask;
+ }
+
+ ps_codec->s_parse.u4_ct_depth_left = u4_ct_depth_left;
+ }
+ }
+ if((ps_pps->i1_cu_qp_delta_enabled_flag) &&
+ (log2_cb_size >= ps_pps->i1_log2_min_cu_qp_delta_size))
+ {
+ ps_codec->s_parse.i4_is_cu_qp_delta_coded = 0;
+ ps_codec->s_parse.i4_cu_qp_delta = 0;
+ }
+ if(split_cu_flag)
+ {
+ x1 = x0 + ((1 << log2_cb_size) >> 1);
+ y1 = y0 + ((1 << log2_cb_size) >> 1);
+
+ ihevcd_parse_coding_quadtree(ps_codec, x0, y0, log2_cb_size - 1, ct_depth + 1);
+
+ /* At frame boundaries coding quadtree nodes are sent only if they fall within the frame */
+ if(x1 < ps_sps->i2_pic_width_in_luma_samples)
+ ihevcd_parse_coding_quadtree(ps_codec, x1, y0, log2_cb_size - 1, ct_depth + 1);
+
+ if(y1 < ps_sps->i2_pic_height_in_luma_samples)
+ ihevcd_parse_coding_quadtree(ps_codec, x0, y1, log2_cb_size - 1, ct_depth + 1);
+
+ if((x1 < ps_sps->i2_pic_width_in_luma_samples) &&
+ (y1 < ps_sps->i2_pic_height_in_luma_samples))
+ ihevcd_parse_coding_quadtree(ps_codec, x1, y1, log2_cb_size - 1, ct_depth + 1);
+ }
+ else
+ {
+ /* Set current group QP if current CU is aligned with the group */
+ {
+ WORD32 cu_pos_x = ps_codec->s_parse.s_cu.i4_pos_x << 3;
+ WORD32 cu_pos_y = ps_codec->s_parse.s_cu.i4_pos_y << 3;
+
+ WORD32 qpg_x = (cu_pos_x - (cu_pos_x & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1)));
+ WORD32 qpg_y = (cu_pos_y - (cu_pos_y & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1)));
+
+ if((cu_pos_x == qpg_x) &&
+ (cu_pos_y == qpg_y))
+ {
+ ps_codec->s_parse.u4_qpg = ps_codec->s_parse.u4_qp;
+
+ ps_codec->s_parse.s_cu.i4_cu_qp_delta = 0;
+
+ }
+ }
+
+ ihevcd_parse_coding_unit(ps_codec, x0, y0, log2_cb_size);
+
+ if(ps_pps->i1_cu_qp_delta_enabled_flag)
+ {
+ WORD32 qp_pred, qp_left, qp_top;
+ WORD32 cu_pos_x;
+ WORD32 cu_pos_y;
+ WORD32 qpg_x;
+ WORD32 qpg_y;
+ WORD32 i, j;
+ WORD32 qp;
+ WORD32 cur_cu_offset;
+ tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+ WORD32 cb_size = 1 << ps_codec->s_parse.s_cu.i4_log2_cb_size;
+
+ cu_pos_x = ps_codec->s_parse.s_cu.i4_pos_x << 3;
+ cu_pos_y = ps_codec->s_parse.s_cu.i4_pos_y << 3;
+
+ qpg_x = (cu_pos_x - (cu_pos_x & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1))) >> 3;
+ qpg_y = (cu_pos_y - (cu_pos_y & ((1 << ps_pps->i1_log2_min_cu_qp_delta_size) - 1))) >> 3;
+
+ /*previous coded Qp*/
+ qp_left = ps_codec->s_parse.u4_qpg;
+ qp_top = ps_codec->s_parse.u4_qpg;
+
+ if(qpg_x > 0)
+ {
+ qp_left = ps_codec->s_parse.ai1_8x8_cu_qp[qpg_x - 1 + (qpg_y * 8)];
+ }
+ if(qpg_y > 0)
+ {
+ qp_top = ps_codec->s_parse.ai1_8x8_cu_qp[qpg_x + ((qpg_y - 1) * 8)];
+ }
+
+ qp_pred = (qp_left + qp_top + 1) >> 1;
+ /* Since qp_pred + ps_codec->s_parse.s_cu.i4_cu_qp_delta can be negative,
+ 52 is added before taking modulo 52 */
+ qp = (qp_pred + ps_codec->s_parse.s_cu.i4_cu_qp_delta + 52) % 52;
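+ /* Example: qp_pred = 4 with i4_cu_qp_delta = -10 gives
+ * (4 - 10 + 52) % 52 = 46, the correct wrap-around; without the
+ * +52 the modulo of a negative sum would be wrong */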
+
+ cur_cu_offset = (cu_pos_x >> 3) + cu_pos_y;
+ for(i = 0; i < (cb_size >> 3); i++)
+ {
+ for(j = 0; j < (cb_size >> 3); j++)
+ {
+ ps_codec->s_parse.ai1_8x8_cu_qp[cur_cu_offset + (i * 8) + j] = qp;
+ }
+ }
+
+ ps_codec->s_parse.u4_qp = qp;
+ ps_codec->s_parse.s_cu.i4_qp = qp;
+
+
+ /* When change in QP is signaled, update the QP in TUs that are already parsed in the CU */
+ {
+ tu_t *ps_tu_tmp;
+ ps_tu_tmp = ps_tu - ps_codec->s_parse.s_cu.i4_tu_cnt;
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+ while(ps_tu_tmp != ps_tu)
+ {
+ ps_tu_tmp->b7_qp = ps_codec->s_parse.u4_qp;
+
+ ps_tu_tmp++;
+ }
+ }
+ if(ps_codec->s_parse.s_cu.i4_cu_qp_delta)
+ {
+ WORD32 ctb_indx;
+ ctb_indx = ps_codec->s_parse.i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_codec->s_parse.i4_ctb_y;
+ ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb[ctb_indx >> 3] &= (~(1 << (ctb_indx & 7)));
+ }
+
+ }
+
+ }
+
+
+
+
+ return ret;
+}
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses SAO (Sample adaptive offset syntax)
+ *
+ * @par Description:
+ * Parses SAO (Sample adaptive offset syntax) as per Section:7.3.9.3
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
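+ * When sao_merge_left_flag or sao_merge_up_flag is set, the entire sao_t
+ * of the left or top CTB is copied and no further SAO syntax is parsed
+ * for the current CTB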
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_sao(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ sps_t *ps_sps;
+ sao_t *ps_sao;
+ WORD32 rx;
+ WORD32 ry;
+ WORD32 value;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 sao_merge_left_flag;
+ WORD32 sao_merge_up_flag;
+ slice_header_t *ps_slice_hdr;
+ cab_ctxt_t *ps_cabac = &ps_codec->s_parse.s_cabac;
+ WORD32 ctxt_idx;
+
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base;
+ ps_slice_hdr += (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+
+ ps_sps = (ps_codec->s_parse.ps_sps);
+ rx = ps_codec->s_parse.i4_ctb_x;
+ ry = ps_codec->s_parse.i4_ctb_y;
+
+ ps_sao = ps_codec->s_parse.ps_pic_sao + rx + ry * ps_sps->i2_pic_wd_in_ctb;
+
+ /* Default values */
+ ps_sao->b3_y_type_idx = 0;
+ ps_sao->b3_cb_type_idx = 0;
+ ps_sao->b3_cr_type_idx = 0;
+
+ UNUSED(value);
+ ctxt_idx = IHEVC_CAB_SAO_MERGE;
+ sao_merge_left_flag = 0;
+ sao_merge_up_flag = 0;
+ if(rx > 0)
+ {
+ /*TODO: Implemented only for slices; the condition for tiles is not tested*/
+ if(((0 != ps_codec->s_parse.i4_ctb_slice_x) || (0 != ps_codec->s_parse.i4_ctb_slice_y)) &&
+ (0 != ps_codec->s_parse.i4_ctb_tile_x))
+ {
+
+ TRACE_CABAC_CTXT("sao_merge_flag", ps_cabac->u4_range, ctxt_idx);
+ sao_merge_left_flag = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ ctxt_idx);
+ AEV_TRACE("sao_merge_flag", sao_merge_left_flag, ps_cabac->u4_range);
+ }
+
+ }
+ if(ry > 0 && !sao_merge_left_flag)
+ {
+ if((ps_codec->s_parse.i4_ctb_slice_y > 0) && (ps_codec->s_parse.i4_ctb_tile_y > 0))
+ {
+ TRACE_CABAC_CTXT("sao_merge_flag", ps_cabac->u4_range, ctxt_idx);
+ sao_merge_up_flag = ihevcd_cabac_decode_bin(ps_cabac,
+ ps_bitstrm,
+ ctxt_idx);
+ AEV_TRACE("sao_merge_flag", sao_merge_up_flag, ps_cabac->u4_range);
+ }
+ }
+ ctxt_idx = IHEVC_CAB_SAO_TYPE;
+
+ if(sao_merge_left_flag)
+ {
+ *ps_sao = *(ps_sao - 1);
+ }
+ else if(sao_merge_up_flag)
+ {
+ *ps_sao = *(ps_sao - ps_sps->i2_pic_wd_in_ctb);
+ }
+ else // if(!sao_merge_up_flag && !sao_merge_left_flag)
+ {
+ WORD32 c_idx;
+ WORD32 sao_type_idx = 0;
+ for(c_idx = 0; c_idx < 3; c_idx++)
+ {
+ if((ps_slice_hdr->i1_slice_sao_luma_flag && c_idx == 0) || (ps_slice_hdr->i1_slice_sao_chroma_flag && c_idx > 0))
+ {
+
+
+ /* sao_type_idx will be the same for c_idx == 1 and c_idx == 2; hence it is not initialized to zero for c_idx == 2 */
+
+ if(c_idx == 0)
+ {
+ sao_type_idx = 0;
+ TRACE_CABAC_CTXT("sao_type_idx", ps_cabac->u4_range, ctxt_idx);
+ sao_type_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+
+ if(sao_type_idx)
+ {
+ sao_type_idx += ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ }
+ AEV_TRACE("sao_type_idx", sao_type_idx, ps_cabac->u4_range);
+
+ ps_sao->b3_y_type_idx = sao_type_idx;
+ }
+ if(c_idx == 1)
+ {
+ sao_type_idx = 0;
+ TRACE_CABAC_CTXT("sao_type_idx", ps_cabac->u4_range, ctxt_idx);
+ sao_type_idx = ihevcd_cabac_decode_bin(ps_cabac, ps_bitstrm, ctxt_idx);
+ if(sao_type_idx)
+ {
+ sao_type_idx += ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ }
+
+ AEV_TRACE("sao_type_idx", sao_type_idx, ps_cabac->u4_range);
+
+ ps_sao->b3_cb_type_idx = sao_type_idx;
+ ps_sao->b3_cr_type_idx = sao_type_idx;
+ }
+
+ if(sao_type_idx != 0)
+ {
+ WORD32 i;
+ WORD32 sao_offset[4];
+ WORD32 sao_band_position = 0;
+ WORD32 c_max = (1 << (MIN(BIT_DEPTH, 10) - 5)) - 1;
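+ /* For an 8-bit decode, c_max = (1 << (8 - 5)) - 1 = 7, the maximum
+ * sao_offset_abs value allowed by the truncated unary binarization */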
+ for(i = 0; i < 4; i++)
+ {
+ sao_offset[i] = ihevcd_cabac_decode_bypass_bins_tunary(ps_cabac, ps_bitstrm, c_max);
+ AEV_TRACE("sao_offset_abs", sao_offset[i], ps_cabac->u4_range);
+
+ if((2 == sao_type_idx) && (i > 1))
+ {
+ sao_offset[i] = -sao_offset[i];
+ }
+ }
+
+ if(sao_type_idx == 1)
+ {
+ for(i = 0; i < 4; i++)
+ {
+ if(sao_offset[i] != 0)
+ {
+ value = ihevcd_cabac_decode_bypass_bin(ps_cabac, ps_bitstrm);
+ AEV_TRACE("sao_offset_sign", value, ps_cabac->u4_range);
+
+ if(value)
+ {
+ sao_offset[i] = -sao_offset[i];
+ }
+ }
+ }
+ value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 5);
+ AEV_TRACE("sao_band_position", value, ps_cabac->u4_range);
+
+ sao_band_position = value;
+ }
+ else
+ {
+ if(c_idx == 0)
+ {
+ value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 2);
+ AEV_TRACE("sao_eo_class", value, ps_cabac->u4_range);
+
+ ps_sao->b3_y_type_idx += value;
+ }
+
+ if(c_idx == 1)
+ {
+ value = ihevcd_cabac_decode_bypass_bins(ps_cabac, ps_bitstrm, 2);
+ AEV_TRACE("sao_eo_class", value, ps_cabac->u4_range);
+
+ ps_sao->b3_cb_type_idx += value;
+ ps_sao->b3_cr_type_idx += value;
+ }
+ }
+
+ if(0 == c_idx)
+ {
+ ps_sao->b4_y_offset_1 = sao_offset[0];
+ ps_sao->b4_y_offset_2 = sao_offset[1];
+ ps_sao->b4_y_offset_3 = sao_offset[2];
+ ps_sao->b4_y_offset_4 = sao_offset[3];
+
+ ps_sao->b5_y_band_pos = sao_band_position;
+ }
+ else if(1 == c_idx)
+ {
+ ps_sao->b4_cb_offset_1 = sao_offset[0];
+ ps_sao->b4_cb_offset_2 = sao_offset[1];
+ ps_sao->b4_cb_offset_3 = sao_offset[2];
+ ps_sao->b4_cb_offset_4 = sao_offset[3];
+
+ ps_sao->b5_cb_band_pos = sao_band_position;
+ }
+ else // 2 == c_idx
+ {
+ ps_sao->b4_cr_offset_1 = sao_offset[0];
+ ps_sao->b4_cr_offset_2 = sao_offset[1];
+ ps_sao->b4_cr_offset_3 = sao_offset[2];
+ ps_sao->b4_cr_offset_4 = sao_offset[3];
+
+ ps_sao->b5_cr_band_pos = sao_band_position;
+ }
+ }
+ }
+ }
+ }
+
+ return ret;
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * Parses Slice data syntax
+ *
+ * @par Description:
+ * Parses Slice data syntax as per Section:7.3.9.1
+ *
+ * @param[in] ps_codec
+ * Pointer to codec context
+ *
+ * @returns Error from IHEVCD_ERROR_T
+ *
+ * @remarks
+ *
+ *
+ *******************************************************************************
+ */
+IHEVCD_ERROR_T ihevcd_parse_slice_data(codec_t *ps_codec)
+{
+
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 end_of_slice_flag;
+ sps_t *ps_sps;
+ pps_t *ps_pps;
+ slice_header_t *ps_slice_hdr;
+ WORD32 end_of_pic;
+ tile_t *ps_tile, *ps_tile_prev;
+ WORD32 i;
+ WORD32 ctb_addr;
+ WORD32 tile_idx;
+ WORD32 cabac_init_idc;
+ WORD32 ctb_size;
+ WORD32 num_ctb_in_row;
+ WORD32 num_min4x4_in_ctb;
+ WORD32 slice_qp;
+ WORD32 slice_start_ctb_idx;
+ WORD32 tile_start_ctb_idx;
+
+#ifdef GPU_BUILD
+ WORD32 total_ctb_cnt = 0;
+ proc_job_t s_job;
+ gpu_ctxt_t *ps_gpu = &ps_codec->s_gpu_ctxt;
+#endif
+
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base;
+ ps_pps = ps_codec->s_parse.ps_pps_base;
+ ps_sps = ps_codec->s_parse.ps_sps_base;
+
+ /* Get current slice header, pps and sps */
+ ps_slice_hdr += (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+ ps_pps += ps_slice_hdr->i1_pps_id;
+ ps_sps += ps_pps->i1_sps_id;
+
+ if(0 != ps_codec->s_parse.i4_cur_slice_idx)
+ {
+ if(!ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ ps_codec->s_parse.i4_cur_independent_slice_idx++;
+ if(MAX_SLICE_HDR_CNT == ps_codec->s_parse.i4_cur_independent_slice_idx)
+ ps_codec->s_parse.i4_cur_independent_slice_idx = 0;
+ }
+ }
+
+
+ ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+ num_min4x4_in_ctb = (ctb_size / 4) * (ctb_size / 4);
+ num_ctb_in_row = ps_sps->i2_pic_wd_in_ctb;
+
+ /* Update the parse context */
+ if(0 == ps_codec->i4_slice_error)
+ {
+ ps_codec->s_parse.i4_ctb_x = ps_slice_hdr->i2_ctb_x;
+ ps_codec->s_parse.i4_ctb_y = ps_slice_hdr->i2_ctb_y;
+ }
+ ps_codec->s_parse.ps_pps = ps_pps;
+ ps_codec->s_parse.ps_sps = ps_sps;
+ ps_codec->s_parse.ps_slice_hdr = ps_slice_hdr;
+
+ /* Derive Tile positions for the current CTB */
+ /* Change this to lookup if required */
+ ihevcd_get_tile_pos(ps_pps, ps_sps, ps_codec->s_parse.i4_ctb_x,
+ ps_codec->s_parse.i4_ctb_y,
+ &ps_codec->s_parse.i4_ctb_tile_x,
+ &ps_codec->s_parse.i4_ctb_tile_y,
+ &tile_idx);
+ ps_codec->s_parse.ps_tile = ps_pps->ps_tile + tile_idx;
+ ps_codec->s_parse.i4_cur_tile_idx = tile_idx;
+ ps_tile = ps_codec->s_parse.ps_tile;
+ if(tile_idx)
+ ps_tile_prev = ps_tile - 1;
+ else
+ ps_tile_prev = ps_tile;
+
+ /* If the present slice is dependent, then store the previous
+ * independent slices' ctb x and y values for decoding process */
+ if(0 == ps_codec->i4_slice_error)
+ {
+ if(1 == ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ /*If slice is present at the start of a new tile*/
+ if((0 == ps_codec->s_parse.i4_ctb_tile_x) && (0 == ps_codec->s_parse.i4_ctb_tile_y))
+ {
+ ps_codec->s_parse.i4_ctb_slice_x = 0;
+ ps_codec->s_parse.i4_ctb_slice_y = 0;
+ }
+ }
+
+ if(!ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ ps_codec->s_parse.i4_ctb_slice_x = 0;
+ ps_codec->s_parse.i4_ctb_slice_y = 0;
+ }
+ }
+
+ /* Frame level initializations */
+ if((0 == ps_codec->s_parse.i4_ctb_y) &&
+ (0 == ps_codec->s_parse.i4_ctb_x))
+ {
+ ret = ihevcd_parse_pic_init(ps_codec);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ ps_codec->s_parse.pu4_pic_tu_idx[0] = 0;
+ ps_codec->s_parse.pu4_pic_pu_idx[0] = 0;
+ ps_codec->s_parse.i4_cur_independent_slice_idx = 0;
+ ps_codec->s_parse.i4_ctb_tile_x = 0;
+ ps_codec->s_parse.i4_ctb_tile_y = 0;
+ }
+
+ {
+ /* Updating the poc list of current slice to ps_mv_buf */
+ mv_buf_t *ps_mv_buf = ps_codec->s_parse.ps_cur_mv_buf;
+
+ if(ps_slice_hdr->i1_num_ref_idx_l1_active != 0)
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ ps_mv_buf->l1_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->i4_abs_poc;
+ ps_mv_buf->u1_l1_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_pic_buf)->u1_used_as_ref;
+ }
+ }
+
+ if(ps_slice_hdr->i1_num_ref_idx_l0_active != 0)
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ ps_mv_buf->l0_collocated_poc[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->i4_abs_poc;
+ ps_mv_buf->u1_l0_collocated_poc_lt[(ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1))][i] = ((pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_pic_buf)->u1_used_as_ref;
+ }
+ }
+ }
+
+ /*Initialize the low delay flag at the beginning of every slice*/
+ if((0 == ps_codec->s_parse.i4_ctb_slice_x) || (0 == ps_codec->s_parse.i4_ctb_slice_y))
+ {
+ /* Lowdelay flag */
+ WORD32 cur_poc, ref_list_poc, flag = 1;
+ cur_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ ref_list_poc = ((mv_buf_t *)ps_slice_hdr->as_ref_pic_list0[i].pv_mv_buf)->i4_abs_poc;
+ if(ref_list_poc > cur_poc)
+ {
+ flag = 0;
+ break;
+ }
+ }
+ if(flag && (ps_slice_hdr->i1_slice_type == BSLICE))
+ {
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ ref_list_poc = ((mv_buf_t *)ps_slice_hdr->as_ref_pic_list1[i].pv_mv_buf)->i4_abs_poc;
+ if(ref_list_poc > cur_poc)
+ {
+ flag = 0;
+ break;
+ }
+ }
+ }
+ ps_slice_hdr->i1_low_delay_flag = flag;
+ }
+
+ /* initialize the cabac init idc based on slice type */
+ if(ps_slice_hdr->i1_slice_type == ISLICE)
+ {
+ cabac_init_idc = 0;
+ }
+ else if(ps_slice_hdr->i1_slice_type == PSLICE)
+ {
+ cabac_init_idc = ps_slice_hdr->i1_cabac_init_flag ? 2 : 1;
+ }
+ else
+ {
+ cabac_init_idc = ps_slice_hdr->i1_cabac_init_flag ? 1 : 2;
+ }
+
+ slice_qp = ps_slice_hdr->i1_slice_qp_delta + ps_pps->i1_pic_init_qp;
+ slice_qp = CLIP3(slice_qp, 0, 51);
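+ /* Example: i1_pic_init_qp = 26 and i1_slice_qp_delta = 5 gives
+ * slice_qp = 31, clipped to the valid HEVC QP range [0, 51] */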
+
+ /*Update QP value for every independent slice or for every dependent slice that begins at the start of a new tile*/
+ if((0 == ps_slice_hdr->i1_dependent_slice_flag) ||
+ ((1 == ps_slice_hdr->i1_dependent_slice_flag) && ((0 == ps_codec->s_parse.i4_ctb_tile_x) && (0 == ps_codec->s_parse.i4_ctb_tile_y))))
+ {
+ ps_codec->s_parse.u4_qp = slice_qp;
+ }
+
+ /*Cabac init at the beginning of a slice*/
+ //If the slice is a dependent slice that does not start at the beginning of a tile
+ if((1 == ps_slice_hdr->i1_dependent_slice_flag) && (!((ps_codec->s_parse.i4_ctb_tile_x == 0) && (ps_codec->s_parse.i4_ctb_tile_y == 0))))
+ {
+ if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
+ {
+ ihevcd_cabac_reset(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm);
+ }
+ }
+ else if((0 == ps_pps->i1_entropy_coding_sync_enabled_flag) || (ps_pps->i1_entropy_coding_sync_enabled_flag && (0 != ps_codec->s_parse.i4_ctb_x)))
+ {
+ ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm,
+ slice_qp,
+ cabac_init_idc,
+ &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+ }
+
+
+ do
+ {
+
+ {
+ WORD32 cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+ + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ if(1 == ps_codec->i4_num_cores && 0 == cur_ctb_idx % RESET_TU_BUF_NCTB)
+ {
+ ps_codec->s_parse.ps_tu = ps_codec->s_parse.ps_pic_tu;
+ ps_codec->s_parse.i4_pic_tu_idx = 0;
+ }
+ }
+
+ end_of_pic = 0;
+ /* Section:7.3.7 Coding tree unit syntax */
+ /* coding_tree_unit() inlined here */
+ /* If number of cores is greater than 1, then add job to the queue */
+ //TODO: Dual core implementation might need a different algo for better load balancing
+ /* At the start of ctb row parsing in a tile, queue a job for processing the current tile row */
+ ps_codec->s_parse.i4_ctb_num_pcm_blks = 0;
+
+
+ /*At the beginning of each tile that is not also the beginning of a slice, the cabac context
+ * must be initialized. Hence, check for the tile beginning here */
+ if(((0 == ps_codec->s_parse.i4_ctb_tile_x) && (0 == ps_codec->s_parse.i4_ctb_tile_y))
+ && (!((ps_tile->u1_pos_x == 0) && (ps_tile->u1_pos_y == 0)))
+ && (!((0 == ps_codec->s_parse.i4_ctb_slice_x) && (0 == ps_codec->s_parse.i4_ctb_slice_y))))
+ {
+ slice_qp = ps_slice_hdr->i1_slice_qp_delta + ps_pps->i1_pic_init_qp;
+ slice_qp = CLIP3(slice_qp, 0, 51);
+ ps_codec->s_parse.u4_qp = slice_qp;
+
+ ihevcd_get_tile_pos(ps_pps, ps_sps, ps_codec->s_parse.i4_ctb_x,
+ ps_codec->s_parse.i4_ctb_y,
+ &ps_codec->s_parse.i4_ctb_tile_x,
+ &ps_codec->s_parse.i4_ctb_tile_y,
+ &tile_idx);
+
+ ps_codec->s_parse.ps_tile = ps_pps->ps_tile + tile_idx;
+ ps_codec->s_parse.i4_cur_tile_idx = tile_idx;
+ ps_tile_prev = ps_tile - 1;
+
+ tile_start_ctb_idx = ps_tile->u1_pos_x
+ + ps_tile->u1_pos_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x
+ + ps_slice_hdr->i2_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ /*For slices that span across multiple tiles*/
+ if(slice_start_ctb_idx < tile_start_ctb_idx)
+ { /* 2 Cases
+ * 1 - Slice spans the frame width but does not start from the first column
+ * 2 - Slice spans multiple tiles anywhere in a frame
+ */
+ ps_codec->s_parse.i4_ctb_slice_y = ps_tile->u1_pos_y - ps_slice_hdr->i2_ctb_y;
+ if(!(((ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd) % ps_sps->i2_pic_wd_in_ctb) == ps_tile->u1_pos_x)) //Case 2
+ {
+ if(ps_slice_hdr->i2_ctb_y <= ps_tile->u1_pos_y)
+ {
+ //Check if ctb x is before or after
+ if(ps_slice_hdr->i2_ctb_x > ps_tile->u1_pos_x)
+ {
+ ps_codec->s_parse.i4_ctb_slice_y -= 1;
+ }
+ }
+ }
+ }
+
+ if(!ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm,
+ slice_qp,
+ cabac_init_idc,
+ &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+
+ }
+ }
+ /* If number of cores is greater than 1, then add job to the queue */
+ //TODO: Dual core implementation might need a different algo for better load balancing
+ /* At the start of ctb row parsing in a tile, queue a job for processing the current tile row */
+
+ if(0 == ps_codec->s_parse.i4_ctb_tile_x)
+ {
+
+#ifndef GPU_BUILD
+ if(1 < ps_codec->i4_num_cores)
+ {
+ proc_job_t s_job;
+ IHEVCD_ERROR_T ret;
+ s_job.i4_cmd = CMD_PROCESS;
+ s_job.i2_ctb_cnt = (WORD16)ps_tile->u2_wd;
+ s_job.i2_ctb_x = (WORD16)ps_codec->s_parse.i4_ctb_x;
+ s_job.i2_ctb_y = (WORD16)ps_codec->s_parse.i4_ctb_y;
+ s_job.i2_slice_idx = (WORD16)ps_codec->s_parse.i4_cur_slice_idx;
+ s_job.i4_tu_coeff_data_ofst = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data -
+ (UWORD8 *)ps_codec->s_parse.pv_pic_tu_coeff_data;
+ ret = ihevcd_jobq_queue((jobq_t *)ps_codec->s_parse.pv_proc_jobq, &s_job, sizeof(proc_job_t), 1);
+
+ if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ return ret;
+ }
+ else
+#endif
+ {
+#ifdef GPU_BUILD
+ process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+#else
+ process_ctxt_t *ps_proc = &ps_codec->as_process[0];
+#endif
+ WORD32 tu_coeff_data_ofst = (UWORD8 *)ps_codec->s_parse.pv_tu_coeff_data -
+ (UWORD8 *)ps_codec->s_parse.pv_pic_tu_coeff_data;
+
+ /* If the codec is running in single core mode,
+ * initialize zeroth process context
+ * TODO: Dual core mode might need a different implementation instead of jobq
+ */
+
+ ps_proc->i4_ctb_cnt = ps_tile->u2_wd;
+ ps_proc->i4_ctb_x = ps_codec->s_parse.i4_ctb_x;
+ ps_proc->i4_ctb_y = ps_codec->s_parse.i4_ctb_y;
+ ps_proc->i4_cur_slice_idx = ps_codec->s_parse.i4_cur_slice_idx;
+
+#ifdef GPU_BUILD
+ ps_proc->ps_slice_hdr = ps_slice_hdr;
+ ps_gpu->ai4_tu_coeff_data_ofst[ps_codec->s_parse.i4_ctb_tile_y] = tu_coeff_data_ofst;
+
+ ps_gpu->ai4_cur_slice_idx[ps_codec->s_parse.i4_ctb_tile_y] = ps_codec->s_parse.i4_cur_slice_idx;
+#endif
+ ihevcd_init_proc_ctxt(ps_proc, tu_coeff_data_ofst);
+ }
+ }
+
+
+ /* Restore cabac context model from top right CTB if entropy sync is enabled */
+ if(ps_pps->i1_entropy_coding_sync_enabled_flag)
+ {
+ /*TODO: Handle single CTB and the top-right CTB belonging to a different slice */
+ if(0 == ps_codec->s_parse.i4_ctb_x)
+ {
+ //WORD32 size = sizeof(ps_codec->s_parse.s_cabac.au1_ctxt_models);
+ WORD32 default_ctxt = 0;
+
+ if((0 == ps_codec->s_parse.i4_ctb_slice_y) && (!ps_slice_hdr->i1_dependent_slice_flag))
+ default_ctxt = 1;
+ if(1 == ps_sps->i2_pic_wd_in_ctb)
+ default_ctxt = 1;
+
+ ps_codec->s_parse.u4_qp = slice_qp;
+ if(default_ctxt)
+ {
+ //memcpy(&ps_codec->s_parse.s_cabac.au1_ctxt_models, &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0], size);
+ ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm,
+ slice_qp,
+ cabac_init_idc,
+ &gau1_ihevc_cab_ctxts[cabac_init_idc][slice_qp][0]);
+
+ }
+ else
+ {
+ //memcpy(&ps_codec->s_parse.s_cabac.au1_ctxt_models, &ps_codec->s_parse.s_cabac.au1_ctxt_models_sync, size);
+ ihevcd_cabac_init(&ps_codec->s_parse.s_cabac,
+ &ps_codec->s_parse.s_bitstrm,
+ slice_qp,
+ cabac_init_idc,
+ (const UWORD8 *)&ps_codec->s_parse.s_cabac.au1_ctxt_models_sync);
+
+ }
+ }
+ }
+
+
+
+ if(0 == ps_codec->i4_slice_error)
+ {
+ if(ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag)
+ ihevcd_parse_sao(ps_codec);
+ }
+ else
+ {
+ sao_t *ps_sao = ps_codec->s_parse.ps_pic_sao +
+ ps_codec->s_parse.i4_ctb_x +
+ ps_codec->s_parse.i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+
+ /* Default values */
+ ps_sao->b3_y_type_idx = 0;
+ ps_sao->b3_cb_type_idx = 0;
+ ps_sao->b3_cr_type_idx = 0;
+ }
+
+ //AEV_TRACE("CTB x", ps_codec->s_parse.i4_ctb_x, 0);
+ //AEV_TRACE("CTB y", ps_codec->s_parse.i4_ctb_y, 0);
+
+ {
+ WORD32 ctb_indx;
+ ctb_indx = ps_codec->s_parse.i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_codec->s_parse.i4_ctb_y;
+ ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb[ctb_indx >> 3] |= (1 << (ctb_indx & 7));
+ {
+ UWORD16 *pu1_slice_idx = ps_codec->s_parse.pu1_slice_idx;
+ pu1_slice_idx[ctb_indx] = ps_codec->s_parse.i4_cur_independent_slice_idx;
+ }
+ }
+
+ if(0 == ps_codec->i4_slice_error)
+ {
+ ihevcd_parse_coding_quadtree(ps_codec,
+ (ps_codec->s_parse.i4_ctb_x << ps_sps->i1_log2_ctb_size),
+ (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size),
+ ps_sps->i1_log2_ctb_size,
+ 0);
+ }
+ else
+ {
+ tu_t *ps_tu = ps_codec->s_parse.ps_tu;
+ pu_t *ps_pu = ps_codec->s_parse.ps_pu;
+
+ ps_tu->b1_cb_cbf = 0;
+ ps_tu->b1_cr_cbf = 0;
+ ps_tu->b1_y_cbf = 0;
+ ps_tu->b4_pos_x = 0;
+ ps_tu->b4_pos_y = 0;
+ ps_tu->b1_transquant_bypass = 0;
+ ps_tu->b3_size = (ps_sps->i1_log2_ctb_size - 2);
+ ps_tu->b7_qp = ps_codec->s_parse.u4_qp;
+ ps_tu->b3_chroma_intra_mode_idx = INTRA_PRED_CHROMA_IDX_NONE;
+ ps_tu->b6_luma_intra_mode = INTRA_PRED_NONE;
+ ps_tu->b1_first_tu_in_cu = 1;
+
+ ps_codec->s_parse.ps_tu++;
+ ps_codec->s_parse.s_cu.i4_tu_cnt++;
+ ps_codec->s_parse.i4_pic_tu_idx++;
+
+ ps_codec->s_parse.s_cu.i4_pred_mode = PRED_MODE_SKIP;
+ ps_codec->s_parse.s_cu.i4_part_mode = PART_2Nx2N;
+
+ ps_pu->b2_part_idx = 0;
+ ps_pu->b4_pos_x = 0;
+ ps_pu->b4_pos_y = 0;
+ ps_pu->b4_wd = (ctb_size >> 2) - 1;
+ ps_pu->b4_ht = (ctb_size >> 2) - 1;
+ ps_pu->b1_intra_flag = 0;
+ ps_pu->b3_part_mode = ps_codec->s_parse.s_cu.i4_part_mode;
+ ps_pu->b1_merge_flag = 1;
+ ps_pu->b3_merge_idx = 0;
+
+ ps_codec->s_parse.ps_pu++;
+ ps_codec->s_parse.i4_pic_pu_idx++;
+
+ }
+
+ if(0 == ps_codec->i4_slice_error)
+ end_of_slice_flag = ihevcd_cabac_decode_terminate(&ps_codec->s_parse.s_cabac, &ps_codec->s_parse.s_bitstrm);
+ else
+ end_of_slice_flag = 0;
+
+ AEV_TRACE("end_of_slice_flag", end_of_slice_flag, ps_codec->s_parse.s_cabac.u4_range);
+
+
+ /* In case of tiles or entropy sync, back up the cabac context at the end of the top-right CTB and terminate cabac at tile / CTB-row ends */
+ if(ps_pps->i1_tiles_enabled_flag || ps_pps->i1_entropy_coding_sync_enabled_flag)
+ {
+ WORD32 end_of_tile = 0;
+ WORD32 end_of_tile_row = 0;
+
+ /* Take a back up of cabac context models if entropy sync is enabled */
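+ /* The backup is taken once i4_ctb_x == 1, i.e. just after the second CTB
+ * of the row is parsed; with entropy sync enabled this saved state is
+ * restored at the start of the next CTB row */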
+ if(ps_pps->i1_entropy_coding_sync_enabled_flag || ps_pps->i1_tiles_enabled_flag)
+ {
+ if(1 == ps_codec->s_parse.i4_ctb_x)
+ {
+ WORD32 size = sizeof(ps_codec->s_parse.s_cabac.au1_ctxt_models);
+ memcpy(&ps_codec->s_parse.s_cabac.au1_ctxt_models_sync, &ps_codec->s_parse.s_cabac.au1_ctxt_models, size);
+ }
+ }
+
+ /* Since tiles and entropy sync are not enabled simultaneously, the following will not result in any problems */
+ if((ps_codec->s_parse.i4_ctb_tile_x + 1) == (ps_tile->u2_wd))
+ {
+ end_of_tile_row = 1;
+ if((ps_codec->s_parse.i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+ end_of_tile = 1;
+ }
+ if((0 == end_of_slice_flag) &&
+ ((ps_pps->i1_tiles_enabled_flag && end_of_tile) ||
+ (ps_pps->i1_entropy_coding_sync_enabled_flag && end_of_tile_row)))
+ {
+ WORD32 end_of_sub_stream_one_bit;
+ end_of_sub_stream_one_bit = ihevcd_cabac_decode_terminate(&ps_codec->s_parse.s_cabac, &ps_codec->s_parse.s_bitstrm);
+ AEV_TRACE("end_of_sub_stream_one_bit", end_of_sub_stream_one_bit, ps_codec->s_parse.s_cabac.u4_range);
+
+ /* TODO: Remove the check for offset when HM is updated to include a byte unconditionally even for aligned location */
+ /* For Ittiam streams this check should not be present; for HM9.1 streams it should be */
+ if(ps_codec->s_parse.s_bitstrm.u4_bit_ofst % 8)
+ ihevcd_bits_flush_to_byte_boundary(&ps_codec->s_parse.s_bitstrm);
+
+ UNUSED(end_of_sub_stream_one_bit);
+ }
+ }
+ {
+ WORD32 ctb_indx;
+
+ ctb_addr = ps_codec->s_parse.i4_ctb_y * num_ctb_in_row + ps_codec->s_parse.i4_ctb_x;
+
+ ctb_indx = ++ctb_addr;
+
+ /* Store pu_idx for next CTB in frame level pu_idx array */
+
+ //In case of multiple tiles, if end-of-tile row is reached
+ if((ps_tile->u2_wd == (ps_codec->s_parse.i4_ctb_tile_x + 1)) && (ps_tile->u2_wd != ps_sps->i2_pic_wd_in_ctb))
+ {
+ ctb_indx = (ps_sps->i2_pic_wd_in_ctb * (ps_codec->s_parse.i4_ctb_tile_y + 1 + ps_tile->u1_pos_y)) + ps_tile->u1_pos_x; //idx is the beginning of next row in current tile.
+ if(ps_tile->u2_ht == (ps_codec->s_parse.i4_ctb_tile_y + 1))
+ {
+ //If the current ctb is the last tile's last ctb
+ if((ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb) && ((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb)))
+ {
+ ctb_indx = ctb_addr; //Next continuous ctb address
+ }
+ else //Not the last tile's end, but a tile end
+ {
+ tile_t *ps_next_tile = ps_codec->s_parse.ps_tile + 1;
+ ctb_indx = ps_next_tile->u1_pos_x + (ps_next_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb); //idx is the beginning of first row in next tile.
+ }
+ }
+ }
+
+ ps_codec->s_parse.pu4_pic_pu_idx[ctb_indx] = ps_codec->s_parse.i4_pic_pu_idx;
+ ps_codec->s_parse.i4_next_pu_ctb_cnt = ctb_indx;
+
+ ps_codec->s_parse.pu1_pu_map += num_min4x4_in_ctb;
+
+ /* Store tu_idx for next CTB in frame level tu_idx array */
+ if(1 == ps_codec->i4_num_cores)
+ {
+ ctb_indx = (0 == ctb_addr % RESET_TU_BUF_NCTB) ?
+ RESET_TU_BUF_NCTB : ctb_addr % RESET_TU_BUF_NCTB;
+
+ //In case of multiple tiles, if end-of-tile row is reached
+ if((ps_tile->u2_wd == (ps_codec->s_parse.i4_ctb_tile_x + 1)) && (ps_tile->u2_wd != ps_sps->i2_pic_wd_in_ctb))
+ {
+ ctb_indx = (ps_sps->i2_pic_wd_in_ctb * (ps_codec->s_parse.i4_ctb_tile_y + 1 + ps_tile->u1_pos_y)) + ps_tile->u1_pos_x; //idx is the beginning of next row in current tile.
+ if(ps_tile->u2_ht == (ps_codec->s_parse.i4_ctb_tile_y + 1))
+ {
+ //If the current ctb is the last tile's last ctb
+ if((ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb) && ((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb)))
+ {
+ ctb_indx = (0 == ctb_addr % RESET_TU_BUF_NCTB) ?
+ RESET_TU_BUF_NCTB : ctb_addr % RESET_TU_BUF_NCTB;
+ }
+ else //Not the last tile's end, but a tile end
+ {
+ tile_t *ps_next_tile = ps_codec->s_parse.ps_tile + 1;
+ ctb_indx = ps_next_tile->u1_pos_x + (ps_next_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb); //idx is the beginning of first row in next tile.
+ }
+ }
+ }
+ ps_codec->s_parse.i4_next_tu_ctb_cnt = ctb_indx;
+ ps_codec->s_parse.pu4_pic_tu_idx[ctb_indx] = ps_codec->s_parse.i4_pic_tu_idx;
+ }
+ else
+ {
+ ctb_indx = ctb_addr;
+ if((ps_tile->u2_wd == (ps_codec->s_parse.i4_ctb_tile_x + 1)) && (ps_tile->u2_wd != ps_sps->i2_pic_wd_in_ctb))
+ {
+ ctb_indx = (ps_sps->i2_pic_wd_in_ctb * (ps_codec->s_parse.i4_ctb_tile_y + 1 + ps_tile->u1_pos_y)) + ps_tile->u1_pos_x; //idx is the beginning of next row in current tile.
+ if(ps_tile->u2_ht == (ps_codec->s_parse.i4_ctb_tile_y + 1))
+ {
+ //If the current ctb is the last tile's last ctb
+ if((ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb) && ((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb)))
+ {
+ ctb_indx = ctb_addr;
+ }
+ else //Not the last tile's end, but a tile end
+ {
+ tile_t *ps_next_tile = ps_codec->s_parse.ps_tile + 1;
+ ctb_indx = ps_next_tile->u1_pos_x + (ps_next_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb); //idx is the beginning of first row in next tile.
+ }
+ }
+ }
+ ps_codec->s_parse.i4_next_tu_ctb_cnt = ctb_indx;
+ ps_codec->s_parse.pu4_pic_tu_idx[ctb_indx] = ps_codec->s_parse.i4_pic_tu_idx;
+ }
+ ps_codec->s_parse.pu1_tu_map += num_min4x4_in_ctb;
+ }
+
+
+ if(ps_codec->i4_num_cores <= MV_PRED_NUM_CORES_THRESHOLD)
+ {
+ /*************************************************/
+ /**************** MV pred **********************/
+ /*************************************************/
+ WORD8 u1_top_ctb_avail = 1;
+ WORD8 u1_left_ctb_avail = 1;
+ WORD8 u1_top_lt_ctb_avail = 1;
+ WORD8 u1_top_rt_ctb_avail = 1;
+ WORD16 i2_wd_in_ctb;
+
+ tile_start_ctb_idx = ps_tile->u1_pos_x
+ + ps_tile->u1_pos_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x
+ + ps_slice_hdr->i2_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ if((slice_start_ctb_idx < tile_start_ctb_idx))
+ {
+ //Slices span across multiple tiles.
+ i2_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
+ }
+ else
+ {
+ i2_wd_in_ctb = ps_tile->u2_wd;
+ }
+ /* slice and tile boundaries */
+ if((0 == ps_codec->s_parse.i4_ctb_y) || (0 == ps_codec->s_parse.i4_ctb_tile_y))
+ {
+ u1_top_ctb_avail = 0;
+ u1_top_lt_ctb_avail = 0;
+ u1_top_rt_ctb_avail = 0;
+ }
+
+ if((0 == ps_codec->s_parse.i4_ctb_x) || (0 == ps_codec->s_parse.i4_ctb_tile_x))
+ {
+ u1_left_ctb_avail = 0;
+ u1_top_lt_ctb_avail = 0;
+ if((0 == ps_codec->s_parse.i4_ctb_slice_y) || (0 == ps_codec->s_parse.i4_ctb_tile_y))
+ {
+ u1_top_ctb_avail = 0;
+ if((i2_wd_in_ctb - 1) != ps_codec->s_parse.i4_ctb_slice_x) //TODO: For tile, not implemented
+ {
+ u1_top_rt_ctb_avail = 0;
+ }
+ }
+ }
+ /*For slices not beginning at the start of a CTB row*/
+ else if(ps_codec->s_parse.i4_ctb_x > 0)
+ {
+ if((0 == ps_codec->s_parse.i4_ctb_slice_y) || (0 == ps_codec->s_parse.i4_ctb_tile_y))
+ {
+ u1_top_ctb_avail = 0;
+ u1_top_lt_ctb_avail = 0;
+ if(0 == ps_codec->s_parse.i4_ctb_slice_x)
+ {
+ u1_left_ctb_avail = 0;
+ }
+ if((i2_wd_in_ctb - 1) != ps_codec->s_parse.i4_ctb_slice_x)
+ {
+ u1_top_rt_ctb_avail = 0;
+ }
+ }
+ else if((1 == ps_codec->s_parse.i4_ctb_slice_y) && (0 == ps_codec->s_parse.i4_ctb_slice_x))
+ {
+ u1_top_lt_ctb_avail = 0;
+ }
+ }
+
+ if(((ps_sps->i2_pic_wd_in_ctb - 1) == ps_codec->s_parse.i4_ctb_x) || ((ps_tile->u2_wd - 1) == ps_codec->s_parse.i4_ctb_tile_x))
+ {
+ u1_top_rt_ctb_avail = 0;
+ }
+
+ if(PSLICE == ps_slice_hdr->i1_slice_type
+ || BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ mv_ctxt_t s_mv_ctxt;
+ process_ctxt_t *ps_proc;
+ UWORD32 *pu4_ctb_top_pu_idx;
+ UWORD32 *pu4_ctb_left_pu_idx;
+ UWORD32 *pu4_ctb_top_left_pu_idx;
+ WORD32 i4_ctb_pu_cnt;
+ WORD32 cur_ctb_idx;
+ WORD32 next_ctb_idx;
+ WORD32 cur_pu_idx;
+ ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+ cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+ + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ next_ctb_idx = ps_codec->s_parse.i4_next_pu_ctb_cnt;
+ i4_ctb_pu_cnt = ps_codec->s_parse.pu4_pic_pu_idx[next_ctb_idx]
+ - ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+
+ cur_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+
+ pu4_ctb_top_pu_idx = ps_proc->pu4_pic_pu_idx_top
+ + (ps_codec->s_parse.i4_ctb_x * ctb_size / MIN_PU_SIZE);
+ pu4_ctb_left_pu_idx = ps_proc->pu4_pic_pu_idx_left;
+ pu4_ctb_top_left_pu_idx = &ps_proc->u4_ctb_top_left_pu_idx;
+
+ /* Initializing s_mv_ctxt */
+ {
+ s_mv_ctxt.ps_pps = ps_pps;
+ s_mv_ctxt.ps_sps = ps_sps;
+ s_mv_ctxt.ps_slice_hdr = ps_slice_hdr;
+ s_mv_ctxt.i4_ctb_x = ps_codec->s_parse.i4_ctb_x;
+ s_mv_ctxt.i4_ctb_y = ps_codec->s_parse.i4_ctb_y;
+ s_mv_ctxt.ps_pu = &ps_codec->s_parse.ps_pic_pu[cur_pu_idx];
+ s_mv_ctxt.ps_pic_pu = ps_codec->s_parse.ps_pic_pu;
+ s_mv_ctxt.ps_tile = ps_tile;
+ s_mv_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+ s_mv_ctxt.pu4_pic_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx;
+ s_mv_ctxt.pu1_pic_pu_map = ps_codec->s_parse.pu1_pic_pu_map;
+ s_mv_ctxt.i4_ctb_pu_cnt = i4_ctb_pu_cnt;
+ s_mv_ctxt.i4_ctb_start_pu_idx = cur_pu_idx;
+ s_mv_ctxt.u1_top_ctb_avail = u1_top_ctb_avail;
+ s_mv_ctxt.u1_top_rt_ctb_avail = u1_top_rt_ctb_avail;
+ s_mv_ctxt.u1_top_lt_ctb_avail = u1_top_lt_ctb_avail;
+ s_mv_ctxt.u1_left_ctb_avail = u1_left_ctb_avail;
+ }
+
+ ihevcd_get_mv_ctb(&s_mv_ctxt, pu4_ctb_top_pu_idx,
+ pu4_ctb_left_pu_idx, pu4_ctb_top_left_pu_idx);
+
+ }
+ else
+ {
+ WORD32 num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+ UWORD8 *pu1_pic_pu_map_ctb = ps_codec->s_parse.pu1_pic_pu_map +
+ (ps_codec->s_parse.i4_ctb_x + ps_codec->s_parse.i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
+ process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+ WORD32 row, col;
+ WORD32 pu_cnt;
+ WORD32 num_pu_per_ctb;
+ WORD32 cur_ctb_idx;
+ WORD32 next_ctb_idx;
+ WORD32 ctb_start_pu_idx;
+ UWORD32 *pu4_nbr_pu_idx = ps_proc->pu4_pic_pu_idx_map;
+ WORD32 nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
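+ /* Assuming MAX_CTB_SIZE = 64 and MIN_PU_SIZE = 4, the stride is
+ * 64 / 4 + 2 = 18: the 16x16 grid of minimum PUs plus one column of
+ * neighbour context on either side */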
+ pu_t *ps_pu;
+
+ for(row = 0; row < ctb_size / MIN_PU_SIZE; row++)
+ {
+ for(col = 0; col < ctb_size / MIN_PU_SIZE; col++)
+ {
+ pu1_pic_pu_map_ctb[row * ctb_size / MIN_PU_SIZE + col] = 0;
+ }
+ }
+
+
+ /* Neighbor PU idx update inside CTB */
+ /* 1byte per 4x4. Indicates the PU idx that 4x4 block belongs to */
+
+ cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+ + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ next_ctb_idx = ps_codec->s_parse.i4_next_pu_ctb_cnt;
+ num_pu_per_ctb = ps_codec->s_parse.pu4_pic_pu_idx[next_ctb_idx]
+ - ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+ ctb_start_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+ ps_pu = &ps_codec->s_parse.ps_pic_pu[ctb_start_pu_idx];
+
+ for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
+ {
+ UWORD32 cur_pu_idx;
+ WORD32 pu_ht = (ps_pu->b4_ht + 1) << 2;
+ WORD32 pu_wd = (ps_pu->b4_wd + 1) << 2;
+
+ cur_pu_idx = ctb_start_pu_idx + pu_cnt;
+
+ for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
+ for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
+ pu4_nbr_pu_idx[(1 + ps_pu->b4_pos_x + col)
+ + (1 + ps_pu->b4_pos_y + row)
+ * nbr_pu_idx_strd] =
+ cur_pu_idx;
+ }
+
+ /* Updating Top and Left pointers */
+ {
+ WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+ - (ps_codec->s_parse.i4_ctb_y << ps_sps->i1_log2_ctb_size);
+ WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
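+ /* e.g. for a 1080-sample-high picture with 64x64 CTBs, the last CTB
+ * row has rows_remaining = 1080 - 1024 = 56, so only 56 / MIN_PU_SIZE
+ * rows of PU indices are valid below */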
+
+ /* Top Left */
+ /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
+ ps_proc->u4_ctb_top_left_pu_idx = ps_proc->pu4_pic_pu_idx_top[(ps_codec->s_parse.i4_ctb_x * ctb_size / MIN_PU_SIZE) + ctb_size / MIN_PU_SIZE - 1];
+ for(i = 0; i < ctb_size / MIN_PU_SIZE; i++)
+ {
+ /* Left */
+ /* Last column of au4_nbr_pu_idx */
+ ps_proc->pu4_pic_pu_idx_left[i] = pu4_nbr_pu_idx[(ctb_size / MIN_PU_SIZE)
+ + (i + 1) * nbr_pu_idx_strd];
+ /* Top */
+ /* Last row of au4_nbr_pu_idx */
+ ps_proc->pu4_pic_pu_idx_top[(ps_codec->s_parse.i4_ctb_x * ctb_size / MIN_PU_SIZE) + i] =
+ pu4_nbr_pu_idx[(ctb_size_left / MIN_PU_SIZE) * nbr_pu_idx_strd + i + 1];
+
+ }
+ }
+ }
+
+ /*************************************************/
+ /****************** BS, QP *********************/
+ /*************************************************/
+ /* Check if deblock is disabled for the current slice or if it is disabled for the current picture
+ * because of disable deblock api
+ */
+ if(0 == ps_codec->i4_disable_deblk_pic)
+ {
+ if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
+ (0 == ps_codec->i4_slice_error))
+ {
+ WORD32 i4_ctb_tu_cnt;
+ WORD32 cur_ctb_idx, next_ctb_idx;
+ WORD32 cur_pu_idx;
+ WORD32 cur_tu_idx;
+ process_ctxt_t *ps_proc;
+
+ ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+ cur_ctb_idx = ps_codec->s_parse.i4_ctb_x
+ + ps_codec->s_parse.i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ cur_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx[cur_ctb_idx];
+ next_ctb_idx = ps_codec->s_parse.i4_next_tu_ctb_cnt;
+ if(1 == ps_codec->i4_num_cores)
+ {
+ i4_ctb_tu_cnt = ps_codec->s_parse.pu4_pic_tu_idx[next_ctb_idx] -
+ ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+
+ cur_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+ }
+ else
+ {
+ i4_ctb_tu_cnt = ps_codec->s_parse.pu4_pic_tu_idx[next_ctb_idx] -
+ ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx];
+
+ cur_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx[cur_ctb_idx];
+ }
+
+ ps_codec->s_parse.s_bs_ctxt.ps_pps = ps_codec->s_parse.ps_pps;
+ ps_codec->s_parse.s_bs_ctxt.ps_sps = ps_codec->s_parse.ps_sps;
+ ps_codec->s_parse.s_bs_ctxt.ps_codec = ps_codec;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_tu_cnt = i4_ctb_tu_cnt;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_x = ps_codec->s_parse.i4_ctb_x;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_y = ps_codec->s_parse.i4_ctb_y;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_tile_x = ps_codec->s_parse.i4_ctb_tile_x;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_tile_y = ps_codec->s_parse.i4_ctb_tile_y;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_slice_x = ps_codec->s_parse.i4_ctb_slice_x;
+ ps_codec->s_parse.s_bs_ctxt.i4_ctb_slice_y = ps_codec->s_parse.i4_ctb_slice_y;
+ ps_codec->s_parse.s_bs_ctxt.ps_tu = &ps_codec->s_parse.ps_pic_tu[cur_tu_idx];
+ ps_codec->s_parse.s_bs_ctxt.ps_pu = &ps_codec->s_parse.ps_pic_pu[cur_pu_idx];
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+ ps_codec->s_parse.s_bs_ctxt.i4_next_pu_ctb_cnt = ps_codec->s_parse.i4_next_pu_ctb_cnt;
+ ps_codec->s_parse.s_bs_ctxt.i4_next_tu_ctb_cnt = ps_codec->s_parse.i4_next_tu_ctb_cnt;
+ ps_codec->s_parse.s_bs_ctxt.pu1_slice_idx = ps_codec->s_parse.pu1_slice_idx;
+ ps_codec->s_parse.s_bs_ctxt.ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+ ps_codec->s_parse.s_bs_ctxt.ps_tile = ps_codec->s_parse.ps_tile;
+
+ if(ISLICE == ps_slice_hdr->i1_slice_type)
+ {
+ ihevcd_ctb_boundary_strength_islice(&ps_codec->s_parse.s_bs_ctxt);
+ }
+ else
+ {
+ ihevcd_ctb_boundary_strength_pbslice(&ps_codec->s_parse.s_bs_ctxt);
+ }
+ }
+ else
+ {
+ WORD32 vert_bs_strd = ps_sps->i2_pic_wd_in_ctb * (ctb_size * ctb_size / 8 / 16);
+ WORD32 horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
+ UWORD32 *pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs +
+ ps_codec->s_parse.i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+ ps_codec->s_parse.i4_ctb_y * vert_bs_strd);
+ UWORD32 *pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs +
+ ps_codec->s_parse.i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+ ps_codec->s_parse.i4_ctb_y * horz_bs_strd);
+
+ memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
+ memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+
+ }
+ }
+
+ }
+
+
+ /* Update the parse status map */
+ {
+ sps_t *ps_sps = ps_codec->s_parse.ps_sps;
+ UWORD8 *pu1_buf;
+ WORD32 idx;
+ idx = (ps_codec->s_parse.i4_ctb_x);
+ idx += ((ps_codec->s_parse.i4_ctb_y) * ps_sps->i2_pic_wd_in_ctb);
+ pu1_buf = (ps_codec->pu1_parse_map + idx);
+ *pu1_buf = 1;
+ }
+
+ /* Increment CTB x and y positions */
+ ps_codec->s_parse.i4_ctb_tile_x++;
+ ps_codec->s_parse.i4_ctb_x++;
+ ps_codec->s_parse.i4_ctb_slice_x++;
+
+ /*If tiles are enabled, handle the slice counters differently*/
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ //Indicates multiple tiles in a slice case
+ tile_start_ctb_idx = ps_tile->u1_pos_x
+ + ps_tile->u1_pos_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x
+ + ps_slice_hdr->i2_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ if((slice_start_ctb_idx < tile_start_ctb_idx))
+ {
+ if(ps_codec->s_parse.i4_ctb_slice_x == (ps_tile->u1_pos_x + ps_tile->u2_wd))
+ {
+ /* Reached end of slice row within a tile / frame */
+ ps_codec->s_parse.i4_ctb_slice_y++;
+ ps_codec->s_parse.i4_ctb_slice_x = ps_tile->u1_pos_x; //TODO: Check
+ }
+ }
+ //Indicates multiple slices in a tile case - hence, reset slice_x
+ else if(ps_codec->s_parse.i4_ctb_slice_x == (ps_tile->u2_wd))
+ {
+ ps_codec->s_parse.i4_ctb_slice_y++;
+ ps_codec->s_parse.i4_ctb_slice_x = 0;
+ }
+ }
+ else
+ {
+ if(ps_codec->s_parse.i4_ctb_slice_x == ps_tile->u2_wd)
+ {
+ /* Reached end of slice row within a tile /frame */
+ ps_codec->s_parse.i4_ctb_slice_y++;
+ ps_codec->s_parse.i4_ctb_slice_x = 0;
+ }
+ }
+
+
+ if(ps_codec->s_parse.i4_ctb_tile_x == (ps_tile->u2_wd))
+ {
+ /* Reached end of tile row */
+ ps_codec->s_parse.i4_ctb_tile_x = 0;
+ ps_codec->s_parse.i4_ctb_x = ps_tile->u1_pos_x;
+
+ ps_codec->s_parse.i4_ctb_tile_y++;
+ ps_codec->s_parse.i4_ctb_y++;
+
+ if(ps_codec->s_parse.i4_ctb_tile_y == (ps_tile->u2_ht))
+ {
+ /* Reached End of Tile */
+ ps_codec->s_parse.i4_ctb_tile_y = 0;
+ ps_codec->s_parse.i4_ctb_tile_x = 0;
+ ps_codec->s_parse.ps_tile++;
+
+ if((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb) && (ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb))
+ {
+ /* Reached end of frame */
+ end_of_pic = 1;
+ ps_codec->s_parse.i4_ctb_x = 0;
+ ps_codec->s_parse.i4_ctb_y = ps_sps->i2_pic_ht_in_ctb;
+ }
+ else
+ {
+ /* Initialize ctb_x and ctb_y to start of next tile */
+ ps_tile = ps_codec->s_parse.ps_tile;
+ ps_codec->s_parse.i4_ctb_x = ps_tile->u1_pos_x;
+ ps_codec->s_parse.i4_ctb_y = ps_tile->u1_pos_y;
+ ps_codec->s_parse.i4_ctb_tile_y = 0;
+ ps_codec->s_parse.i4_ctb_tile_x = 0;
+ ps_codec->s_parse.i4_ctb_slice_x = ps_tile->u1_pos_x;
+ ps_codec->s_parse.i4_ctb_slice_y = ps_tile->u1_pos_y;
+
+ }
+ }
+
+ }
+
+ ps_codec->s_parse.i4_next_ctb_indx = ps_codec->s_parse.i4_ctb_x +
+ ps_codec->s_parse.i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+
+ /* If the current slice is in error, check if the next slice's address
+ * is reached and mark the end_of_slice flag */
+ if(ps_codec->i4_slice_error)
+ {
+ slice_header_t *ps_slice_hdr_next = ps_slice_hdr + 1;
+ WORD32 next_slice_addr = ps_slice_hdr_next->i2_ctb_x +
+ ps_slice_hdr_next->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+
+ if(ps_codec->s_parse.i4_next_ctb_indx == next_slice_addr)
+ end_of_slice_flag = 1;
+ }
+
+#ifndef GPU_BUILD
+ /* If the codec is running in single core mode
+ * then call process function for current CTB
+ */
+ if((1 == ps_codec->i4_num_cores) && (ps_codec->s_parse.i4_ctb_tile_x == 0))
+ {
+ process_ctxt_t *ps_proc = &ps_codec->as_process[0];
+// ps_proc->i4_ctb_cnt = ihevcd_nctb_cnt(ps_codec, ps_sps);
+ ps_proc->i4_ctb_cnt = ps_proc->ps_tile->u2_wd;
+ ihevcd_process(ps_proc);
+ }
+#else
+ /* Now call the function that will populate MC data for the
+ * current ctb.
+ */
+ if(ps_codec->u4_gpu_enabled)
+ {
+ process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+ WORD32 nctb_mc = 1;
+ WORD32 cur_ctb_idx;
+ WORD32 cur_pu_idx;
+ //ps_proc->i4_ctb_cnt = ihevcd_nctb_cnt(ps_codec, ps_sps);
+ //ihevcd_process(ps_proc);
+ cur_ctb_idx = ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ cur_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+ ps_proc->ps_pu = &ps_proc->ps_pic_pu[cur_pu_idx];
+ ps_proc->ps_slice_hdr = ps_slice_hdr;
+
+ if(ISLICE != ps_slice_hdr->i1_slice_type)
+ ihevcd_gpu_mc_populate_data_nctb(ps_proc, nctb_mc);
+
+ ps_proc->i4_ctb_x += nctb_mc;
+ ps_proc->i4_ctb_cnt -= nctb_mc;
+ ps_proc->i4_ctb_tile_x += nctb_mc;
+ }
+
+ total_ctb_cnt++;
+ ps_gpu->i4_curr_grain_ctb_cnt++;
+ if(1)
+ {
+
+ if(ps_gpu->i4_curr_grain_ctb_cnt == ps_gpu->ai4_ctbs_in_grain[ps_gpu->i4_curr_grain_idx])
+ {
+ process_ctxt_t *ps_proc = &ps_codec->as_process[(ps_codec->i4_num_cores == 1) ? 1 : (ps_codec->i4_num_cores - 1)];
+
+ if(ps_codec->u4_gpu_enabled)
+ ihevcd_gpu_mc_execute(ps_proc);
+
+
+#if 1
+ if(1 < ps_codec->i4_num_cores)
+ {
+ IHEVCD_ERROR_T ret;
+ WORD32 i, cnt = ps_gpu->ai4_grain_ht_in_ctb[ps_gpu->i4_curr_grain_idx];
+ WORD32 ctb_y_idx = 0;
+
+ for(i = 0; i < ps_gpu->i4_curr_grain_idx; i++)
+ ctb_y_idx += ps_gpu->ai4_grain_ht_in_ctb[i];
+
+ if(ps_gpu->i4_curr_grain_idx == 0)
+ cnt--;
+ else if(ps_gpu->i4_curr_grain_idx == (GRANULARITY - 1))
+ cnt++;
+
+ if(ps_gpu->i4_curr_grain_idx != 0)
+ ctb_y_idx--;
+
+ for(i = 0; i < cnt; i++)
+ {
+ s_job.i4_cmd = CMD_PROCESS;
+ s_job.i2_ctb_cnt = (WORD16)ps_sps->i2_pic_wd_in_ctb;
+ s_job.i2_ctb_x = 0; //(WORD16)ps_codec->s_parse.i4_ctb_tile_x;
+ s_job.i2_ctb_y = (WORD16)ctb_y_idx;
+ s_job.i2_slice_idx = (WORD16)ps_codec->s_parse.i4_cur_slice_idx;
+ s_job.i4_tu_coeff_data_ofst = ps_gpu->ai4_tu_coeff_data_ofst[ctb_y_idx];
+ s_job.i2_granularity_idx = ps_gpu->i4_curr_grain_idx;
+ s_job.i2_slice_idx = (WORD16)ps_gpu->ai4_cur_slice_idx[ctb_y_idx];
+
+ printf("Queued ctb y row %d\n", ctb_y_idx);
+
+ if((i == 0) && (ps_codec->u4_gpu_enabled))
+ s_job.i2_wait = 1;
+ else
+ s_job.i2_wait = 0;
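+ /* Only the first row job of a grain is marked to wait for the GPU MC
+ * output of that grain; the remaining rows are processed without waiting */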
+
+ ret = ihevcd_jobq_queue(ps_codec->s_parse.pv_proc_jobq, &s_job, sizeof(proc_job_t), 1);
+ ASSERT(ret == IHEVC_SUCCESS);
+ ctb_y_idx++;
+
+ }
+ }
+ ps_gpu->i4_curr_grain_ctb_cnt = 0;
+ ps_gpu->i4_curr_grain_idx++;
+
+ }
+#endif
+
+ /* If the bytes for the current slice are exhausted
+ * set end_of_slice flag to 1
+ * This slice will be treated as incomplete */
+ if((UWORD8 *)ps_codec->s_parse.s_bitstrm.pu1_buf_max + BITSTRM_OFF_THRS <
+ ((UWORD8 *)ps_codec->s_parse.s_bitstrm.pu4_buf + (ps_codec->s_parse.s_bitstrm.u4_bit_ofst / 8)))
+ {
+ // end_of_slice_flag = ps_codec->i4_slice_error ? 0 : 1;
+
+ if(0 == ps_codec->i4_slice_error)
+ end_of_slice_flag = 1;
+ }
+
+
+ if(end_of_pic)
+ break;
+ } while(!end_of_slice_flag);
+
+ /* Increment the slice index for parsing next slice */
+ if(0 == end_of_pic)
+ {
+#ifdef GPU_BUILD
+ // TODO GPU : The following logic needs different implementation.
+#endif
+ while(1)
+ {
+
+ WORD32 parse_slice_idx;
+#ifdef GPU_BUILD
+ WORD32 min_proc_slice_idx;
+ WORD32 proc_idx = (ps_codec->u4_parsing_view * 2) + (ps_codec->i4_num_cores - 1);
+ /* Identify the min slice index currently in use by processing threads */
+ min_proc_slice_idx = ps_codec->as_process[proc_idx].i4_cur_slice_idx;
+#endif
+ parse_slice_idx = ps_codec->s_parse.i4_cur_slice_idx;
+ parse_slice_idx++;
+
+#if 0
+ for(i = 1; i < (ps_codec->i4_num_cores - 1); i++)
+ {
+ if(ps_codec->as_process[i].i4_cur_slice_idx
+ < min_proc_slice_idx)
+ min_proc_slice_idx =
+ ps_codec->as_process[i].i4_cur_slice_idx;
+
+
+ }
+
+
+ /* If MAX slice header count is reached, then reset the parsing slice idx to zero */
+ if(parse_slice_idx == MAX_SLICE_HDR_CNT)
+ {
+ parse_slice_idx = 0;
+ }
+
+ /* If parse_slice_idx and min_proc_slice_idx are different then break */
+ if(parse_slice_idx != min_proc_slice_idx)
+ {
+ ps_codec->s_parse.i4_cur_slice_idx = parse_slice_idx;
+ break;
+ }
+ else
+ {
+ /* If Processing threads are still using the slice where parsing thread
+ * has to write next slice data, wait for processing threads to consume that slice
+ */
+ ithread_yield();
+ }
+#else
+ {
+ /* If the next slice header is not initialized, update cur_slice_idx and break */
+ if((1 == ps_codec->i4_num_cores) || (0 != (parse_slice_idx & (MAX_SLICE_HDR_CNT - 1))))
+ {
+ ps_codec->s_parse.i4_cur_slice_idx = parse_slice_idx;
+ break;
+ }
+
+ /* If the next slice header is initialized, wait for the parsed slices to be processed */
+ else
+ {
+#ifndef GPU_BUILD
+ WORD32 ctb_indx = 0;
+
+ while(ctb_indx != ps_sps->i4_pic_size_in_ctb)
+ {
+ WORD32 parse_status = *(ps_codec->pu1_parse_map + ctb_indx);
+ WORD32 proc_status = *(ps_codec->pu1_proc_map + ctb_indx) & 1;
+
+ if(parse_status == proc_status)
+ ctb_indx++;
+ }
+ ps_codec->s_parse.i4_cur_slice_idx = parse_slice_idx;
+ break;
+#else
+ printf("\nFix this code for multiCore multi-Slice\n");
+ exit(-1);
+#endif
+ }
+
+ }
+#endif
+ }
+
+ }
+ else
+ {
+#ifdef GPU_BUILD
+ if(1 == ps_codec->i4_num_cores)
+ {
+
+ if(!ps_pps->i1_tiles_enabled_flag)
+ {
+ process_ctxt_t *ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+ WORD32 i;
+ WORD32 tu_coeff_data_ofst = 0;
+ ps_proc->i4_ctb_cnt = total_ctb_cnt;
+ ps_proc->i4_ctb_x = 0; //ps_codec->s_parse.i4_ctb_tile_x;
+ ps_proc->i4_ctb_y = 0; //ps_codec->s_parse.i4_ctb_tile_y;
+ ps_proc->i4_cur_slice_idx = ps_gpu->ai4_cur_slice_idx[0]; //ps_codec->s_parse.i4_cur_slice_idx;
+
+ for(i = 0; i < GRANULARITY; i++)
+ {
+ ps_proc->i4_ctb_cnt = ps_gpu->ai4_ctbs_in_grain[i];
+ total_ctb_cnt -= ps_gpu->ai4_ctbs_in_grain[i];
+
+// if(i == 0)
+// {
+// ps_proc->i4_ctb_cnt -= ps_sps->i2_pic_wd_in_ctb;
+// total_ctb_cnt += ps_sps->i2_pic_wd_in_ctb;
+// }
+// else if(i == (GRANULARITY - 1))
+// {
+// ps_proc->i4_ctb_cnt += ps_sps->i2_pic_wd_in_ctb;
+// //total_ctb_cnt -= ps_sps->i2_pic_wd_in_ctb;
+// }
+
+ // TODO GPU : Buggy don't wait for I-slice.
+ if(ps_codec->u4_gpu_enabled)
+ {
+ ihevcd_gpu_mc_wait(ps_proc, i);
+ }
+
+ //printf("Calling ihevcd_init_proc_ctxt ps_proc->i4_ctb_cnt = %d\n", ps_proc->i4_ctb_cnt);
+
+ ihevcd_init_proc_ctxt(ps_proc, tu_coeff_data_ofst);
+ //printf("ihevcd_process\n");
+ ihevcd_process(ps_proc);
+ tu_coeff_data_ofst = (UWORD8 *)ps_proc->pv_tu_coeff_data - (UWORD8 *)ps_proc->pv_pic_tu_coeff_data;
+ }
+
+
+ }
+ else
+ {
+ process_ctxt_t *ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+ WORD32 i, j, k, l;
+ WORD32 tu_coeff_data_ofst = 0;
+ tile_t *ps_tile = ps_pps->ps_tile;
+
+
+ // ps_proc->i4_cur_slice_idx = ps_gpu->ai4_cur_slice_idx[0];//ps_codec->s_parse.i4_cur_slice_idx;
+ ps_proc->i4_cur_slice_idx = 0;
+
+ i = 0;
+ printf("Processing tile\n");
+ for(j = 0; j < ps_pps->i1_num_tile_rows; j++)
+ {
+ if(ps_gpu->ai4_grain_pos_y[i] == ps_tile->u1_pos_y)
+ {
+ // TODO GPU : Buggy don't wait for I-slice.
+ if(ps_codec->u4_gpu_enabled)
+ {
+ ihevcd_gpu_mc_wait(ps_proc, i);
+ }
+ i++;
+
+ }
+ for(k = 0; k < ps_pps->i1_num_tile_columns; k++)
+ {
+ ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
+ ps_proc->i4_ctb_y = ps_tile->u1_pos_y;
+
+// ps_proc->i4_cur_slice_idx = *(ps_codec->s_parse.pu1_slice_idx + ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb );
+ for(l = 0; l < ps_tile->u2_ht; l++)
+ {
+ ps_proc->i4_ctb_cnt = ps_tile->u2_wd; //* ps_tile->u2_ht;
+
+ ihevcd_init_proc_ctxt(ps_proc, tu_coeff_data_ofst);
+ //printf("ihevcd_process\n");
+ ihevcd_process(ps_proc);
+ tu_coeff_data_ofst = (UWORD8 *)ps_proc->pv_tu_coeff_data - (UWORD8 *)ps_proc->pv_pic_tu_coeff_data;
+ }
+ ps_tile++;
+ }
+ }
+
+
+
+ }
+ }
+#endif
+#if FRAME_ILF_PAD
+ if(FRAME_ILF_PAD && 1 == ps_codec->i4_num_cores)
+ {
+ if(ps_slice_hdr->i4_abs_pic_order_cnt == 0)
+ {
+ DUMP_PRE_ILF(ps_codec->as_process[0].pu1_cur_pic_luma,
+ ps_codec->as_process[0].pu1_cur_pic_chroma,
+ ps_sps->i2_pic_width_in_luma_samples,
+ ps_sps->i2_pic_height_in_luma_samples,
+ ps_codec->i4_strd);
+
+ DUMP_BS(ps_codec->as_process[0].s_bs_ctxt.pu4_pic_vert_bs,
+ ps_codec->as_process[0].s_bs_ctxt.pu4_pic_horz_bs,
+ ps_sps->i2_pic_wd_in_ctb * (ctb_size * ctb_size / 8 / 16) * ps_sps->i2_pic_ht_in_ctb,
+ (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16) * ps_sps->i2_pic_ht_in_ctb);
+
+ DUMP_QP(ps_codec->as_process[0].s_bs_ctxt.pu1_pic_qp,
+ (ps_sps->i2_pic_height_in_luma_samples * ps_sps->i2_pic_width_in_luma_samples) / (MIN_CU_SIZE * MIN_CU_SIZE));
+
+ DUMP_QP_CONST_IN_CTB(ps_codec->as_process[0].s_bs_ctxt.pu1_pic_qp_const_in_ctb,
+ (ps_sps->i2_pic_height_in_luma_samples * ps_sps->i2_pic_width_in_luma_samples) / (MIN_CTB_SIZE * MIN_CTB_SIZE) / 8);
+
+ DUMP_NO_LOOP_FILTER(ps_codec->as_process[0].pu1_pic_no_loop_filter_flag,
+ (ps_sps->i2_pic_width_in_luma_samples / MIN_CU_SIZE) * (ps_sps->i2_pic_height_in_luma_samples / MIN_CU_SIZE) / 8);
+
+ DUMP_OFFSETS(ps_slice_hdr->i1_beta_offset_div2,
+ ps_slice_hdr->i1_tc_offset_div2,
+ ps_pps->i1_pic_cb_qp_offset,
+ ps_pps->i1_pic_cr_qp_offset);
+ }
+ ps_codec->s_parse.s_deblk_ctxt.ps_pps = ps_codec->s_parse.ps_pps;
+ ps_codec->s_parse.s_deblk_ctxt.ps_sps = ps_codec->s_parse.ps_sps;
+ ps_codec->s_parse.s_deblk_ctxt.ps_codec = ps_codec;
+ ps_codec->s_parse.s_deblk_ctxt.ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+ ps_codec->s_parse.s_deblk_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+ ps_codec->s_parse.s_sao_ctxt.ps_pps = ps_codec->s_parse.ps_pps;
+ ps_codec->s_parse.s_sao_ctxt.ps_sps = ps_codec->s_parse.ps_sps;
+ ps_codec->s_parse.s_sao_ctxt.ps_codec = ps_codec;
+ ps_codec->s_parse.s_sao_ctxt.ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+
+ ihevcd_ilf_pad_frame(&ps_codec->s_parse.s_deblk_ctxt, &ps_codec->s_parse.s_sao_ctxt);
+
+ }
+#endif
+ ps_codec->s_parse.i4_end_of_frame = 1;
+ }
+ return ret;
+}
+
+
+
+
+
+
+
+
diff --git a/decoder/ihevcd_parse_slice.h b/decoder/ihevcd_parse_slice.h
new file mode 100644
index 0000000..ca518f6
--- /dev/null
+++ b/decoder/ihevcd_parse_slice.h
@@ -0,0 +1,43 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_slice.h
+*
+* @brief
+* Parsing of slice level data
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_SLICE_H_
+#define _IHEVCD_PARSE_SLICE_H_
+
+
+IHEVCD_ERROR_T ihevcd_parse_mvd(codec_t *ps_codec, mv_t *ps_mv);
+IHEVCD_ERROR_T ihevcd_parse_slice_data(codec_t *ps_codec);
+#endif /* _IHEVCD_PARSE_SLICE_H_ */
diff --git a/decoder/ihevcd_parse_slice_header.c b/decoder/ihevcd_parse_slice_header.c
new file mode 100644
index 0000000..7bb6084
--- /dev/null
+++ b/decoder/ihevcd_parse_slice_header.c
@@ -0,0 +1,1090 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_slice_header.c
+*
+* @brief
+* Contains functions for parsing slice headers
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_quant_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_parse_headers.h"
+#include "ihevcd_parse_slice_header.h"
+#include "ihevcd_ref_list.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+
+mv_buf_t* ihevcd_mv_mgr_get_poc(buf_mgr_t *ps_mv_buf_mgr, UWORD32 abs_poc);
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses VPS operation point
+*
+* @par Description
+* Parses VPS operation point as per section 7.3.5
+*
+* @param[out] ps_vps
+* Pointer to VPS structure
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in] ops_idx
+* Operating point index
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_operation_point_set(vps_t *ps_vps, bitstrm_t *ps_bitstrm, WORD32 ops_idx)
+{
+ WORD32 i;
+ WORD32 value;
+ UNUSED(ops_idx);
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ for(i = 0; i <= ps_vps->i1_vps_max_nuh_reserved_zero_layer_id; i++)
+ {
+ BITS_PARSE("layer_id_included_flag[ opsIdx ][ i ]", value, ps_bitstrm, 1);
+ //ps_vps->ai1_layer_id_included_flag[ops_idx][i] = value;
+
+ }
+ UNUSED(value);
+
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parses reference picture list modification syntax
+* (Section 7.3.8.3)
+*
+* @par Description:
+* Parses the reference picture list modification syntax and updates
+* the rplm_t structure in the slice header
+*
+* @param[in] ps_bitstrm
+* Pointer to bitstream structure
+*
+* @param[in, out] ps_slice_hdr
+* Pointer to slice header structure
+*
+* @param[in] num_poc_total_curr
+* Total number of reference pictures used by the current slice
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+WORD32 ihevcd_ref_pic_list_modification(bitstrm_t *ps_bitstrm,
+ slice_header_t *ps_slice_hdr,
+ WORD32 num_poc_total_curr)
+{
+ WORD32 ret = IHEVCD_SUCCESS;
+ WORD32 value;
+ WORD32 i;
+ rplm_t *ps_rplm;
+ WORD32 num_bits_list_entry;
+
+ ps_rplm = &(ps_slice_hdr->s_rplm);
+
+ /* Calculate Ceil(Log2(num_poc_total_curr)) */
+ {
+ num_bits_list_entry = 32 - CLZ(num_poc_total_curr);
+ /* Check if num_poc_total_curr is power of 2 */
+ if(0 == (num_poc_total_curr & (num_poc_total_curr - 1)))
+ {
+ num_bits_list_entry--;
+ }
+ }
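+ /* Illustration of the above: for num_poc_total_curr = 5, 32 - CLZ(5) = 3
+ * and 5 is not a power of two, so 3 bits are used; for num_poc_total_curr = 4,
+ * 32 - CLZ(4) = 3 but 4 is a power of two, giving Ceil(Log2(4)) = 2 bits */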
+
+ if(ps_slice_hdr->i1_slice_type == PSLICE || ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+ BITS_PARSE("ref_pic_list_modification_flag_l0", value, ps_bitstrm, 1);
+ ps_rplm->i1_ref_pic_list_modification_flag_l0 = value;
+
+ if(ps_rplm->i1_ref_pic_list_modification_flag_l0)
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l0_active; i++)
+ {
+ BITS_PARSE("list_entry_l0", value, ps_bitstrm, num_bits_list_entry);
+ ps_rplm->i1_list_entry_l0[i] = value;
+
+ ps_rplm->i1_list_entry_l0[i] = CLIP3(ps_rplm->i1_list_entry_l0[i], 0, num_poc_total_curr - 1);
+ }
+ }
+
+ if(ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+ BITS_PARSE("ref_pic_list_modification_flag_l1", value, ps_bitstrm, 1);
+ ps_rplm->i1_ref_pic_list_modification_flag_l1 = value;
+
+ if(ps_rplm->i1_ref_pic_list_modification_flag_l1)
+ for(i = 0; i < ps_slice_hdr->i1_num_ref_idx_l1_active; i++)
+ {
+ BITS_PARSE("list_entry_l1", value, ps_bitstrm, num_bits_list_entry);
+ ps_rplm->i1_list_entry_l1[i] = value;
+
+ ps_rplm->i1_list_entry_l1[i] = CLIP3(ps_rplm->i1_list_entry_l1[i], 0, num_poc_total_curr - 1);
+ }
+
+ }
+
+ return ret;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Parse Slice Header
+* slice_header_syntax()
+*
+* @par Description:
+* Parse Slice Header as per Section: 7.3.8
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error code from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+
+IHEVCD_ERROR_T ihevcd_parse_slice_header(codec_t *ps_codec,
+ nal_header_t *ps_nal)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 value;
+ WORD32 i;
+ WORD32 sps_id;
+
+ pps_t *ps_pps;
+ sps_t *ps_sps;
+ slice_header_t *ps_slice_hdr;
+ WORD32 disable_deblocking_filter_flag;
+ bitstrm_t *ps_bitstrm = &ps_codec->s_parse.s_bitstrm;
+ WORD32 idr_pic_flag;
+ WORD32 pps_id;
+ WORD32 first_slice_in_pic_flag;
+ WORD32 no_output_of_prior_pics_flag = 0;
+ WORD8 i1_nal_unit_type = ps_nal->i1_nal_unit_type;
+ WORD32 num_poc_total_curr = 0;
+ WORD32 slice_address;
+
+ if(ps_codec->i4_slice_error == 1)
+ return ret;
+
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_codec->s_parse.ps_slice_hdr_base = ps_codec->aps_slice_hdr_base[ps_codec->u4_parsing_view];
+#endif
+ idr_pic_flag = (NAL_IDR_W_LP == i1_nal_unit_type) ||
+ (NAL_IDR_N_LP == i1_nal_unit_type);
+
+
+ BITS_PARSE("first_slice_in_pic_flag", first_slice_in_pic_flag, ps_bitstrm, 1);
+ if((NAL_BLA_W_LP <= i1_nal_unit_type) &&
+ (NAL_RSV_RAP_VCL23 >= i1_nal_unit_type))
+ {
+ BITS_PARSE("no_output_of_prior_pics_flag", no_output_of_prior_pics_flag, ps_bitstrm, 1);
+ }
+ UEV_PARSE("pic_parameter_set_id", pps_id, ps_bitstrm);
+ pps_id = CLIP3(pps_id, 0, MAX_PPS_CNT - 2);
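+ /* The clip above guards against corrupt streams by keeping pps_id within
+ * the valid range [0, MAX_PPS_CNT - 2] */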
+
+ /* Get the current PPS structure */
+ ps_pps = ps_codec->s_parse.ps_pps_base + pps_id;
+ if(0 == ps_pps->i1_pps_valid)
+ {
+ pps_t *ps_pps_ref = ps_codec->ps_pps_base;
+ while(0 == ps_pps_ref->i1_pps_valid)
+ ps_pps_ref++;
+
+ if((ps_pps_ref - ps_codec->ps_pps_base >= MAX_PPS_CNT - 1))
+ return IHEVCD_INVALID_HEADER;
+
+ ihevcd_copy_pps(ps_codec, pps_id, ps_pps_ref->i1_pps_id);
+ }
+
+ /* Get SPS id for the current PPS */
+ sps_id = ps_pps->i1_sps_id;
+
+ /* Get the current SPS structure */
+ ps_sps = ps_codec->s_parse.ps_sps_base + sps_id;
+
+ /* When the current slice is the first in a pic,
+ * check whether the previous frame is complete.
+ * If the previous frame is incomplete,
+ * treat the remaining CTBs as skip */
+ if((0 != ps_codec->u4_pic_cnt || ps_codec->i4_pic_present) &&
+ first_slice_in_pic_flag)
+ {
+ if(ps_codec->i4_pic_present)
+ {
+ slice_header_t *ps_slice_hdr_next;
+ ps_codec->i4_slice_error = 1;
+ ps_codec->s_parse.i4_cur_slice_idx--;
+ if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+ ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+ ps_slice_hdr_next->i2_ctb_x = 0;
+ ps_slice_hdr_next->i2_ctb_y = ps_codec->s_parse.ps_sps->i2_pic_ht_in_ctb;
+ return ret;
+ }
+ else
+ {
+ ps_codec->i4_slice_error = 0;
+ }
+ }
+
+ if(first_slice_in_pic_flag)
+ {
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+ }
+ else
+ {
+ /* If the current slice is not the first slice in the pic,
+ * but the first one to be parsed, set the current slice index to 1.
+ * Treat the first slice as missing and copy the current slice header
+ * to the first one */
+ if(0 == ps_codec->i4_pic_present)
+ ps_codec->s_parse.i4_cur_slice_idx = 1;
+ }
+
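+ /* Masking with (MAX_SLICE_HDR_CNT - 1) below treats the slice header array
+ * as a ring buffer; this relies on MAX_SLICE_HDR_CNT being a power of two */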
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+
+#ifdef GPU_BUILD
+ /* OpenCL Ping Pong buffer */
+ // TODO GPU : Find out why this memcpy is required.
+ if(ps_codec->u4_parsing_view == 1)
+ {
+ //ps_slice_hdr += MAX_SLICE_HDR_CNT;
+ memcpy(ps_slice_hdr, ps_slice_hdr - MAX_SLICE_HDR_CNT, sizeof(slice_header_t));
+ }
+ else if(ps_codec->u4_parsing_view == 0)
+ {
+ if(1 != ps_codec->i4_num_cores)
+ memcpy(ps_slice_hdr, ps_slice_hdr + MAX_SLICE_HDR_CNT, sizeof(slice_header_t));
+ }
+#endif
+
+ if((ps_pps->i1_dependent_slice_enabled_flag) &&
+ (!first_slice_in_pic_flag))
+ {
+ BITS_PARSE("dependent_slice_flag", value, ps_bitstrm, 1);
+
+ /* If dependent slice, copy slice header from previous slice */
+ if(value && (ps_codec->s_parse.i4_cur_slice_idx > 0))
+ {
+ ihevcd_copy_slice_hdr(ps_codec,
+ (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1)),
+ ((ps_codec->s_parse.i4_cur_slice_idx - 1) & (MAX_SLICE_HDR_CNT - 1)));
+ }
+ ps_slice_hdr->i1_dependent_slice_flag = value;
+ }
+ else
+ {
+ ps_slice_hdr->i1_dependent_slice_flag = 0;
+ }
+ ps_slice_hdr->i1_nal_unit_type = i1_nal_unit_type;
+ ps_slice_hdr->i1_pps_id = pps_id;
+ ps_slice_hdr->i1_first_slice_in_pic_flag = first_slice_in_pic_flag;
+
+ ps_slice_hdr->i1_no_output_of_prior_pics_flag = 1;
+ if((NAL_BLA_W_LP <= i1_nal_unit_type) &&
+ (NAL_RSV_RAP_VCL23 >= i1_nal_unit_type))
+ {
+ ps_slice_hdr->i1_no_output_of_prior_pics_flag = no_output_of_prior_pics_flag;
+ }
+ ps_slice_hdr->i1_pps_id = pps_id;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ WORD32 num_bits;
+
+ /* Use CLZ to compute Ceil( Log2( PicSizeInCtbsY ) ) */
+ num_bits = 32 - CLZ(ps_sps->i4_pic_size_in_ctb - 1);
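+ /* e.g. for a picture of 240 CTBs, 32 - CLZ(239) = 8 bits are read,
+ * which equals Ceil(Log2(240)) as required for slice_address */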
+ BITS_PARSE("slice_address", value, ps_bitstrm, num_bits);
+
+ slice_address = value;
+ /* If slice address is greater than the number of CTBs in a picture,
+ * ignore the slice */
+ if(value >= ps_sps->i4_pic_size_in_ctb)
+ return IHEVCD_IGNORE_SLICE;
+ }
+ else
+ {
+ slice_address = 0;
+ }
+
+ if(!ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ ps_slice_hdr->i1_pic_output_flag = 1;
+ ps_slice_hdr->i4_pic_order_cnt_lsb = 0;
+ ps_slice_hdr->i1_num_long_term_sps = 0;
+ ps_slice_hdr->i1_num_long_term_pics = 0;
+
+ for(i = 0; i < ps_pps->i1_num_extra_slice_header_bits; i++)
+ {
+ BITS_PARSE("slice_reserved_undetermined_flag[ i ]", value, ps_bitstrm, 1);
+ //slice_reserved_undetermined_flag[ i ]
+ }
+ UEV_PARSE("slice_type", value, ps_bitstrm);
+ ps_slice_hdr->i1_slice_type = value;
+
+ /* If the picture is IRAP, slice type must be equal to ISLICE */
+ if((ps_slice_hdr->i1_nal_unit_type >= NAL_BLA_W_LP) &&
+ (ps_slice_hdr->i1_nal_unit_type <= NAL_RSV_RAP_VCL23))
+ ps_slice_hdr->i1_slice_type = ISLICE;
+
+ if((ps_slice_hdr->i1_slice_type < 0) ||
+ (ps_slice_hdr->i1_slice_type > 2))
+ return IHEVCD_IGNORE_SLICE;
+
+ if(ps_pps->i1_output_flag_present_flag)
+ {
+ BITS_PARSE("pic_output_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_pic_output_flag = value;
+ }
+ ps_slice_hdr->i1_colour_plane_id = 0;
+ if(1 == ps_sps->i1_separate_colour_plane_flag)
+ {
+ BITS_PARSE("colour_plane_id", value, ps_bitstrm, 2);
+ ps_slice_hdr->i1_colour_plane_id = value;
+ }
+ ps_slice_hdr->i1_slice_temporal_mvp_enable_flag = 0;
+
+ if(!idr_pic_flag)
+ {
+
+ WORD32 st_rps_idx;
+ WORD32 num_neg_pics;
+ WORD32 num_pos_pics;
+ WORD8 *pi1_used;
+
+ BITS_PARSE("pic_order_cnt_lsb", value, ps_bitstrm, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+ //value = ihevcd_extend_sign_bit(value, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+ ps_slice_hdr->i4_pic_order_cnt_lsb = value;
+
+ BITS_PARSE("short_term_ref_pic_set_sps_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag = value;
+
+ if(1 == ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag)
+ {
+ WORD32 numbits;
+
+ numbits = 32 - CLZ(ps_sps->i1_num_short_term_ref_pic_sets - 1);
+ BITS_PARSE("short_term_ref_pic_set_idx", value, ps_bitstrm, numbits);
+ ps_slice_hdr->i1_short_term_ref_pic_set_idx = value;
+ ps_slice_hdr->i1_short_term_ref_pic_set_idx = CLIP3(ps_slice_hdr->i1_short_term_ref_pic_set_idx, 0, MAX_STREF_PICS_SPS - 1);
+
+ st_rps_idx = ps_slice_hdr->i1_short_term_ref_pic_set_idx;
+ num_neg_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_neg_pics;
+ num_pos_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_pos_pics;
+ pi1_used = ps_sps->as_stref_picset[st_rps_idx].ai1_used;
+ }
+ else
+ {
+ ihevcd_short_term_ref_pic_set(ps_bitstrm,
+ &ps_sps->as_stref_picset[0],
+ ps_sps->i1_num_short_term_ref_pic_sets,
+ ps_sps->i1_num_short_term_ref_pic_sets,
+ &ps_slice_hdr->s_stref_picset);
+
+ st_rps_idx = ps_sps->i1_num_short_term_ref_pic_sets;
+ num_neg_pics = ps_slice_hdr->s_stref_picset.i1_num_neg_pics;
+ num_pos_pics = ps_slice_hdr->s_stref_picset.i1_num_pos_pics;
+ pi1_used = ps_slice_hdr->s_stref_picset.ai1_used;
+ }
+
+ if(ps_sps->i1_long_term_ref_pics_present_flag)
+ {
+ if(ps_sps->i1_num_long_term_ref_pics_sps > 0)
+ {
+ UEV_PARSE("num_long_term_sps", value, ps_bitstrm);
+ ps_slice_hdr->i1_num_long_term_sps = value;
+
+ ps_slice_hdr->i1_num_long_term_sps = CLIP3(ps_slice_hdr->i1_num_long_term_sps,
+ 0, MAX_DPB_SIZE - num_neg_pics - num_pos_pics);
+ }
+ UEV_PARSE("num_long_term_pics", value, ps_bitstrm);
+ ps_slice_hdr->i1_num_long_term_pics = value;
+ ps_slice_hdr->i1_num_long_term_pics = CLIP3(ps_slice_hdr->i1_num_long_term_pics,
+ 0, MAX_DPB_SIZE - num_neg_pics - num_pos_pics -
+ ps_slice_hdr->i1_num_long_term_sps);
+
+ for(i = 0; i < (ps_slice_hdr->i1_num_long_term_sps +
+ ps_slice_hdr->i1_num_long_term_pics); i++)
+ {
+ if(i < ps_slice_hdr->i1_num_long_term_sps)
+ {
+ /* Use CLZ to compute Ceil( Log2( num_long_term_ref_pics_sps ) ) */
+ WORD32 num_bits = 32 - CLZ(ps_sps->i1_num_long_term_ref_pics_sps);
+ BITS_PARSE("lt_idx_sps[ i ]", value, ps_bitstrm, num_bits);
+ ps_slice_hdr->ai4_poc_lsb_lt[i] = ps_sps->ai1_lt_ref_pic_poc_lsb_sps[value];
+ ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i] = ps_sps->ai1_used_by_curr_pic_lt_sps_flag[value];
+
+ }
+ else
+ {
+ BITS_PARSE("poc_lsb_lt[ i ]", value, ps_bitstrm, ps_sps->i1_log2_max_pic_order_cnt_lsb);
+ ps_slice_hdr->ai4_poc_lsb_lt[i] = value;
+
+ BITS_PARSE("used_by_curr_pic_lt_flag[ i ]", value, ps_bitstrm, 1);
+ ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i] = value;
+
+ }
+ BITS_PARSE("delta_poc_msb_present_flag[ i ]", value, ps_bitstrm, 1);
+ ps_slice_hdr->ai1_delta_poc_msb_present_flag[i] = value;
+
+
+ ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] = 0;
+ if(ps_slice_hdr->ai1_delta_poc_msb_present_flag[i])
+ {
+
+ UEV_PARSE("delta_poc_msb_cycle_lt[ i ]", value, ps_bitstrm);
+ ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] = value;
+ }
+
+ if((i != 0) && (i != ps_slice_hdr->i1_num_long_term_sps))
+ {
+ ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] += ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i - 1];
+ }
+
+ }
+ }
+
+ for(i = 0; i < num_neg_pics + num_pos_pics; i++)
+ {
+ if(pi1_used[i])
+ {
+ num_poc_total_curr++;
+ }
+ }
+ for(i = 0; i < ps_slice_hdr->i1_num_long_term_sps + ps_slice_hdr->i1_num_long_term_pics; i++)
+ {
+ if(ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i])
+ {
+ num_poc_total_curr++;
+ }
+ }
+
+
+ if(ps_sps->i1_sps_temporal_mvp_enable_flag)
+ {
+ BITS_PARSE("enable_temporal_mvp_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_slice_temporal_mvp_enable_flag = value;
+ }
+
+ }
+ ps_slice_hdr->i1_slice_sao_luma_flag = 0;
+ ps_slice_hdr->i1_slice_sao_chroma_flag = 0;
+ if(ps_sps->i1_sample_adaptive_offset_enabled_flag)
+ {
+ BITS_PARSE("slice_sao_luma_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_slice_sao_luma_flag = value;
+
+ BITS_PARSE("slice_sao_chroma_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_slice_sao_chroma_flag = value;
+
+ }
+
+ ps_slice_hdr->i1_max_num_merge_cand = 1;
+ ps_slice_hdr->i1_cabac_init_flag = 0;
+
+ ps_slice_hdr->i1_num_ref_idx_l0_active = 0;
+ ps_slice_hdr->i1_num_ref_idx_l1_active = 0;
+ ps_slice_hdr->i1_slice_cb_qp_offset = 0;
+ ps_slice_hdr->i1_slice_cr_qp_offset = 0;
+ if((PSLICE == ps_slice_hdr->i1_slice_type) ||
+ (BSLICE == ps_slice_hdr->i1_slice_type))
+ {
+ BITS_PARSE("num_ref_idx_active_override_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_num_ref_idx_active_override_flag = value;
+
+ if(ps_slice_hdr->i1_num_ref_idx_active_override_flag)
+ {
+ UEV_PARSE("num_ref_idx_l0_active_minus1", value, ps_bitstrm);
+ ps_slice_hdr->i1_num_ref_idx_l0_active = value + 1;
+
+ if(BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ UEV_PARSE("num_ref_idx_l1_active_minus1", value, ps_bitstrm);
+ ps_slice_hdr->i1_num_ref_idx_l1_active = value + 1;
+ }
+
+ }
+ else
+ {
+ ps_slice_hdr->i1_num_ref_idx_l0_active = ps_pps->i1_num_ref_idx_l0_default_active;
+
+ if(BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ ps_slice_hdr->i1_num_ref_idx_l1_active = ps_pps->i1_num_ref_idx_l1_default_active;
+ }
+ }
+
+ ps_slice_hdr->i1_num_ref_idx_l0_active = CLIP3(ps_slice_hdr->i1_num_ref_idx_l0_active, 0, MAX_DPB_SIZE - 1);
+ ps_slice_hdr->i1_num_ref_idx_l1_active = CLIP3(ps_slice_hdr->i1_num_ref_idx_l1_active, 0, MAX_DPB_SIZE - 1);
+
+ if(0 == num_poc_total_curr)
+ return IHEVCD_IGNORE_SLICE;
+ if((ps_pps->i1_lists_modification_present_flag) && (num_poc_total_curr > 1))
+ {
+ ihevcd_ref_pic_list_modification(ps_bitstrm,
+ ps_slice_hdr, num_poc_total_curr);
+ }
+ else
+ {
+ ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l0 = 0;
+ ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l1 = 0;
+ }
+
+ if(BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ BITS_PARSE("mvd_l1_zero_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_mvd_l1_zero_flag = value;
+ }
+
+ ps_slice_hdr->i1_cabac_init_flag = 0;
+ if(ps_pps->i1_cabac_init_present_flag)
+ {
+ BITS_PARSE("cabac_init_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_cabac_init_flag = value;
+
+ }
+ ps_slice_hdr->i1_collocated_from_l0_flag = 1;
+ ps_slice_hdr->i1_collocated_ref_idx = 0;
+ if(ps_slice_hdr->i1_slice_temporal_mvp_enable_flag)
+ {
+ if(BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ BITS_PARSE("collocated_from_l0_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_collocated_from_l0_flag = value;
+ }
+
+ if((ps_slice_hdr->i1_collocated_from_l0_flag && (ps_slice_hdr->i1_num_ref_idx_l0_active > 1)) ||
+ (!ps_slice_hdr->i1_collocated_from_l0_flag && (ps_slice_hdr->i1_num_ref_idx_l1_active > 1)))
+ {
+ UEV_PARSE("collocated_ref_idx", value, ps_bitstrm);
+ ps_slice_hdr->i1_collocated_ref_idx = value;
+ }
+
+ }
+ ps_slice_hdr->i1_collocated_ref_idx = CLIP3(ps_slice_hdr->i1_collocated_ref_idx, 0, MAX_DPB_SIZE - 1);
+
+ if((ps_pps->i1_weighted_pred_flag && (PSLICE == ps_slice_hdr->i1_slice_type)) ||
+ (ps_pps->i1_weighted_bipred_flag && (BSLICE == ps_slice_hdr->i1_slice_type)))
+ {
+ ihevcd_parse_pred_wt_ofst(ps_bitstrm, ps_sps, ps_pps, ps_slice_hdr);
+ }
+ UEV_PARSE("five_minus_max_num_merge_cand", value, ps_bitstrm);
+ ps_slice_hdr->i1_max_num_merge_cand = 5 - value;
+
+ }
+ ps_slice_hdr->i1_max_num_merge_cand = CLIP3(ps_slice_hdr->i1_max_num_merge_cand, 1, 5);
+ SEV_PARSE("slice_qp_delta", value, ps_bitstrm);
+ ps_slice_hdr->i1_slice_qp_delta = value;
+
+ if(ps_pps->i1_pic_slice_level_chroma_qp_offsets_present_flag)
+ {
+ SEV_PARSE("slice_cb_qp_offset", value, ps_bitstrm);
+ ps_slice_hdr->i1_slice_cb_qp_offset = value;
+
+ SEV_PARSE("slice_cr_qp_offset", value, ps_bitstrm);
+ ps_slice_hdr->i1_slice_cr_qp_offset = value;
+
+ }
+ ps_slice_hdr->i1_deblocking_filter_override_flag = 0;
+ ps_slice_hdr->i1_slice_disable_deblocking_filter_flag = ps_pps->i1_pic_disable_deblocking_filter_flag;
+ ps_slice_hdr->i1_beta_offset_div2 = ps_pps->i1_beta_offset_div2;
+ ps_slice_hdr->i1_tc_offset_div2 = ps_pps->i1_tc_offset_div2;
+
+ disable_deblocking_filter_flag = ps_pps->i1_pic_disable_deblocking_filter_flag;
+
+ if(ps_pps->i1_deblocking_filter_control_present_flag)
+ {
+
+ if(ps_pps->i1_deblocking_filter_override_enabled_flag)
+ {
+ BITS_PARSE("deblocking_filter_override_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_deblocking_filter_override_flag = value;
+ }
+
+ if(ps_slice_hdr->i1_deblocking_filter_override_flag)
+ {
+ BITS_PARSE("slice_disable_deblocking_filter_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_slice_disable_deblocking_filter_flag = value;
+ disable_deblocking_filter_flag = ps_slice_hdr->i1_slice_disable_deblocking_filter_flag;
+
+ if(!ps_slice_hdr->i1_slice_disable_deblocking_filter_flag)
+ {
+ SEV_PARSE("beta_offset_div2", value, ps_bitstrm);
+ ps_slice_hdr->i1_beta_offset_div2 = value;
+
+ SEV_PARSE("tc_offset_div2", value, ps_bitstrm);
+ ps_slice_hdr->i1_tc_offset_div2 = value;
+
+ }
+ }
+ }
+
+ ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag = ps_pps->i1_loop_filter_across_slices_enabled_flag;
+ if(ps_pps->i1_loop_filter_across_slices_enabled_flag &&
+ (ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag || !disable_deblocking_filter_flag))
+ {
+ BITS_PARSE("slice_loop_filter_across_slices_enabled_flag", value, ps_bitstrm, 1);
+ ps_slice_hdr->i1_slice_loop_filter_across_slices_enabled_flag = value;
+ }
+
+ }
+
+ /* Check sanity of slice */
+ if((!first_slice_in_pic_flag) &&
+ (ps_codec->i4_pic_present))
+ {
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ slice_header_t *ps_slice_hdr_base = ps_codec->aps_slice_hdr_base[ps_codec->u4_parsing_view];
+#else
+ slice_header_t *ps_slice_hdr_base = ps_codec->ps_slice_hdr_base;
+#endif
+
+#if 0
+ if((ps_slice_hdr_base->i1_pps_id != ps_slice_hdr->i1_pps_id) ||
+ (ps_slice_hdr_base->i1_pic_output_flag != ps_slice_hdr->i1_pic_output_flag) ||
+ (ps_slice_hdr_base->i1_no_output_of_prior_pics_flag != ps_slice_hdr->i1_no_output_of_prior_pics_flag) ||
+ (ps_slice_hdr_base->i4_pic_order_cnt_lsb != ps_slice_hdr->i4_pic_order_cnt_lsb) ||
+ (ps_slice_hdr_base->i1_short_term_ref_pic_set_sps_flag != ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag) ||
+ (ps_slice_hdr_base->i1_short_term_ref_pic_set_idx != ps_slice_hdr->i1_short_term_ref_pic_set_idx) ||
+ (ps_slice_hdr_base->i1_num_long_term_sps != ps_slice_hdr->i1_num_long_term_sps) ||
+ (ps_slice_hdr_base->i1_num_long_term_pics != ps_slice_hdr->i1_num_long_term_pics) ||
+ (ps_slice_hdr_base->i1_slice_temporal_mvp_enable_flag != ps_slice_hdr->i1_slice_temporal_mvp_enable_flag))
+ {
+ return IHEVCD_IGNORE_SLICE;
+ }
+#else
+
+ /* According to the standard, all of the above conditions must be satisfied,
+ * but for error resilience only the following conditions are checked */
+ if((ps_slice_hdr_base->i1_pps_id != ps_slice_hdr->i1_pps_id) ||
+ (ps_slice_hdr_base->i4_pic_order_cnt_lsb != ps_slice_hdr->i4_pic_order_cnt_lsb))
+ {
+ return IHEVCD_IGNORE_SLICE;
+ }
+#endif
+
+ }
+
+
+ if(0 == ps_codec->i4_pic_present)
+ {
+ ps_slice_hdr->i4_abs_pic_order_cnt = ihevcd_calc_poc(ps_codec, ps_nal, ps_sps->i1_log2_max_pic_order_cnt_lsb, ps_slice_hdr->i4_pic_order_cnt_lsb);
+ }
+ else
+ {
+ ps_slice_hdr->i4_abs_pic_order_cnt = ps_codec->s_parse.i4_abs_pic_order_cnt;
+ }
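+ /* When no picture is in progress, this is the first slice of a new pic and
+ * its absolute POC is derived here; later slices of the same pic reuse the
+ * POC stored in the parse context */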
+
+
+ if(!first_slice_in_pic_flag)
+ {
+ /* Check if the current slice belongs to the same pic (Pic being parsed) */
+ if(ps_codec->s_parse.i4_abs_pic_order_cnt == ps_slice_hdr->i4_abs_pic_order_cnt)
+ {
+
+ /* If the Next CTB's index is less than the slice address,
+ * the previous slice is incomplete.
+ * Indicate slice error, and treat the remaining CTBs as skip */
+ if(slice_address > ps_codec->s_parse.i4_next_ctb_indx)
+ {
+ if(ps_codec->i4_pic_present)
+ {
+ ps_codec->i4_slice_error = 1;
+ ps_codec->s_parse.i4_cur_slice_idx--;
+ if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+ return ret;
+ }
+ else
+ {
+ return IHEVCD_IGNORE_SLICE;
+ }
+ }
+ /* If the slice address is less than the next CTB's index,
+ * extra CTBs have been decoded in the previous slice.
+ * Ignore the current slice. Treat it as incomplete */
+ else if(slice_address < ps_codec->s_parse.i4_next_ctb_indx)
+ {
+ return IHEVCD_IGNORE_SLICE;
+ }
+ else
+ {
+ ps_codec->i4_slice_error = 0;
+ }
+ }
+
+ /* The current slice does not belong to the pic that is being parsed */
+ else
+ {
+ /* The previous pic is incomplete.
+ * Treat the remaining CTBs as skip */
+ if(ps_codec->i4_pic_present)
+ {
+ slice_header_t *ps_slice_hdr_next;
+ ps_codec->i4_slice_error = 1;
+ ps_codec->s_parse.i4_cur_slice_idx--;
+ if(ps_codec->s_parse.i4_cur_slice_idx < 0)
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+ ps_slice_hdr_next = ps_codec->s_parse.ps_slice_hdr_base + ((ps_codec->s_parse.i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+ ps_slice_hdr_next->i2_ctb_x = 0;
+ ps_slice_hdr_next->i2_ctb_y = ps_codec->s_parse.ps_sps->i2_pic_ht_in_ctb;
+ return ret;
+ }
+
+ /* If the previous pic is complete,
+ * return if the current slice is dependent;
+ * otherwise, update the parse context's POC */
+ else
+ {
+ if(ps_slice_hdr->i1_dependent_slice_flag)
+ return IHEVCD_IGNORE_SLICE;
+
+ ps_codec->s_parse.i4_abs_pic_order_cnt = ps_slice_hdr->i4_abs_pic_order_cnt;
+ }
+ }
+ }
+
+ /* If the slice is the first slice in the pic, update the parse context's POC */
+ else
+ {
+ /* If the first slice is repeated, ignore the second occurrence
+ * If any other slice is repeated, the CTB addr will be greater than the slice addr,
+ * and hence the second occurrence is ignored */
+ if(ps_codec->s_parse.i4_abs_pic_order_cnt == ps_slice_hdr->i4_abs_pic_order_cnt)
+ return IHEVCD_IGNORE_SLICE;
+
+ ps_codec->s_parse.i4_abs_pic_order_cnt = ps_slice_hdr->i4_abs_pic_order_cnt;
+ }
+
+ // printf("POC: %d\n", ps_slice_hdr->i4_abs_pic_order_cnt);
+ // AEV_TRACE("POC", ps_slice_hdr->i4_abs_pic_order_cnt, 0);
+ ps_slice_hdr->i4_num_entry_point_offsets = 0;
+ if((ps_pps->i1_tiles_enabled_flag) ||
+ (ps_pps->i1_entropy_coding_sync_enabled_flag))
+ {
+ UEV_PARSE("num_entry_point_offsets", value, ps_bitstrm);
+ ps_slice_hdr->i4_num_entry_point_offsets = value;
+
+ {
+ WORD32 max_num_entry_point_offsets;
+ if((ps_pps->i1_tiles_enabled_flag) &&
+ (ps_pps->i1_entropy_coding_sync_enabled_flag))
+ {
+ max_num_entry_point_offsets = ps_pps->i1_num_tile_columns * (ps_sps->i2_pic_ht_in_ctb - 1);
+ }
+ else if(ps_pps->i1_tiles_enabled_flag)
+ {
+ max_num_entry_point_offsets = ps_pps->i1_num_tile_columns * ps_pps->i1_num_tile_rows;
+ }
+ else
+ {
+ max_num_entry_point_offsets = (ps_sps->i2_pic_ht_in_ctb - 1);
+ }
+
+ ps_slice_hdr->i4_num_entry_point_offsets = CLIP3(ps_slice_hdr->i4_num_entry_point_offsets,
+ 0, max_num_entry_point_offsets);
+ }
+
+ if(ps_slice_hdr->i4_num_entry_point_offsets > 0)
+ {
+ UEV_PARSE("offset_len_minus1", value, ps_bitstrm);
+ ps_slice_hdr->i1_offset_len = value + 1;
+
+ for(i = 0; i < ps_slice_hdr->i4_num_entry_point_offsets; i++)
+ {
+ BITS_PARSE("entry_point_offset", value, ps_bitstrm, ps_slice_hdr->i1_offset_len);
+
+ /* TODO: pu4_entry_point_offset needs to be initialized */
+ //ps_slice_hdr->pu4_entry_point_offset[i] = value;
+ }
+
+ }
+ }
+
+ if(ps_pps->i1_slice_header_extension_present_flag)
+ {
+ UEV_PARSE("slice_header_extension_length", value, ps_bitstrm);
+ ps_slice_hdr->i2_slice_header_extension_length = value;
+
+
+ for(i = 0; i < ps_slice_hdr->i2_slice_header_extension_length; i++)
+ {
+ BITS_PARSE("slice_header_extension_data_byte", value, ps_bitstrm, 8);
+ }
+
+ }
+
+ ihevcd_bits_flush_to_byte_boundary(ps_bitstrm);
+
+ {
+ dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+ WORD32 r_idx;
+
+ if((NAL_IDR_W_LP == ps_slice_hdr->i1_nal_unit_type) ||
+ (NAL_IDR_N_LP == ps_slice_hdr->i1_nal_unit_type) ||
+ (NAL_BLA_N_LP == ps_slice_hdr->i1_nal_unit_type) ||
+ (NAL_BLA_W_DLP == ps_slice_hdr->i1_nal_unit_type) ||
+ (NAL_BLA_W_LP == ps_slice_hdr->i1_nal_unit_type) ||
+ (0 == ps_codec->u4_pic_cnt))
+ {
+#ifdef GPU_BUILD
+ /* TODO GPU : Following fix not tested. */
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+ ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref = UNUSED_FOR_REF;
+ }
+
+#else
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+ {
+ pic_buf_t *ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+ mv_buf_t *ps_mv_buf;
+ WORD32 mv_buf_idx;
+
+ /* Long term index is set to MAX_DPB_BUFS to ensure it is not added as LT */
+ ihevc_dpb_mgr_del_ref((dpb_mgr_t *)ps_codec->pv_dpb_mgr, (buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf->i4_abs_poc);
+ /* Find buffer id of the MV bank corresponding to the buffer being freed
+ * (buffer with POC equal to i4_abs_poc). A separate index is used for this
+ * search so that the outer DPB loop counter 'i' is not clobbered */
+ ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
+ for(mv_buf_idx = 0; mv_buf_idx < BUF_MGR_MAX_CNT; mv_buf_idx++)
+ {
+ if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+ {
+ ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, mv_buf_idx, BUF_MGR_REF);
+ break;
+ }
+ ps_mv_buf++;
+ }
+
+ }
+
+ }
+
+ /* Initialize the reference lists to NULL.
+ * This is done to take care of cases where the first pic is not an IDR,
+ * but the reference list is not created for the first pic because the
+ * pic count is zero, leaving the reference list uninitialized */
+ for(r_idx = 0; r_idx < MAX_DPB_SIZE; r_idx++)
+ {
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = NULL;
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = NULL;
+
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = NULL;
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = NULL;
+ }
+
+#endif
+ }
+ else
+ {
+ WORD32 ret;
+ ret = ihevcd_ref_list(ps_codec, ps_pps, ps_sps, ps_slice_hdr);
+
+ if(IHEVCD_REF_PIC_NOT_FOUND == ret)
+ return IHEVCD_IGNORE_SLICE;
+ }
+
+ }
+
+ /* Fill the remaining entries of the reference lists with the nearest POC.
+ * This is done to handle cases where there is a corruption in the reference index */
+ if(ps_codec->i4_pic_present)
+ {
+ pic_buf_t *ps_pic_buf_ref;
+ mv_buf_t *ps_mv_buf_ref;
+ WORD32 r_idx;
+ dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+ buf_mgr_t *ps_mv_buf_mgr = (buf_mgr_t *)ps_codec->pv_mv_buf_mgr;
+
+ ps_pic_buf_ref = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ps_slice_hdr->i4_abs_pic_order_cnt);
+ if(NULL == ps_pic_buf_ref)
+ {
+ ps_pic_buf_ref = ps_codec->as_process[0].ps_cur_pic;
+ ps_mv_buf_ref = ps_codec->s_parse.ps_cur_mv_buf;
+ }
+ else
+ {
+ ps_mv_buf_ref = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf_ref->i4_abs_poc);
+ }
+
+ for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx++)
+ {
+ if(NULL == ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf)
+ {
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+ }
+
+ for(r_idx = ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx < MAX_DPB_SIZE; r_idx++)
+ {
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+
+ for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx++)
+ {
+ if(NULL == ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf)
+ {
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+ }
+
+ for(r_idx = ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx < MAX_DPB_SIZE; r_idx++)
+ {
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+ }
+
+ /* Update slice address in the header */
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ ps_slice_hdr->i2_ctb_x = slice_address % ps_sps->i2_pic_wd_in_ctb;
+ ps_slice_hdr->i2_ctb_y = slice_address / ps_sps->i2_pic_wd_in_ctb;
+
+ if(!ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ ps_slice_hdr->i2_independent_ctb_x = ps_slice_hdr->i2_ctb_x;
+ ps_slice_hdr->i2_independent_ctb_y = ps_slice_hdr->i2_ctb_y;
+ }
+ }
+ else
+ {
+ ps_slice_hdr->i2_ctb_x = 0;
+ ps_slice_hdr->i2_ctb_y = 0;
+
+ ps_slice_hdr->i2_independent_ctb_x = 0;
+ ps_slice_hdr->i2_independent_ctb_y = 0;
+ }
+
+ /* If the first slice in the pic is missing, copy the current slice header to
+ * the first slice's header */
+ if((!first_slice_in_pic_flag) &&
+ (0 == ps_codec->i4_pic_present))
+ {
+ slice_header_t *ps_slice_hdr_prev = ps_codec->s_parse.ps_slice_hdr_base;
+ ihevcd_copy_slice_hdr(ps_codec, 0, (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1)));
+
+ ps_codec->i4_slice_error = 1;
+
+ ps_slice_hdr_prev->i2_ctb_x = 0;
+ ps_slice_hdr_prev->i2_ctb_y = 0;
+
+ ps_codec->s_parse.i4_ctb_x = 0;
+ ps_codec->s_parse.i4_ctb_y = 0;
+
+ ps_codec->s_parse.i4_cur_slice_idx = 0;
+
+ if((ps_slice_hdr->i2_ctb_x == 0) &&
+ (ps_slice_hdr->i2_ctb_y == 0))
+ {
+ ps_slice_hdr->i2_ctb_x++;
+ }
+ }
+
+ {
+ /* If skip B is enabled,
+ * ignore pictures that are non-reference.
+ * TODO: (i1_nal_unit_type < NAL_BLA_W_LP) && (i1_nal_unit_type % 2 == 0) only
+ * indicates a sub-layer non-reference slice. A way to detect actual
+ * non-reference pictures may be needed */
+
+ if((i1_nal_unit_type < NAL_BLA_W_LP) &&
+ (i1_nal_unit_type % 2 == 0))
+ {
+ if(IVD_SKIP_B == ps_codec->e_pic_skip_mode)
+ return IHEVCD_IGNORE_SLICE;
+ }
+
+ /* If skip PB is enabled,
+ * decode only I slices */
+ if((IVD_SKIP_PB == ps_codec->e_pic_skip_mode) &&
+ (ISLICE != ps_slice_hdr->i1_slice_type))
+ {
+ return IHEVCD_IGNORE_SLICE;
+ }
+ }
+
+ return ret;
+}
diff --git a/decoder/ihevcd_parse_slice_header.h b/decoder/ihevcd_parse_slice_header.h
new file mode 100644
index 0000000..6f085b7
--- /dev/null
+++ b/decoder/ihevcd_parse_slice_header.h
@@ -0,0 +1,53 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_parse_slice_header.h
+*
+* @brief
+* Parsing of slice header
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PARSE_SLICE_HEADER_H_
+#define _IHEVCD_PARSE_SLICE_HEADER_H_
+
+IHEVCD_ERROR_T ihevcd_short_term_ref_pic_set(bitstrm_t *ps_bitstrm,
+ stref_picset_t *ps_stref_picset_base,
+ WORD32 num_short_term_ref_pic_sets,
+ WORD32 idx,
+ stref_picset_t *ps_stref_picset);
+
+WORD32 ihevcd_parse_pred_wt_ofst(bitstrm_t *ps_bitstrm,
+ sps_t *ps_sps,
+ pps_t *ps_pps,
+ slice_header_t *ps_slice_hdr);
+
+WORD32 ihevcd_calc_poc(codec_t *ps_codec, nal_header_t *ps_nal, WORD8 i1_log2_max_poc_lsb, WORD32 i2_poc_lsb);
+
+
+
+#endif /* _IHEVCD_PARSE_SLICE_HEADER_H_ */
diff --git a/decoder/ihevcd_process_slice.c b/decoder/ihevcd_process_slice.c
new file mode 100644
index 0000000..83aed05
--- /dev/null
+++ b/decoder/ihevcd_process_slice.c
@@ -0,0 +1,1738 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_process_slice.c
+ *
+ * @brief
+ * Contains functions for processing slice data
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_padding.h"
+#include "ihevc_iquant_itrans_recon.h"
+#include "ihevc_chroma_iquant_itrans_recon.h"
+#include "ihevc_recon.h"
+#include "ihevc_chroma_recon.h"
+#include "ihevc_iquant_recon.h"
+#include "ihevc_chroma_iquant_recon.h"
+#include "ihevc_intra_pred.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+#include "ihevc_quant_tables.h"
+#include "ihevcd_common_tables.h"
+
+#include "ihevcd_profile.h"
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_get_mv.h"
+#include "ihevcd_inter_pred.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_boundary_strength.h"
+#include "ihevcd_deblk.h"
+#include "ihevcd_fmt_conv.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#include "ihevcd_sao.h"
+
+IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
+ process_ctxt_t *ps_proc,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_u_dst,
+ UWORD8 *pu1_v_dst,
+ WORD32 cur_row,
+ WORD32 num_rows);
+
+typedef enum
+{
+ PROC_ALL,
+ PROC_INTER_PRED,
+ PROC_RECON,
+ PROC_DEBLK,
+ PROC_SAO
+}proc_type_t;
+
+void ihevcd_proc_map_check(process_ctxt_t *ps_proc, proc_type_t proc_type, WORD32 nctb)
+{
+ tile_t *ps_tile = ps_proc->ps_tile;
+ sps_t *ps_sps = ps_proc->ps_sps;
+ pps_t *ps_pps = ps_proc->ps_pps;
+ codec_t *ps_codec = ps_proc->ps_codec;
+ WORD32 idx;
+ WORD32 nop_cnt;
+ WORD32 bit_pos = proc_type;
+ WORD32 bit_mask = (1 << bit_pos);
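+ /* The proc map holds one status byte per CTB with one bit per processing
+ * stage (bit 0: PROC_ALL ... bit 4: PROC_SAO); a set bit means that stage is
+ * complete for that CTB, and this function waits on the bit for proc_type */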
+
+ if(ps_proc->i4_check_proc_status)
+ {
+ nop_cnt = PROC_NOP_CNT;
+ while(1)
+ {
+ volatile UWORD8 *pu1_buf;
+ volatile WORD32 status;
+ status = 1;
+ /* Check if all dependencies for the next nCTBs are met */
+ {
+ WORD32 x_pos;
+
+ {
+ /* Check if the top-right CTBs of the next nCTB group have been processed */
+ if(ps_proc->i4_ctb_y > 0)
+ {
+ x_pos = (ps_proc->i4_ctb_tile_x + nctb);
+ idx = MIN(x_pos, (ps_tile->u2_wd - 1));
+
+ /* Check if the top-right CTB of the last CTB in the nCTB group is within the tile */
+ {
+ idx += ps_tile->u1_pos_x;
+ idx += ((ps_proc->i4_ctb_y - 1)
+ * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+ pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+ status = *pu1_buf & bit_mask;
+ }
+ }
+ }
+
+ /* If tiles are enabled, then test left and top-left as well */
+ ps_pps = ps_proc->ps_pps;
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /*Check if left ctb is processed*/
+ if((ps_proc->i4_ctb_x > 0) && ((0 != status)))
+ {
+ x_pos = ps_tile->u1_pos_x + ps_proc->i4_ctb_tile_x - 1;
+ idx = x_pos + (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+ pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+ status = *pu1_buf & bit_mask;
+ }
+
+ /*Check if top left ctb is processed*/
+ if((ps_proc->i4_ctb_x > 0) && (0 != status) && (ps_proc->i4_ctb_y > 0))
+ {
+ x_pos = ps_tile->u1_pos_x + ps_proc->i4_ctb_tile_x - 1;
+ idx = x_pos + ((ps_proc->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+ pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+ status = *pu1_buf & bit_mask;
+ }
+ }
+ }
+
+ if(status)
+ break;
+
+ /* If dependencies are not met, then wait for a few cycles.
+ * If the dependencies are still not met after a few iterations, then yield
+ */
+ if(nop_cnt > 0)
+ {
+ NOP(128);
+ nop_cnt -= 128;
+ }
+ else
+ {
+ nop_cnt = PROC_NOP_CNT;
+ ithread_yield();
+ //NOP(128 * 16);
+ }
+ }
+ }
+}
+
+void ihevcd_proc_map_update(process_ctxt_t *ps_proc, proc_type_t proc_type, WORD32 nctb)
+{
+ codec_t *ps_codec = ps_proc->ps_codec;
+ WORD32 i, idx;
+ WORD32 bit_pos = proc_type;
+ WORD32 bit_mask = (1 << bit_pos);
+
+ /* Update the current CTBs processing status */
+ if(ps_proc->i4_check_proc_status)
+ {
+ for(i = 0; i < nctb; i++)
+ {
+ sps_t *ps_sps = ps_proc->ps_sps;
+ UWORD8 *pu1_buf;
+ idx = (ps_proc->i4_ctb_x + i);
+ idx += ((ps_proc->i4_ctb_y) * ps_sps->i2_pic_wd_in_ctb);
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ pu1_buf = (ps_proc->pu1_proc_map + idx);
+#else
+ pu1_buf = (ps_codec->pu1_proc_map + idx);
+#endif
+ *pu1_buf = *pu1_buf | bit_mask;
+ }
+ }
+}
+
+
+void ihevcd_slice_hdr_update(process_ctxt_t *ps_proc)
+{
+
+ /* Slice x and y are initialized in proc_init. But the slice x and y counts
+ * are also initialized here, since a new slice can begin in the middle of a
+ * row and proc_init is invoked only at the beginning of each row */
+ if(!((ps_proc->i4_ctb_x == 0) && (ps_proc->i4_ctb_y == 0)))
+ {
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ slice_header_t *ps_slice_hdr_next = ps_proc->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+#else
+ slice_header_t *ps_slice_hdr_next = ps_proc->ps_codec->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
+#endif
+
+ if((ps_slice_hdr_next->i2_ctb_x == ps_proc->i4_ctb_x)
+ && (ps_slice_hdr_next->i2_ctb_y == ps_proc->i4_ctb_y))
+ {
+ if(0 == ps_slice_hdr_next->i1_dependent_slice_flag)
+ {
+ ps_proc->i4_ctb_slice_x = 0;
+ ps_proc->i4_ctb_slice_y = 0;
+ }
+
+ ps_proc->i4_cur_slice_idx++;
+ ps_proc->ps_slice_hdr = ps_slice_hdr_next;
+ }
+
+ }
+}
+
+void ihevcd_ctb_pos_update(process_ctxt_t *ps_proc, WORD32 nctb)
+{
+ WORD32 tile_start_ctb_idx, slice_start_ctb_idx;
+ slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+ tile_t *ps_tile = ps_proc->ps_tile;
+ sps_t *ps_sps = ps_proc->ps_sps;
+
+ /* Update x and y positions */
+ ps_proc->i4_ctb_tile_x += nctb;
+ ps_proc->i4_ctb_x += nctb;
+
+ ps_proc->i4_ctb_slice_x += nctb;
+ /* If tiles are enabled, then handle the tile and slice counters differently */
+ if(ps_proc->ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Update slice counters*/
+ slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+ /*
+ * There are two cases where the slice counters must be handled differently:
+ * 1 - Multiple tiles span a single slice (or one of many slices).
+ * 2 - Multiple slices span a single tile (or one of many tiles).
+ */
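+ /* Illustration: for a tile 10 CTBs wide, once i4_ctb_slice_x steps past the
+ * tile's right edge, the slice-relative position wraps to the next CTB row:
+ * i4_ctb_slice_y is incremented and the tile width is subtracted */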
+
+ /*Case 1 */
+ if(slice_start_ctb_idx < tile_start_ctb_idx)
+ {
+ /*End of tile row*/
+ if(ps_proc->i4_ctb_x > ps_slice_hdr->i2_ctb_x)
+ {
+ if(ps_proc->i4_ctb_slice_x >= (ps_tile->u2_wd + ps_tile->u1_pos_x))
+ {
+ ps_proc->i4_ctb_slice_y++;
+ ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
+ - ps_tile->u2_wd;
+ }
+ }
+ else
+ {
+ WORD32 temp_stride = (ps_sps->i2_pic_wd_in_ctb - ps_slice_hdr->i2_ctb_x);
+ if(ps_proc->i4_ctb_slice_x >= (temp_stride + ps_tile->u2_wd + ps_tile->u1_pos_x))
+ {
+ ps_proc->i4_ctb_slice_y++;
+ ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
+ - ps_tile->u2_wd;
+ }
+ }
+ }
+ /*Case 2*/
+ else if(ps_proc->i4_ctb_slice_x >= (ps_tile->u2_wd))
+ {
+ /*End of tile row*/
+ ps_proc->i4_ctb_slice_y++;
+ ps_proc->i4_ctb_slice_x = 0;
+ }
+ }
+ else
+ {
+ if(ps_proc->i4_ctb_slice_x >= ps_tile->u2_wd)
+ {
+ ps_proc->i4_ctb_slice_y++;
+ ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
+ - ps_tile->u2_wd;
+ }
+ }
+}
+
+void ihevcd_ctb_avail_update(process_ctxt_t *ps_proc)
+{
+ slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+ sps_t *ps_sps = ps_proc->ps_sps;
+ tile_t *ps_tile_prev;
+ tile_t *ps_tile = ps_proc->ps_tile;
+ WORD32 cur_pu_idx;
+ WORD32 tile_start_ctb_idx, slice_start_ctb_idx;
+ WORD16 i2_wd_in_ctb;
+ WORD32 continuous_tiles = 0;
+ WORD32 cur_ctb_idx;
+ WORD32 check_tile_wd;
+
+ if((0 != ps_tile->u1_pos_x) && (0 != ps_tile->u1_pos_y))
+ {
+ ps_tile_prev = ps_tile - 1;
+ }
+ else
+ {
+ ps_tile_prev = ps_tile;
+ }
+
+
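+ /* Heuristic (as inferred from the condition below): the slice and the
+ * current tile are treated as continuous unless the slice starts exactly at
+ * the tile's x position, or the previous tile's width carries the slice
+ * start (wrapped at the picture width) to the tile's x position */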
+ check_tile_wd = ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd;
+ if(!(((check_tile_wd >= ps_sps->i2_pic_wd_in_ctb) && (check_tile_wd % ps_sps->i2_pic_wd_in_ctb == ps_tile->u1_pos_x))
+ || ((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x))))
+ {
+ continuous_tiles = 1;
+ }
+
+ slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+
+ if((slice_start_ctb_idx < tile_start_ctb_idx) && (continuous_tiles))
+ {
+ //Slices span across multiple tiles.
+ i2_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
+ }
+ else
+ {
+ i2_wd_in_ctb = ps_tile->u2_wd;
+ }
+ cur_ctb_idx = ps_proc->i4_ctb_x
+ + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ /* Ctb level availability */
+ /* Bottom left will not be available at a CTB level, no need to pass this */
+ ps_proc->u1_top_ctb_avail = 1;
+ ps_proc->u1_left_ctb_avail = 1;
+ ps_proc->u1_top_lt_ctb_avail = 1;
+ ps_proc->u1_top_rt_ctb_avail = 1;
+ /* slice and tile boundaries */
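+ /* The availability flags initialized above are cleared below whenever the
+ * corresponding neighbour falls outside the picture, outside the current
+ * tile, or before the start of the current slice */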
+
+ if((0 == ps_proc->i4_ctb_y) || (0 == ps_proc->i4_ctb_tile_y))
+ {
+ ps_proc->u1_top_ctb_avail = 0;
+ ps_proc->u1_top_lt_ctb_avail = 0;
+ ps_proc->u1_top_rt_ctb_avail = 0;
+ }
+
+ if((0 == ps_proc->i4_ctb_x) || (0 == ps_proc->i4_ctb_tile_x))
+ {
+ ps_proc->u1_left_ctb_avail = 0;
+ ps_proc->u1_top_lt_ctb_avail = 0;
+ if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
+ {
+ ps_proc->u1_top_ctb_avail = 0;
+ if((i2_wd_in_ctb - 1) != ps_proc->i4_ctb_slice_x)
+ {
+ ps_proc->u1_top_rt_ctb_avail = 0;
+ }
+ }
+ }
+ /*For slices not beginning at start of a ctb row*/
+ else if(ps_proc->i4_ctb_x > 0)
+ {
+ if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
+ {
+ ps_proc->u1_top_ctb_avail = 0;
+ ps_proc->u1_top_lt_ctb_avail = 0;
+ if(0 == ps_proc->i4_ctb_slice_x)
+ {
+ ps_proc->u1_left_ctb_avail = 0;
+ }
+ if((i2_wd_in_ctb - 1) != ps_proc->i4_ctb_slice_x)
+ {
+ ps_proc->u1_top_rt_ctb_avail = 0;
+ }
+ }
+ else if((1 == ps_proc->i4_ctb_slice_y) && (0 == ps_proc->i4_ctb_slice_x))
+ {
+ ps_proc->u1_top_lt_ctb_avail = 0;
+ }
+ }
+
+ if((ps_proc->i4_ctb_x == (ps_sps->i2_pic_wd_in_ctb - 1)) || ((ps_tile->u2_wd - 1) == ps_proc->i4_ctb_tile_x))
+ {
+ ps_proc->u1_top_rt_ctb_avail = 0;
+ }
+
+
+ {
+ WORD32 next_ctb_idx;
+ next_ctb_idx = cur_ctb_idx + 1;
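+ /* By default the next CTB is the raster-scan neighbour; the cases
+ * below override this at tile boundaries, where decoding jumps to the
+ * next row of the same tile or to the start of the next tile. */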
+
+ if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
+ {
+ if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+ {
+ //Last tile
+ if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
+ {
+ next_ctb_idx = cur_ctb_idx + 1;
+ }
+ else //Not last tile, but new tile
+ {
+ tile_t *ps_tile_next = ps_proc->ps_tile + 1;
+ next_ctb_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+ }
+ }
+ else //End of each tile row
+ {
+ next_ctb_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
+ }
+ }
+ ps_proc->i4_next_pu_ctb_cnt = next_ctb_idx;
+ ps_proc->i4_ctb_pu_cnt =
+ ps_proc->pu4_pic_pu_idx[next_ctb_idx]
+ - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+ cur_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+ ps_proc->i4_ctb_start_pu_idx = cur_pu_idx;
+ ps_proc->ps_pu = &ps_proc->ps_pic_pu[cur_pu_idx];
+ }
+}
+
+void ihevcd_update_ctb_tu_cnt(process_ctxt_t *ps_proc)
+{
+ sps_t *ps_sps = ps_proc->ps_sps;
+ codec_t *ps_codec = ps_proc->ps_codec;
+ WORD32 cur_ctb_idx;
+
+ cur_ctb_idx = ps_proc->i4_ctb_x
+ + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+
+ {
+ tile_t *ps_tile;
+ WORD32 next_ctb_tu_idx;
+ ps_tile = ps_proc->ps_tile;
+
+
+ if(1 == ps_codec->i4_num_cores)
+ {
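+ /* With a single core, the TU buffer holds only RESET_TU_BUF_NCTB
+ * CTBs worth of TUs and is reused, so CTB indices into
+ * pu4_pic_tu_idx are taken modulo RESET_TU_BUF_NCTB. */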
+ next_ctb_tu_idx = cur_ctb_idx % RESET_TU_BUF_NCTB + 1;
+ if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
+ {
+ if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+ {
+ //Last tile
+ if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
+ {
+ next_ctb_tu_idx = (cur_ctb_idx % RESET_TU_BUF_NCTB) + 1;
+ }
+ else //Not last tile, but new tile
+ {
+ tile_t *ps_tile_next = ps_proc->ps_tile + 1;
+ next_ctb_tu_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+ }
+ }
+ else //End of each tile row
+ {
+ next_ctb_tu_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
+ }
+ }
+ ps_proc->i4_next_tu_ctb_cnt = next_ctb_tu_idx;
+ ps_proc->i4_ctb_tu_cnt = ps_proc->pu4_pic_tu_idx[next_ctb_tu_idx] - ps_proc->pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
+ }
+ else
+ {
+ next_ctb_tu_idx = cur_ctb_idx + 1;
+ if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
+ {
+ if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
+ {
+ //Last tile
+ if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
+ {
+ next_ctb_tu_idx = (cur_ctb_idx % RESET_TU_BUF_NCTB) + 1;
+ }
+ else //Not last tile, but new tile
+ {
+ tile_t *ps_tile_next = ps_proc->ps_tile + 1;
+ next_ctb_tu_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+ }
+ }
+ else //End of each tile row
+ {
+ next_ctb_tu_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
+ }
+ }
+ ps_proc->i4_next_tu_ctb_cnt = next_ctb_tu_idx;
+ ps_proc->i4_ctb_tu_cnt = ps_proc->pu4_pic_tu_idx[next_ctb_tu_idx] -
+ ps_proc->pu4_pic_tu_idx[cur_ctb_idx];
+ }
+ }
+}
+
+IHEVCD_ERROR_T ihevcd_process(process_ctxt_t *ps_proc)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ codec_t *ps_codec;
+ sps_t *ps_sps = ps_proc->ps_sps;
+
+ WORD32 nctb;
+ WORD32 i;
+ WORD32 idx;
+ WORD32 nop_cnt;
+ WORD32 num_minpu_in_ctb;
+ WORD32 cur_slice_idx, cur_ctb_tile_x, cur_ctb_slice_x, cur_ctb_tile_y, cur_ctb_slice_y;
+ WORD32 nxt_ctb_slice_y, nxt_ctb_slice_x;
+ tu_t *ps_tu_cur, *ps_tu_nxt;
+ UWORD8 *pu1_pu_map_cur, *pu1_pu_map_nxt;
+ WORD32 num_ctb, num_ctb_tmp;
+ proc_type_t proc_type;
+
+
+ WORD32 ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+
+ PROFILE_DISABLE_PROCESS_CTB();
+
+ ps_codec = ps_proc->ps_codec;
+ num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+
+ nctb = MIN(ps_codec->i4_proc_nctb, ps_proc->i4_ctb_cnt);
+ nctb = MIN(nctb, (ps_proc->ps_tile->u2_wd - ps_proc->i4_ctb_tile_x));
+
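+ /* When the current slice index nears the end of the (wrapping) slice
+ * header buffer, process one CTB at a time; this keeps slice
+ * transitions tracked at CTB granularity while the headers wrap
+ * around. */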
+ if(ps_proc->i4_cur_slice_idx > (MAX_SLICE_HDR_CNT - 2 * ps_sps->i2_pic_wd_in_ctb))
+ {
+ num_ctb = 1;
+ }
+ else
+ {
+ num_ctb = ps_proc->i4_nctb;
+ }
+ nxt_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+ nxt_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+ pu1_pu_map_nxt = ps_proc->pu1_pu_map;
+ ps_tu_nxt = ps_proc->ps_tu;
+
+ while(ps_proc->i4_ctb_cnt)
+ {
+ ps_proc->i4_ctb_slice_y = nxt_ctb_slice_y;
+ ps_proc->i4_ctb_slice_x = nxt_ctb_slice_x;
+ ps_proc->pu1_pu_map = pu1_pu_map_nxt;
+ ps_proc->ps_tu = ps_tu_nxt;
+
+ cur_ctb_tile_x = ps_proc->i4_ctb_tile_x;
+ cur_ctb_tile_y = ps_proc->i4_ctb_tile_y;
+ cur_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+ cur_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+ cur_slice_idx = ps_proc->i4_cur_slice_idx;
+ ps_tu_cur = ps_proc->ps_tu;
+ pu1_pu_map_cur = ps_proc->pu1_pu_map;
+ proc_type = PROC_INTER_PRED;
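+ /* The CTBs picked up in this iteration are run through four passes:
+ * inter prediction (with MV derivation and boundary-strength
+ * computation), reconstruction (IQ/IT and recon), deblocking, and SAO
+ * with padding. After each pass the saved counters are restored so
+ * the next pass revisits the same set of CTBs. */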
+
+ if(ps_proc->i4_ctb_cnt < num_ctb)
+ {
+ num_ctb = ps_proc->i4_ctb_cnt;
+ }
+#ifdef GPU_BUILD
+ num_ctb = MIN(num_ctb, (ps_proc->ps_tile->u2_wd - ps_proc->i4_ctb_tile_x));
+#endif
+ num_ctb_tmp = num_ctb;
+
+ while(num_ctb_tmp)
+ {
+ slice_header_t *ps_slice_hdr;
+ tile_t *ps_tile = ps_proc->ps_tile;
+
+ /* Waiting for Parsing to be done*/
+ {
+
+
+ nop_cnt = PROC_NOP_CNT;
+ if(ps_proc->i4_check_parse_status || ps_proc->i4_check_proc_status)
+ {
+ while(1)
+ {
+ volatile UWORD8 *pu1_buf;
+ volatile WORD32 status;
+ status = 1;
+#ifdef GPU_BUILD
+ /* If GPU is enabled, don't check the parsing status,
+ * since processing starts only after waiting for MC,
+ * which means parsing is done. */
+ //TODO GPU : Also remove the flag being updated in parsing
+#endif
+ /* Check if all dependencies for the next nCTBs are met */
+#ifndef GPU_BUILD
+ /* Check if the next nCTBs are parsed */
+ if(ps_proc->i4_check_parse_status)
+ {
+ idx = (ps_proc->i4_ctb_x + nctb - 1);
+ idx += (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ pu1_buf = (ps_codec->pu1_parse_map + idx);
+ status = *pu1_buf;
+ }
+#endif
+
+ if(status)
+ break;
+
+ /* If dependencies are not met, wait for a few cycles.
+ * If they are still not met after a few iterations, yield.
+ */
+ if(nop_cnt > 0)
+ {
+ NOP(128);
+ nop_cnt -= 128;
+ }
+ else
+ {
+ nop_cnt = PROC_NOP_CNT;
+ ithread_yield();
+ }
+ }
+ }
+ }
+
+ /* Check proc map to ensure dependencies for recon are met */
+ ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+ ihevcd_slice_hdr_update(ps_proc);
+ ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+ //ihevcd_mv_prediction();
+ //ihevcd_lvl_unpack();
+ //ihevcd_inter_iq_it_recon();
+ //Following does prediction, iq, it and recon on a TU by TU basis for intra TUs
+ //ihevcd_intra_process();
+ //ihevcd_ctb_boundary_strength_islice(ps_proc, ctb_size);
+ //ihevcd_deblk_ctb(ps_proc);
+
+ /* iq,it recon of Intra TU */
+ {
+ UWORD32 *pu4_ctb_top_pu_idx, *pu4_ctb_left_pu_idx, *pu4_ctb_top_left_pu_idx;
+ WORD32 cur_ctb_idx;
+
+ ihevcd_ctb_avail_update(ps_proc);
+
+#if DEBUG_DUMP_FRAME_BUFFERS_INFO
+ au1_pic_avail_ctb_flags[ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb] =
+ ((ps_proc->u1_top_ctb_avail << 3) | (ps_proc->u1_left_ctb_avail << 2) | (ps_proc->u1_top_lt_ctb_avail << 1) | (ps_proc->u1_top_rt_ctb_avail));
+ au4_pic_ctb_slice_xy[ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb] =
+ (((UWORD16)ps_proc->i4_ctb_slice_x << 16) | ((UWORD16)ps_proc->i4_ctb_slice_y << 16));
+#endif
+
+ /*************************************************/
+ /**************** MV pred **********************/
+ /*************************************************/
+ if(PSLICE == ps_slice_hdr->i1_slice_type
+ || BSLICE == ps_slice_hdr->i1_slice_type)
+ {
+ mv_ctxt_t s_mv_ctxt;
+
+ pu4_ctb_top_pu_idx = ps_proc->pu4_pic_pu_idx_top
+ + (ps_proc->i4_ctb_x * ctb_size / MIN_PU_SIZE);
+ pu4_ctb_left_pu_idx = ps_proc->pu4_pic_pu_idx_left;
+ pu4_ctb_top_left_pu_idx = &ps_proc->u4_ctb_top_left_pu_idx;
+
+ /* Initializing s_mv_ctxt */
+ if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
+ {
+ s_mv_ctxt.ps_pps = ps_proc->ps_pps;
+ s_mv_ctxt.ps_sps = ps_proc->ps_sps;
+ s_mv_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+ s_mv_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+ s_mv_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+ s_mv_ctxt.ps_pu = ps_proc->ps_pu;
+ s_mv_ctxt.ps_pic_pu = ps_proc->ps_pic_pu;
+ s_mv_ctxt.ps_tile = ps_tile;
+ s_mv_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+ s_mv_ctxt.pu4_pic_pu_idx = ps_proc->pu4_pic_pu_idx;
+ s_mv_ctxt.pu1_pic_pu_map = ps_proc->pu1_pic_pu_map;
+ s_mv_ctxt.i4_ctb_pu_cnt = ps_proc->i4_ctb_pu_cnt;
+ s_mv_ctxt.i4_ctb_start_pu_idx = ps_proc->i4_ctb_start_pu_idx;
+ s_mv_ctxt.u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
+ s_mv_ctxt.u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
+ s_mv_ctxt.u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
+ s_mv_ctxt.u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
+
+ ihevcd_get_mv_ctb(&s_mv_ctxt, pu4_ctb_top_pu_idx,
+ pu4_ctb_left_pu_idx, pu4_ctb_top_left_pu_idx);
+ }
+
+ ihevcd_inter_pred_ctb(ps_proc);
+ }
+ else if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
+ {
+ WORD32 next_ctb_idx, num_pu_per_ctb, ctb_start_pu_idx, pu_cnt;
+ pu_t *ps_pu;
+ WORD32 num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+ UWORD8 *pu1_pic_pu_map_ctb = ps_proc->pu1_pic_pu_map +
+ (ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
+ WORD32 row, col;
+ UWORD32 *pu4_nbr_pu_idx = ps_proc->pu4_pic_pu_idx_map;
+ WORD32 nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
+
+ for(row = 0; row < ctb_size / MIN_PU_SIZE; row++)
+ {
+ for(col = 0; col < ctb_size / MIN_PU_SIZE; col++)
+ {
+ pu1_pic_pu_map_ctb[row * ctb_size / MIN_PU_SIZE + col] = 0;
+ }
+ }
+ /* Neighbor PU idx update inside CTB */
+ /* 1byte per 4x4. Indicates the PU idx that 4x4 block belongs to */
+
+ cur_ctb_idx = ps_proc->i4_ctb_x
+ + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ next_ctb_idx = ps_proc->i4_next_pu_ctb_cnt;
+ num_pu_per_ctb = ps_proc->pu4_pic_pu_idx[next_ctb_idx]
+ - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+ ctb_start_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
+ ps_pu = &ps_proc->ps_pic_pu[ctb_start_pu_idx];
+
+ for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
+ {
+ UWORD32 cur_pu_idx;
+ WORD32 pu_ht = (ps_pu->b4_ht + 1) << 2;
+ WORD32 pu_wd = (ps_pu->b4_wd + 1) << 2;
+
+ cur_pu_idx = ctb_start_pu_idx + pu_cnt;
+
+ for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
+ for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
+ pu4_nbr_pu_idx[(1 + ps_pu->b4_pos_x + col)
+ + (1 + ps_pu->b4_pos_y + row)
+ * nbr_pu_idx_strd] =
+ cur_pu_idx;
+ }
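+ /* pu4_nbr_pu_idx is laid out with a one-entry border on the top and
+ * left (hence the +1 offsets and the stride of
+ * MAX_CTB_SIZE / MIN_PU_SIZE + 2), so that the last filled row and
+ * column can be copied out below as the top and left neighbours of
+ * the next CTB. */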
+
+ /* Updating Top and Left pointers */
+ {
+ WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
+ - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
+ WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
+
+ /* Top Left */
+ /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
+ ps_proc->u4_ctb_top_left_pu_idx = ps_proc->pu4_pic_pu_idx_top[((ps_proc->i4_ctb_x + 1) * ctb_size / MIN_PU_SIZE) - 1];
+ for(i = 0; i < ctb_size / MIN_PU_SIZE; i++)
+ {
+ /* Left */
+ /* Last column of au4_nbr_pu_idx */
+ ps_proc->pu4_pic_pu_idx_left[i] =
+ pu4_nbr_pu_idx[(ctb_size / MIN_PU_SIZE) + (i + 1) * nbr_pu_idx_strd];
+ /* Top */
+ /* Last row of au4_nbr_pu_idx */
+ ps_proc->pu4_pic_pu_idx_top[(ps_proc->i4_ctb_x * ctb_size / MIN_PU_SIZE) + i] =
+ pu4_nbr_pu_idx[(ctb_size_left / MIN_PU_SIZE) * nbr_pu_idx_strd + i + 1];
+
+ }
+ }
+ }
+ }
+
+ if(ps_proc->ps_pps->i1_tiles_enabled_flag)
+ {
+ /*Update the tile index buffer with tile information for the current ctb*/
+ UWORD16 *pu1_tile_idx = ps_proc->pu1_tile_idx;
+ pu1_tile_idx[(ps_proc->i4_ctb_x + (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb))]
+ = ps_proc->i4_cur_tile_idx;
+ }
+
+ /*************************************************/
+ /*********** BS, QP and Deblocking **************/
+ /*************************************************/
+ /* Boundary strength call has to be after IQ IT recon since QP population needs ps_proc->i4_qp_const_inc_ctb flag */
+
+ {
+ slice_header_t *ps_slice_hdr;
+ ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+
+ /* Check if deblock is disabled for the current slice or if it is disabled for the current picture
+ * because of disable deblock api
+ */
+ if(0 == ps_codec->i4_disable_deblk_pic)
+ {
+ if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
+ {
+ if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
+ (0 == ps_codec->i4_slice_error))
+ {
+ ihevcd_update_ctb_tu_cnt(ps_proc);
+ ps_proc->s_bs_ctxt.ps_pps = ps_proc->ps_pps;
+ ps_proc->s_bs_ctxt.ps_sps = ps_proc->ps_sps;
+ ps_proc->s_bs_ctxt.ps_codec = ps_proc->ps_codec;
+ ps_proc->s_bs_ctxt.i4_ctb_tu_cnt = ps_proc->i4_ctb_tu_cnt;
+ ps_proc->s_bs_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+ ps_proc->s_bs_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+ ps_proc->s_bs_ctxt.i4_ctb_tile_x = ps_proc->i4_ctb_tile_x;
+ ps_proc->s_bs_ctxt.i4_ctb_tile_y = ps_proc->i4_ctb_tile_y;
+ ps_proc->s_bs_ctxt.i4_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+ ps_proc->s_bs_ctxt.i4_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+ ps_proc->s_bs_ctxt.ps_tu = ps_proc->ps_tu;
+ ps_proc->s_bs_ctxt.ps_pu = ps_proc->ps_pu;
+ ps_proc->s_bs_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
+ ps_proc->s_bs_ctxt.i4_next_pu_ctb_cnt = ps_proc->i4_next_pu_ctb_cnt;
+ ps_proc->s_bs_ctxt.i4_next_tu_ctb_cnt = ps_proc->i4_next_tu_ctb_cnt;
+ ps_proc->s_bs_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
+ ps_proc->s_bs_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+ ps_proc->s_bs_ctxt.ps_tile = ps_proc->ps_tile;
+
+ if(ISLICE == ps_slice_hdr->i1_slice_type)
+ {
+ ihevcd_ctb_boundary_strength_islice(&ps_proc->s_bs_ctxt);
+ }
+ else
+ {
+ ihevcd_ctb_boundary_strength_pbslice(&ps_proc->s_bs_ctxt);
+ }
+ }
+ else
+ {
+ WORD32 vert_bs_strd = ps_sps->i2_pic_wd_in_ctb * (ctb_size * ctb_size / 8 / 16);
+ WORD32 horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
+ UWORD32 *pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_proc->s_bs_ctxt.pu4_pic_vert_bs +
+ ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+ ps_proc->i4_ctb_y * vert_bs_strd);
+ UWORD32 *pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_proc->s_bs_ctxt.pu4_pic_horz_bs +
+ ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
+ ps_proc->i4_ctb_y * horz_bs_strd);
+
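+ /* When deblocking is skipped for this slice (or the slice is in
+ * error), the boundary strengths are zeroed instead of computed, so
+ * a later deblock pass leaves these edges untouched. Each CTB holds
+ * 2 bits of BS per 4-sample edge segment, i.e. (ctb_size / 8) *
+ * (ctb_size / 4) segments, which works out to
+ * ctb_size * ctb_size / 8 / 16 bytes per CTB. */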
+ memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
+ memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
+
+ }
+ }
+ }
+ }
+
+ /* Per CTB update the following */
+ {
+ WORD32 cur_ctb_idx = ps_proc->i4_ctb_x
+ + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ cur_ctb_idx++;
+
+ ps_proc->pu1_pu_map += nctb * num_minpu_in_ctb;
+ ps_proc->ps_tu += ps_proc->i4_ctb_tu_cnt;
+ if((1 == ps_codec->i4_num_cores) &&
+ (0 == cur_ctb_idx % RESET_TU_BUF_NCTB))
+ {
+ ps_proc->ps_tu = ps_proc->ps_pic_tu;
+ }
+ ps_proc->ps_pu += ps_proc->i4_ctb_pu_cnt;
+ }
+
+ /* Update proc map for recon*/
+ ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+
+ num_ctb_tmp -= nctb;
+ ihevcd_ctb_pos_update(ps_proc, nctb);
+
+ }
+
+ if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
+ {
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#else
+ ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#endif
+ ps_proc->i4_cur_slice_idx = cur_slice_idx;
+ }
+ /* Restore the saved variables */
+ num_ctb_tmp = num_ctb;
+ ps_proc->i4_ctb_x -= num_ctb;
+ ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
+ ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
+ ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
+ ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
+ ps_proc->pu1_pu_map = pu1_pu_map_cur;
+ ps_proc->ps_tu = ps_tu_cur;
+ proc_type = PROC_RECON;
+
+ while(num_ctb_tmp)
+ {
+
+ /* Check proc map to ensure dependencies for recon are met */
+ ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+ ihevcd_slice_hdr_update(ps_proc);
+
+ {
+
+ ihevcd_ctb_avail_update(ps_proc);
+
+ /*************************************************/
+ /**************** IQ IT RECON *******************/
+ /*************************************************/
+
+ ihevcd_update_ctb_tu_cnt(ps_proc);
+
+ /* When scaling matrix is not to be used(scaling_list_enable_flag is zero in SPS),
+ * default value of 16 has to be used. Since the value is same for all sizes,
+ * same table is used for all cases.
+ */
+ if(0 == ps_sps->i1_scaling_list_enable_flag)
+ {
+ ps_proc->api2_dequant_intra_matrix[0] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_intra_matrix[1] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_intra_matrix[2] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_intra_matrix[3] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+
+ ps_proc->api2_dequant_inter_matrix[0] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_inter_matrix[1] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_inter_matrix[2] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_inter_matrix[3] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ }
+ else
+ {
+ if(0 == ps_sps->i1_sps_scaling_list_data_present_flag)
+ {
+ ps_proc->api2_dequant_intra_matrix[0] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_intra_matrix[1] =
+ (WORD16 *)gi2_intra_default_scale_mat_8x8;
+ ps_proc->api2_dequant_intra_matrix[2] =
+ (WORD16 *)gi2_intra_default_scale_mat_16x16;
+ ps_proc->api2_dequant_intra_matrix[3] =
+ (WORD16 *)gi2_intra_default_scale_mat_32x32;
+
+ ps_proc->api2_dequant_inter_matrix[0] =
+ (WORD16 *)gi2_flat_scale_mat_32x32;
+ ps_proc->api2_dequant_inter_matrix[1] =
+ (WORD16 *)gi2_inter_default_scale_mat_8x8;
+ ps_proc->api2_dequant_inter_matrix[2] =
+ (WORD16 *)gi2_inter_default_scale_mat_16x16;
+ ps_proc->api2_dequant_inter_matrix[3] =
+ (WORD16 *)gi2_inter_default_scale_mat_32x32;
+ }
+ /*TODO: Add support for custom scaling matrices */
+ }
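+ /* api2_dequant_*_matrix[0..3] hold the matrices for 4x4, 8x8, 16x16
+ * and 32x32 transform sizes; the default 4x4 scaling list is flat,
+ * so index 0 points to the flat matrix in both paths above. */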
+
+
+ /* CTB Level pointers */
+ ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
+ + (ps_proc->i4_ctb_x * ctb_size
+ + ps_proc->i4_ctb_y * ctb_size
+ * ps_codec->i4_strd);
+ ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
+ + ps_proc->i4_ctb_x * ctb_size
+ + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
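+ /* Chroma is stored interleaved (Cb/Cr) at the same stride as luma
+ * with half the number of rows, hence the stride / 2 term in the
+ * vertical offset above. */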
+#if DEBUG_PRINT_IQ_IT_RECON
+ printf("\nCTB x=%d, y=%d", ps_proc->i4_ctb_x, ps_proc->i4_ctb_y);
+ printf("\n CTB size= %d,CTB level availability: L=%d,TL=%d,TR=%d,T=%d",
+ ctb_size, ps_proc->u1_left_ctb_avail, ps_proc->u1_top_lt_ctb_avail, ps_proc->u1_top_rt_ctb_avail,
+ ps_proc->u1_top_ctb_avail);
+#endif
+
+ ihevcd_iquant_itrans_recon_ctb(ps_proc);
+ }
+
+ /* Per CTB update the following */
+ {
+ WORD32 cur_ctb_idx = ps_proc->i4_ctb_x
+ + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
+ cur_ctb_idx++;
+
+ ps_proc->pu1_pu_map += nctb * num_minpu_in_ctb;
+ ps_proc->ps_tu += ps_proc->i4_ctb_tu_cnt;
+ if((1 == ps_codec->i4_num_cores) &&
+ (0 == cur_ctb_idx % RESET_TU_BUF_NCTB))
+ {
+ ps_proc->ps_tu = ps_proc->ps_pic_tu;
+ }
+ ps_proc->ps_pu += ps_proc->i4_ctb_pu_cnt;
+ }
+
+
+ /* Update proc map for recon*/
+ ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+
+ num_ctb_tmp -= nctb;
+ ihevcd_ctb_pos_update(ps_proc, nctb);
+ }
+
+ if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
+ {
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#else
+ ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#endif
+ ps_proc->i4_cur_slice_idx = cur_slice_idx;
+ }
+ /* Restore the saved variables */
+ num_ctb_tmp = num_ctb;
+ ps_proc->i4_ctb_x -= num_ctb;
+ ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
+ ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
+ ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
+ ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
+ pu1_pu_map_nxt = ps_proc->pu1_pu_map;
+ ps_tu_nxt = ps_proc->ps_tu;
+ ps_proc->pu1_pu_map = pu1_pu_map_cur;
+ ps_proc->ps_tu = ps_tu_cur;
+ proc_type = PROC_DEBLK;
+
+ while(num_ctb_tmp)
+ {
+ slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+ /* Check proc map to ensure dependencies for deblk are met */
+ ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+ ihevcd_slice_hdr_update(ps_proc);
+ ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+ if(((0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)) &&
+ (0 == ps_codec->i4_disable_deblk_pic))
+ {
+ WORD32 i4_is_last_ctb_x = 0;
+ WORD32 i4_is_last_ctb_y = 0;
+
+ if(0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag ||
+ (ps_proc->i4_ctb_slice_x == 0) ||
+ (ps_proc->i4_ctb_slice_y == 0))
+ {
+ ps_proc->s_deblk_ctxt.ps_pps = ps_proc->ps_pps;
+ ps_proc->s_deblk_ctxt.ps_sps = ps_proc->ps_sps;
+ ps_proc->s_deblk_ctxt.ps_codec = ps_proc->ps_codec;
+ ps_proc->s_deblk_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+ ps_proc->s_deblk_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+ ps_proc->s_deblk_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+ ps_proc->s_deblk_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
+ ps_proc->s_deblk_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+ /* Populating Current CTB's no_loop_filter flags */
+ {
+ WORD32 row;
+ WORD32 log2_ctb_size = ps_sps->i1_log2_ctb_size;
+
+ /* Loop filter strd in units of num bits */
+ WORD32 loop_filter_strd = ((ps_sps->i2_pic_width_in_luma_samples + 63) >> 6) << 3;
+ /* Bit position is the current 8x8 bit offset wrt pic_no_loop_filter.
+ * bit_pos has to be a WORD32 so that when it is negative, the right shift below still yields a negative value */
+ WORD32 bit_pos = ((ps_proc->i4_ctb_y << (log2_ctb_size - 3)) - 1) * loop_filter_strd + (ps_proc->i4_ctb_x << (log2_ctb_size - 3)) - 1;
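+ /* The -1 terms shift the window one 8x8 unit up and left so that
+ * the flags of the top and left neighbours are included; for the
+ * first row or column bit_pos goes negative, and the arithmetic
+ * shift keeps the sign. */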
+
+ for(row = 0; row < (ctb_size >> 3) + 1; row++)
+ {
+ /* Go to the corresponding byte - read 32 bits and downshift */
+ ps_proc->s_deblk_ctxt.au2_ctb_no_loop_filter_flag[row] = (*(UWORD32 *)(ps_proc->pu1_pic_no_loop_filter_flag + (bit_pos >> 3))) >> (bit_pos & 7);
+ bit_pos += loop_filter_strd;
+ }
+ }
+
+ ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+
+ /* If the last CTB in the row was a complete CTB, deblocking has to be called again for the
+ * remaining pixels, since deblocking is applied on a shifted CTB structure
+ */
+ if(ps_proc->i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1)
+ {
+ WORD32 i4_is_last_ctb_x = 1;
+ WORD32 i4_is_last_ctb_y = 0;
+
+ WORD32 last_x_pos;
+ last_x_pos = (ps_sps->i2_pic_wd_in_ctb << ps_sps->i1_log2_ctb_size);
+ if(last_x_pos == ps_sps->i2_pic_width_in_luma_samples)
+ {
+ ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+ }
+ }
+
+
+ /* If the last CTB row was made of complete CTBs, deblocking has to be called again for the
+ * remaining pixels, since deblocking is applied on a shifted CTB structure
+ */
+ if(ps_proc->i4_ctb_y == ps_sps->i2_pic_ht_in_ctb - 1)
+ {
+ WORD32 i4_is_last_ctb_x = 0;
+ WORD32 i4_is_last_ctb_y = 1;
+ WORD32 last_y_pos;
+ last_y_pos = (ps_sps->i2_pic_ht_in_ctb << ps_sps->i1_log2_ctb_size);
+ if(last_y_pos == ps_sps->i2_pic_height_in_luma_samples)
+ {
+ ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
+ }
+ }
+ }
+ }
+
+ /* Update proc map for deblk*/
+ ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+
+ num_ctb_tmp -= nctb;
+ ihevcd_ctb_pos_update(ps_proc, nctb);
+ }
+
+ if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
+ {
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#else
+ ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
+#endif
+ ps_proc->i4_cur_slice_idx = cur_slice_idx;
+ }
+ /* Restore the saved variables */
+ num_ctb_tmp = num_ctb;
+ ps_proc->i4_ctb_x -= num_ctb;
+ ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
+ ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
+ ps_proc->pu1_pu_map = pu1_pu_map_cur;
+ ps_proc->ps_tu = ps_tu_cur;
+ nxt_ctb_slice_y = ps_proc->i4_ctb_slice_y;
+ nxt_ctb_slice_x = ps_proc->i4_ctb_slice_x;
+ ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
+ ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
+ proc_type = PROC_SAO;
+
+ while(num_ctb_tmp)
+ {
+ slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+ /* Check proc map to ensure dependencies for SAO are met */
+ ihevcd_proc_map_check(ps_proc, proc_type, nctb);
+
+ ihevcd_slice_hdr_update(ps_proc);
+ ps_slice_hdr = ps_proc->ps_slice_hdr;
+
+ if(0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)
+ {
+ if((0 == ps_codec->i4_disable_sao_pic) &&
+ (ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag))
+ {
+ ps_proc->s_sao_ctxt.ps_pps = ps_proc->ps_pps;
+ ps_proc->s_sao_ctxt.ps_sps = ps_proc->ps_sps;
+ ps_proc->s_sao_ctxt.ps_tile = ps_proc->ps_tile;
+ ps_proc->s_sao_ctxt.ps_codec = ps_proc->ps_codec;
+ ps_proc->s_sao_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
+ ps_proc->s_sao_ctxt.i4_cur_slice_idx = ps_proc->i4_cur_slice_idx;
+
+
+#if SAO_PROCESS_SHIFT_CTB
+ ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+ ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+ ps_proc->s_sao_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
+
+ ihevcd_sao_shift_ctb(&ps_proc->s_sao_ctxt);
+#else
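+ /* SAO filters across CTB boundaries, so without the shifted-CTB
+ * variant it is applied with a lag: once the current CTB is
+ * deblocked, SAO is run on the CTB two columns to the left and one
+ * row above, whose neighbourhood is fully deblocked. */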
+ if(ps_proc->i4_ctb_x > 1 && ps_proc->i4_ctb_y > 0)
+ {
+ ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x - 2;
+ ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
+
+ ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+ }
+
+ if(ps_sps->i2_pic_wd_in_ctb - 1 == ps_proc->i4_ctb_x && ps_proc->i4_ctb_y > 0)
+ {
+ ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x - 1;
+ ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
+
+ ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+
+ ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
+ ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
+
+ ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+
+ if(ps_sps->i2_pic_ht_in_ctb - 1 == ps_proc->i4_ctb_y)
+ {
+ WORD32 i4_ctb_x;
+ ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
+ for(i4_ctb_x = 0; i4_ctb_x < ps_sps->i2_pic_wd_in_ctb; i4_ctb_x++)
+ {
+ ps_proc->s_sao_ctxt.i4_ctb_x = i4_ctb_x;
+ ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
+ }
+ }
+ }
+#endif
+ }
+
+
+ /* Call padding if required */
+ {
+#if SAO_PROCESS_SHIFT_CTB
+
+ if(0 == ps_proc->i4_ctb_x)
+ {
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+
+ ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
+ + (ps_proc->i4_ctb_x * ctb_size
+ + ps_proc->i4_ctb_y * ctb_size
+ * ps_codec->i4_strd);
+ ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
+ + ps_proc->i4_ctb_x * ctb_size
+ + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+
+ pad_ht_luma = ctb_size;
+ pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y ? 8 : 0;
+ pad_ht_chroma = ctb_size / 2;
+ /* Pad left after 1st CTB is processed */
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_ctb_luma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_ctb_chroma - 16 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+ }
+
+ if((ps_sps->i2_pic_wd_in_ctb - 1) == ps_proc->i4_ctb_x)
+ {
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+ WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+
+ ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
+ + (ps_proc->i4_ctb_x * ctb_size
+ + ps_proc->i4_ctb_y * ctb_size
+ * ps_codec->i4_strd);
+ ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
+ + ps_proc->i4_ctb_x * ctb_size
+ + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
+
+ pad_ht_luma = ctb_size;
+ pad_ht_chroma = ctb_size / 2;
+ if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
+ {
+ pad_ht_luma += 8;
+ pad_ht_chroma += 16;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_pic_chroma + (ps_sps->i2_pic_height_in_luma_samples / 2 - 16) * ps_codec->i4_strd,
+ ps_codec->i4_strd, 16, PAD_LEFT);
+ }
+ /* Pad right after last CTB in the current row is processed */
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(ps_proc->pu1_cur_ctb_luma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(ps_proc->pu1_cur_ctb_chroma + cols_remaining - 16 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
+
+ if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
+ {
+ UWORD8 *pu1_buf;
+ /* Since SAO is shifted by 8x8, chroma padding cannot be done until the second row is processed */
+ /* Hence top padding is moved to the end of the frame; moving it to the second row also causes problems when there is only one row */
+ /* Pad top after padding left and right for the current rows, once the first CTB row is processed */
+ ihevc_pad_top(ps_proc->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
+ ihevc_pad_top(ps_proc->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
+
+ pu1_buf = ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
+ /* Pad bottom after padding left and right for the current rows */
+ ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
+
+ pu1_buf = ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
+ ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
+ }
+ }
+#else
+ if(ps_proc->i4_ctb_y > 1)
+ {
+ if(0 == ps_proc->i4_ctb_x)
+ {
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+
+ pad_ht_luma = ctb_size;
+ pad_ht_chroma = ctb_size / 2;
+ /* Pad left after 1st CTB is processed */
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_ctb_luma - 2 * ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_ctb_chroma - ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+ }
+ else if((ps_sps->i2_pic_wd_in_ctb - 1) == ps_proc->i4_ctb_x)
+ {
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+ WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
+
+ pad_ht_luma = ((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y) ? 3 * ctb_size : ctb_size;
+ pad_ht_chroma = ((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y) ? 3 * ctb_size / 2 : ctb_size / 2;
+ /* Pad right after last CTB in the current row is processed */
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(ps_proc->pu1_cur_ctb_luma + cols_remaining - 2 * ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(ps_proc->pu1_cur_ctb_chroma + cols_remaining - ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
+
+ if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
+ {
+ UWORD8 *pu1_buf;
+ WORD32 pad_ht_luma;
+ WORD32 pad_ht_chroma;
+
+ pad_ht_luma = 2 * ctb_size;
+ pad_ht_chroma = ctb_size;
+
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples - 2 * ctb_size),
+ ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2 - ctb_size),
+ ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
+
+ /* Since SAO is shifted by 8x8, chroma padding cannot be done until the second row is processed */
+ /* Hence top padding is moved to the end of the frame; moving it to the second row also causes problems when there is only one row */
+ /* Pad top after padding left and right for the current rows, once the first CTB row is processed */
+ ihevc_pad_top(ps_proc->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
+ ihevc_pad_top(ps_proc->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
+
+ pu1_buf = ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
+ /* Pad bottom after padding left and right for the current rows */
+ ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
+
+ pu1_buf = ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
+ ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
+ }
+ }
+ }
+#endif
+ }
+ }
+
+
+ /* Update proc map for SAO*/
+ ihevcd_proc_map_update(ps_proc, proc_type, nctb);
+ /* Update proc map for Completion of CTB*/
+ ihevcd_proc_map_update(ps_proc, PROC_ALL, nctb);
+ {
+ tile_t *ps_tile;
+
+ ps_tile = ps_proc->ps_tile;
+ num_ctb_tmp -= nctb;
+
+ ps_proc->i4_ctb_tile_x += nctb;
+ ps_proc->i4_ctb_x += nctb;
+
+ ps_proc->i4_ctb_slice_x += nctb;
+
+
+ /* Update tile counters */
+ if(ps_proc->i4_ctb_tile_x >= (ps_tile->u2_wd))
+ {
+ /*End of tile row*/
+ ps_proc->i4_ctb_tile_x = 0;
+ ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
+
+ ps_proc->i4_ctb_tile_y++;
+ ps_proc->i4_ctb_y++;
+ if(ps_proc->i4_ctb_tile_y == ps_tile->u2_ht)
+ {
+ /* Reached End of Tile */
+ ps_proc->i4_ctb_tile_y = 0;
+ ps_proc->i4_ctb_tile_x = 0;
+ ps_proc->ps_tile++;
+ /* If this was not the last tile in the picture, move to the start of the next tile */
+ if(!((ps_tile->u2_ht + ps_tile->u1_pos_y == ps_sps->i2_pic_ht_in_ctb) && (ps_tile->u2_wd + ps_tile->u1_pos_x == ps_sps->i2_pic_wd_in_ctb)))
+ {
+ ps_tile = ps_proc->ps_tile;
+ ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
+ ps_proc->i4_ctb_y = ps_tile->u1_pos_y;
+
+ }
+ }
+ }
+ }
+ }
+
+ ps_proc->i4_ctb_cnt -= num_ctb;
+ }
+ return ret;
+}
+
+void ihevcd_init_proc_ctxt(process_ctxt_t *ps_proc, WORD32 tu_coeff_data_ofst)
+{
+ codec_t *ps_codec;
+ slice_header_t *ps_slice_hdr;
+ pps_t *ps_pps;
+ sps_t *ps_sps;
+ tile_t *ps_tile, *ps_tile_prev;
+ WORD32 tile_idx;
+ WORD32 ctb_size;
+ WORD32 num_minpu_in_ctb;
+ WORD32 num_ctb_in_row;
+ WORD32 ctb_addr;
+ WORD32 i4_wd_in_ctb;
+ WORD32 tile_start_ctb_idx;
+ WORD32 slice_start_ctb_idx;
+ WORD32 check_tile_wd;
+ WORD32 continuous_tiles = 0; /* Set when tiles within a slice are horizontally continuous */
+
+ ps_codec = ps_proc->ps_codec;
+
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx) & (MAX_SLICE_HDR_CNT - 1));
+#else
+ ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx) & (MAX_SLICE_HDR_CNT - 1));
+#endif
+ ps_proc->ps_slice_hdr = ps_slice_hdr;
+ ps_proc->ps_pps = ps_codec->ps_pps_base + ps_slice_hdr->i1_pps_id;
+ ps_pps = ps_proc->ps_pps;
+ ps_proc->ps_sps = ps_codec->ps_sps_base + ps_pps->i1_sps_id;
+ ps_sps = ps_proc->ps_sps;
+ ps_proc->i4_init_done = 1;
+ ctb_size = 1 << ps_sps->i1_log2_ctb_size;
+ num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
+ num_ctb_in_row = ps_sps->i2_pic_wd_in_ctb;
+
+ ps_proc->s_sao_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
+
+ ihevcd_get_tile_pos(ps_pps, ps_sps, ps_proc->i4_ctb_x, ps_proc->i4_ctb_y,
+ &ps_proc->i4_ctb_tile_x, &ps_proc->i4_ctb_tile_y,
+ &tile_idx);
+
+ ps_proc->ps_tile = ps_pps->ps_tile + tile_idx;
+ ps_proc->i4_cur_tile_idx = tile_idx;
+ ps_tile = ps_proc->ps_tile;
+
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ if(tile_idx)
+ ps_tile_prev = ps_tile - 1;
+ else
+ ps_tile_prev = ps_tile;
+
+ slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
+ tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
+
+ /* Check if
+ * 1. The last tile that ends at the frame boundary and the 1st tile in the next row belong to the same slice
+ * 1.1. If they do, check whether the slice containing these tiles spans the frame row.
+ * 2. Vertical tiles are present within a slice */
+ if(((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x) && (ps_slice_hdr->i2_ctb_y != ps_tile->u1_pos_y)))
+ {
+ continuous_tiles = 1;
+ }
+ else
+ {
+ check_tile_wd = ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd;
+ if(!(((check_tile_wd >= ps_sps->i2_pic_wd_in_ctb) && (check_tile_wd % ps_sps->i2_pic_wd_in_ctb == ps_tile->u1_pos_x))
+ || ((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x))))
+ {
+ continuous_tiles = 1;
+ }
+ }
+
+ {
+ WORD32 i2_independent_ctb_x = ps_slice_hdr->i2_independent_ctb_x;
+ WORD32 i2_independent_ctb_y = ps_slice_hdr->i2_independent_ctb_y;
+
+ /* Handles cases where
+ * 1. Slices begin at the start of each tile
+ * 2. Tiles lie in the same slice row, i.e. starting tile_x > slice_x, but tile_y == slice_y
+ * */
+ if(ps_proc->i4_ctb_x >= i2_independent_ctb_x)
+ {
+ ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - i2_independent_ctb_x;
+ }
+ else
+ {
+ /* Indicates a multiple-tiles-in-a-slice case where
+ * the new tile belongs to an older slice that started in a previous row, not the present row,
+ * & (tile_y > slice_y and tile_x < slice_x)
+ */
+ if((slice_start_ctb_idx < tile_start_ctb_idx) && (continuous_tiles))
+ {
+ i4_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
+ }
+ /* Indicates many-tiles-in-one-slice case, for slices that end without spanning the frame width*/
+ else
+ {
+ i4_wd_in_ctb = ps_tile->u2_wd;
+ }
+
+ if(continuous_tiles)
+ {
+ ps_proc->i4_ctb_slice_x = i4_wd_in_ctb
+ - (i2_independent_ctb_x - ps_proc->i4_ctb_x);
+ }
+ else
+ {
+ ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - ps_tile->u1_pos_x;
+ }
+ }
+ /* Initialize ctb slice y to zero; at the start of a slice row, initialize it
+ to the difference between ctb_y and the slice's start ctb y */
+
+ ps_proc->i4_ctb_slice_y = ps_proc->i4_ctb_y - i2_independent_ctb_y;
+
+ /*If beginning of tile, check if slice counters are set correctly*/
+ if((0 == ps_proc->i4_ctb_tile_x) && (0 == ps_proc->i4_ctb_tile_y))
+ {
+ if(ps_slice_hdr->i1_dependent_slice_flag)
+ {
+ ps_proc->i4_ctb_slice_x = 0;
+ ps_proc->i4_ctb_slice_y = 0;
+ }
+ /*For slices that span across multiple tiles*/
+ else if(slice_start_ctb_idx < tile_start_ctb_idx)
+ {
+ ps_proc->i4_ctb_slice_y = ps_tile->u1_pos_y - i2_independent_ctb_y;
+ /* Two cases:
+ * 1 - Slice spans the frame width but does not start from the 1st column
+ * 2 - Slice spans multiple tiles anywhere in a frame
+ */
+ /* TODO: In a multiple-slice clip, an independent slice spanning more than 2 tiles in a row is not supported */
+ if(continuous_tiles) /* Case 2: implemented for slices that span no more than 2 tiles */
+ {
+ if(i2_independent_ctb_y <= ps_tile->u1_pos_y)
+ {
+ //Check if ctb x is before or after
+ if(i2_independent_ctb_x > ps_tile->u1_pos_x)
+ {
+ ps_proc->i4_ctb_slice_y -= 1;
+ }
+ }
+ }
+ }
+ }
+ /* Slice starts from a column that is not the tile's first column but lies within the tile */
+ if(((i2_independent_ctb_x - ps_tile->u1_pos_x) != 0) && ((ps_proc->i4_ctb_slice_y != 0))
+ && ((i2_independent_ctb_x >= ps_tile->u1_pos_x) && (i2_independent_ctb_x < ps_tile->u1_pos_x + ps_tile->u2_wd)))
+ {
+ ps_proc->i4_ctb_slice_y -= 1;
+ }
+ }
+ }
+ else
+ {
+ WORD32 i2_independent_ctb_x = ps_slice_hdr->i2_independent_ctb_x;
+ WORD32 i2_independent_ctb_y = ps_slice_hdr->i2_independent_ctb_y;
+
+
+ {
+ ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - i2_independent_ctb_x;
+ ps_proc->i4_ctb_slice_y = ps_proc->i4_ctb_y - i2_independent_ctb_y;
+ if(ps_proc->i4_ctb_slice_x < 0)
+ {
+ ps_proc->i4_ctb_slice_x += ps_sps->i2_pic_wd_in_ctb;
+ ps_proc->i4_ctb_slice_y -= 1;
+ }
+
+ }
+ }
+
+ /* Compute TU offset for the current CTB set */
+ {
+
+ WORD32 ctb_luma_min_tu_cnt;
+ WORD32 ctb_addr;
+
+ ctb_addr = ps_proc->i4_ctb_y * num_ctb_in_row + ps_proc->i4_ctb_x;
+
+ ctb_luma_min_tu_cnt = (1 << ps_sps->i1_log2_ctb_size) / MIN_TU_SIZE;
+ ctb_luma_min_tu_cnt *= ctb_luma_min_tu_cnt;
+
+ ps_proc->pu1_tu_map = ps_proc->pu1_pic_tu_map
+ + ctb_luma_min_tu_cnt * ctb_addr;
+ if(1 == ps_codec->i4_num_cores)
+ {
+ ps_proc->ps_tu = ps_proc->ps_pic_tu + ps_proc->pu4_pic_tu_idx[ctb_addr % RESET_TU_BUF_NCTB];
+ }
+ else
+ {
+ ps_proc->ps_tu = ps_proc->ps_pic_tu + ps_proc->pu4_pic_tu_idx[ctb_addr];
+ }
+ ps_proc->pv_tu_coeff_data = (UWORD8 *)ps_proc->pv_pic_tu_coeff_data
+ + tu_coeff_data_ofst;
+
+ }
+
+ /* Compute PU related elements for the current CTB set */
+ {
+ WORD32 pu_idx;
+ ctb_addr = ps_proc->i4_ctb_y * num_ctb_in_row + ps_proc->i4_ctb_x;
+ pu_idx = ps_proc->pu4_pic_pu_idx[ctb_addr];
+ ps_proc->pu1_pu_map = ps_proc->pu1_pic_pu_map
+ + ctb_addr * num_minpu_in_ctb;
+ ps_proc->ps_pu = ps_proc->ps_pic_pu + pu_idx;
+ }
+
+ /* Number of ctbs processed in one loop of process function */
+ {
+ ps_proc->i4_nctb = MIN(ps_codec->u4_nctb, ps_tile->u2_wd);
+ }
+
+}
+void ihevcd_process_thread(process_ctxt_t *ps_proc)
+{
+#ifdef GPU_BUILD
+ codec_t *ps_codec = ps_proc->ps_codec;
+#endif
+ {
+ ithread_set_affinity(ps_proc->i4_id + 1);
+ }
+ while(1)
+ {
+ IHEVCD_ERROR_T ret;
+ proc_job_t s_job;
+
+ ret = ihevcd_jobq_dequeue((jobq_t *)ps_proc->pv_proc_jobq, &s_job,
+ sizeof(proc_job_t), 1);
+ if((IHEVCD_ERROR_T)IHEVCD_SUCCESS != ret)
+ break;
+
+ ps_proc->i4_ctb_cnt = s_job.i2_ctb_cnt;
+ ps_proc->i4_ctb_x = s_job.i2_ctb_x;
+ ps_proc->i4_ctb_y = s_job.i2_ctb_y;
+ ps_proc->i4_cur_slice_idx = s_job.i2_slice_idx;
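+ /* Each job carries a CTB range (start position and count) and the
+ * slice it belongs to; a job either processes CTBs or runs format
+ * conversion on a band of output rows. */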
+
+
+
+ if(CMD_PROCESS == s_job.i4_cmd)
+ {
+ ihevcd_init_proc_ctxt(ps_proc, s_job.i4_tu_coeff_data_ofst);
+#ifdef GPU_BUILD
+ if(1) //g_enable_gpu == 1)
+ {
+
+ if(s_job.i2_wait)
+ {
+ ihevcd_gpu_mc_wait(ps_proc, s_job.i2_granularity_idx);
+ //printf("After MC wait\n");
+ //stop_time = itGetUs();
+ //printf("CL Wait time time = %lld us\n", (stop_time - start_time));
+ }
+
+ }
+#endif
+ ihevcd_process(ps_proc);
+ }
+ else if(CMD_FMTCONV == s_job.i4_cmd)
+ {
+ sps_t *ps_sps;
+ codec_t *ps_codec;
+ ivd_out_bufdesc_t *ps_out_buffer;
+ WORD32 num_rows;
+
+ if(0 == ps_proc->i4_init_done)
+ {
+ ihevcd_init_proc_ctxt(ps_proc, 0);
+ }
+ ps_sps = ps_proc->ps_sps;
+ ps_codec = ps_proc->ps_codec;
+ ps_out_buffer = ps_proc->ps_out_buffer;
+ num_rows = 1 << ps_sps->i1_log2_ctb_size;
+
+ num_rows = MIN(num_rows, (ps_codec->i4_disp_ht - (s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size)));
+
+ if(num_rows < 0)
+ num_rows = 0;
+
+ ihevcd_fmt_conv(ps_proc->ps_codec, ps_proc, ps_out_buffer->pu1_bufs[0], ps_out_buffer->pu1_bufs[1], ps_out_buffer->pu1_bufs[2],
+ s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size, num_rows);
+ }
+ }
+ return;
+}
+
diff --git a/decoder/ihevcd_process_slice.h b/decoder/ihevcd_process_slice.h
new file mode 100644
index 0000000..367a243
--- /dev/null
+++ b/decoder/ihevcd_process_slice.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_process_slice.h
+*
+* @brief
+* Processing of slice level data
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PROCESS_SLICE_H_
+#define _IHEVCD_PROCESS_SLICE_H_
+
+IHEVCD_ERROR_T ihevcd_process(process_ctxt_t *ps_proc);
+void ihevcd_init_proc_ctxt(process_ctxt_t *ps_proc, WORD32 tu_coeff_data_ofst);
+void ihevcd_process_thread(process_ctxt_t *ps_proc);
+
+#endif /* _IHEVCD_PROCESS_SLICE_H_ */
diff --git a/decoder/ihevcd_profile.h b/decoder/ihevcd_profile.h
new file mode 100644
index 0000000..2e95e5c
--- /dev/null
+++ b/decoder/ihevcd_profile.h
@@ -0,0 +1,105 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_profile.h
+*
+* @brief
+* Contains macros for profiling individual modules of the decoder
+*
+* @author
+* Naveen SR
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_PROFILE_H_
+#define _IHEVCD_PROFILE_H_
+
+#include "ihevc_defs.h"
+/* Define return; to disable an individual module */
+#ifdef PROFILE_DIS_SAO_LEAF_LEVEL
+#define PROFILE_DISABLE_SAO_LEAF_LEVEL() return;
+#else
+#define PROFILE_DISABLE_SAO_LEAF_LEVEL() ;
+#endif
+
+#ifdef PROFILE_DIS_SAO
+#define PROFILE_DISABLE_SAO() return;
+#else
+#define PROFILE_DISABLE_SAO() ;
+#endif
+
+#ifdef PROFILE_DIS_DEBLK
+#define PROFILE_DISABLE_DEBLK() return;
+#else
+#define PROFILE_DISABLE_DEBLK() ;
+#endif
+
+#ifdef PROFILE_DIS_IQ_IT_RECON_INTRA_PRED
+#define PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED() return;
+#else
+#define PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED() ;
+#endif
+
+#ifdef PROFILE_DIS_INTER_PRED
+#define PROFILE_DISABLE_INTER_PRED() return;
+#else
+#define PROFILE_DISABLE_INTER_PRED() ;
+#endif
+
+#ifdef PROFILE_DIS_PROCESS_CTB
+#define PROFILE_DISABLE_PROCESS_CTB() return;
+/* When processing is disabled, there is no point in format conversion either */
+#define PROFILE_DISABLE_FMT_CONV() return 0;
+#define PROFILE_DIS_PROCESS_CTB_SET_NOOUTPUT() ps_dec_op->u4_output_present = 0;
+#else
+#define PROFILE_DISABLE_PROCESS_CTB() ;
+#define PROFILE_DIS_PROCESS_CTB_SET_NOOUTPUT() ;
+#define PROFILE_DISABLE_FMT_CONV() ;
+#endif
+
+#ifdef PROFILE_DIS_BOUNDARY_STRENGTH
+#define PROFILE_DISABLE_BOUNDARY_STRENGTH() return;
+#else
+#define PROFILE_DISABLE_BOUNDARY_STRENGTH() ;
+#endif
+
+#ifdef PROFILE_DIS_MV_PREDICTION
+#define PROFILE_DISABLE_MV_PREDICTION() return;
+#else
+#define PROFILE_DISABLE_MV_PREDICTION() ;
+#endif
+
+//#define PROFILE_DISABLE_INTER_PRED_LUMA(clr_indx) {if(clr_indx == 0) continue;}
+//#define PROFILE_DISABLE_INTER_PRED_CHROMA(clr_indx) {if(clr_indx == 1) continue;}
+//#define PROFILE_DISABLE_INTER_PRED_LUMA_AVERAGING(clr_indx) {if(clr_indx == 0) continue;}
+//#define PROFILE_DISABLE_INTER_PRED_CHROMA_AVERAGING(clr_indx) {if(clr_indx == 1) continue;}
+
+#define PROFILE_DISABLE_INTER_PRED_LUMA(clr_indx) ;
+#define PROFILE_DISABLE_INTER_PRED_CHROMA(clr_indx) ;
+#define PROFILE_DISABLE_INTER_PRED_LUMA_AVERAGING(clr_indx) ;
+#define PROFILE_DISABLE_INTER_PRED_CHROMA_AVERAGING(clr_indx) ;
+
+#endif /* _IHEVCD_PROFILE_H_ */
diff --git a/decoder/ihevcd_ref_list.c b/decoder/ihevcd_ref_list.c
new file mode 100644
index 0000000..e04a756
--- /dev/null
+++ b/decoder/ihevcd_ref_list.c
@@ -0,0 +1,558 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_ref_list.c
+*
+* @brief
+* Contains functions definitions for reference list generation
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - ihevcd_mv_mgr_get_poc
+* - ihevcd_ref_list
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_debug.h"
+#include "ihevcd_error.h"
+
+
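+/* Linear search of the MV buffer manager for the MV buffer whose
+ * absolute POC matches abs_poc. */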
+mv_buf_t* ihevcd_mv_mgr_get_poc(buf_mgr_t *ps_mv_buf_mgr, UWORD32 abs_poc)
+{
+ UWORD32 i;
+ mv_buf_t *ps_mv_buf = NULL;
+
+
+
+ for(i = 0; i < ps_mv_buf_mgr->u4_max_buf_cnt; i++)
+ {
+ ps_mv_buf = (mv_buf_t *)ps_mv_buf_mgr->apv_ptr[i];
+ if(ps_mv_buf && (ps_mv_buf->i4_abs_poc == (WORD32)abs_poc))
+ {
+ break;
+ }
+ }
+
+ return ps_mv_buf;
+}
+
+
+WORD32 ihevcd_ref_list(codec_t *ps_codec, pps_t *ps_pps, sps_t *ps_sps, slice_header_t *ps_slice_hdr)
+{
+ WORD32 i;
+ WORD32 st_rps_idx;
+ WORD32 num_neg_pics, num_pos_pics;
+ WORD8 *pi1_used;
+ WORD16 *pi2_delta_poc;
+ UWORD32 u4_max_poc_lsb;
+ pic_buf_t *ps_pic_buf;
+ mv_buf_t *ps_mv_buf;
+ UWORD32 r_idx;
+
+ dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+ buf_mgr_t *ps_mv_buf_mgr = (buf_mgr_t *)ps_codec->pv_mv_buf_mgr;
+
+ WORD32 ai4_poc_st_curr_before[MAX_DPB_SIZE], ai4_poc_st_foll[MAX_DPB_SIZE], ai4_poc_st_curr_after[MAX_DPB_SIZE];
+ WORD32 ai4_poc_lt_curr[MAX_DPB_SIZE], ai4_poc_lt_foll[MAX_DPB_SIZE];
+ UWORD32 u4_num_st_curr_before, u4_num_st_foll, u4_num_st_curr_after, u4_num_lt_curr, u4_num_lt_foll;
+ UWORD32 u4_num_total_curr;
+
+ WORD8 ai1_curr_delta_poc_msb_present_flag[MAX_DPB_SIZE], ai1_foll_delta_poc_msb_present_flag[MAX_DPB_SIZE];
+
+ pic_buf_t *as_ref_pic_lt_curr[MAX_DPB_SIZE];
+ pic_buf_t *as_ref_pic_lt_foll[MAX_DPB_SIZE];
+ pic_buf_t *as_ref_pic_st_curr_after[MAX_DPB_SIZE];
+ pic_buf_t *as_ref_pic_st_curr_before[MAX_DPB_SIZE];
+ pic_buf_t *as_ref_pic_st_foll[MAX_DPB_SIZE];
+
+ pic_buf_t *as_ref_pic_list_temp0[MAX_DPB_SIZE], *as_ref_pic_list_temp1[MAX_DPB_SIZE];
+
+ UWORD32 u4_num_rps_curr_temp_list0, u4_num_rps_curr_temp_list1;
+
+ WORD32 i4_pic_order_cnt_val;
+ WORD32 i4_poc_lt;
+ UNUSED(as_ref_pic_lt_foll);
+ UNUSED(as_ref_pic_st_foll);
+ UNUSED(ps_pps);
+
+ RETURN_IF_NAL_INFO;
+
+ u4_max_poc_lsb = (1 << ps_sps->i1_log2_max_pic_order_cnt_lsb);
+
+ i4_pic_order_cnt_val = ps_slice_hdr->i4_abs_pic_order_cnt;
+
+ if(1 == ps_slice_hdr->i1_short_term_ref_pic_set_sps_flag)
+ {
+ st_rps_idx = ps_slice_hdr->i1_short_term_ref_pic_set_idx;
+ num_neg_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_neg_pics;
+ num_pos_pics = ps_sps->as_stref_picset[st_rps_idx].i1_num_pos_pics;
+ pi1_used = ps_sps->as_stref_picset[st_rps_idx].ai1_used;
+ pi2_delta_poc = ps_sps->as_stref_picset[st_rps_idx].ai2_delta_poc;
+ }
+ else
+ {
+ st_rps_idx = ps_sps->i1_num_short_term_ref_pic_sets;
+ num_neg_pics = ps_slice_hdr->s_stref_picset.i1_num_neg_pics;
+ num_pos_pics = ps_slice_hdr->s_stref_picset.i1_num_pos_pics;
+ pi1_used = ps_slice_hdr->s_stref_picset.ai1_used;
+ pi2_delta_poc = ps_slice_hdr->s_stref_picset.ai2_delta_poc;
+ }
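+ /* The short-term RPS either comes from one of the candidate sets
+ * signalled in the SPS (selected by an index in the slice header) or
+ * is coded explicitly in the slice header itself. */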
+
+ u4_num_st_curr_before = 0;
+ u4_num_st_foll = 0;
+ for(i = 0; i < num_neg_pics; i++)
+ {
+ if(pi1_used[i])
+ {
+ ai4_poc_st_curr_before[u4_num_st_curr_before] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+ u4_num_st_curr_before++;
+ }
+ else
+ {
+ ai4_poc_st_foll[u4_num_st_foll] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+ u4_num_st_foll++;
+ }
+ }
+ u4_num_st_curr_after = 0;
+ for(i = num_neg_pics; i < num_neg_pics + num_pos_pics; i++)
+ {
+ if(pi1_used[i])
+ {
+ ai4_poc_st_curr_after[u4_num_st_curr_after] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+ u4_num_st_curr_after++;
+ }
+ else
+ {
+ ai4_poc_st_foll[u4_num_st_foll] = i4_pic_order_cnt_val + pi2_delta_poc[i];
+ u4_num_st_foll++;
+ }
+ }
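+ /* The loops above split the short-term pictures by delta POC sign
+ * and used flag: negative deltas that are used form StCurrBefore,
+ * positive deltas that are used form StCurrAfter, and unused ones
+ * form StFoll. */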
+
+ u4_num_lt_curr = 0;
+ u4_num_lt_foll = 0;
+ for(i = 0; i < ps_slice_hdr->i1_num_long_term_sps + ps_slice_hdr->i1_num_long_term_pics; i++)
+ {
+ i4_poc_lt = ps_slice_hdr->ai4_poc_lsb_lt[i];
+ if(ps_slice_hdr->ai1_delta_poc_msb_present_flag[i])
+ {
+ i4_poc_lt += i4_pic_order_cnt_val - ps_slice_hdr->ai1_delta_poc_msb_cycle_lt[i] * u4_max_poc_lsb - ps_slice_hdr->i4_pic_order_cnt_lsb;
+ }
+
+ if(ps_slice_hdr->ai1_used_by_curr_pic_lt_flag[i])
+ {
+ ai4_poc_lt_curr[u4_num_lt_curr] = i4_poc_lt;
+ ai1_curr_delta_poc_msb_present_flag[u4_num_lt_curr] = ps_slice_hdr->ai1_delta_poc_msb_present_flag[i];
+ u4_num_lt_curr++;
+ }
+ else
+ {
+ ai4_poc_lt_foll[u4_num_lt_foll] = i4_poc_lt;
+ ai1_foll_delta_poc_msb_present_flag[u4_num_lt_foll] = ps_slice_hdr->ai1_delta_poc_msb_present_flag[i];
+ u4_num_lt_foll++;
+ }
+ }
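+ /* Worked example for the long-term POC computed above: with
+ * u4_max_poc_lsb = 256, current POC 260 (so pic_order_cnt_lsb = 4),
+ * ai4_poc_lsb_lt[i] = 250 and delta_poc_msb_cycle_lt = 1:
+ * i4_poc_lt = 250 + 260 - 1 * 256 - 4 = 250. */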
+
+ u4_num_total_curr = u4_num_lt_curr + u4_num_st_curr_after + u4_num_st_curr_before;
+
+ /* Bit stream conformance tests */
+/*
+ for(i = 0; i < u4_num_lt_curr; i++)
+ {
+ int j;
+ if(ai1_curr_delta_poc_msb_present_flag[i])
+ {
+ for(j = 0; j < u4_num_st_curr_before; j++)
+ {
+ ASSERT(ai4_poc_st_curr_before[j] != ai4_poc_lt_curr[i]);
+ }
+ for(j = 0; j < u4_num_st_curr_after; j++)
+ {
+ ASSERT(ai4_poc_st_curr_after[j] != ai4_poc_lt_curr[i]);
+ }
+ for(j = 0; j < u4_num_st_foll; j++)
+ {
+ ASSERT(ai4_poc_st_foll[j] != ai4_poc_lt_curr[i]);
+ }
+ for(j = 0; j < u4_num_lt_curr; j++)
+ {
+ ASSERT((ai4_poc_lt_curr[j] != ai4_poc_lt_curr[i]) || (j == i));
+ }
+ }
+ else
+ {
+ for(j = 0; j < u4_num_st_curr_before; j++)
+ {
+ ASSERT((ai4_poc_st_curr_before[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]);
+ }
+ for(j = 0; j < u4_num_st_curr_after; j++)
+ {
+ ASSERT((ai4_poc_st_curr_after[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]);
+ }
+ for(j = 0; j < u4_num_st_foll; j++)
+ {
+ ASSERT((ai4_poc_st_foll[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]);
+ }
+ for(j = 0; j < u4_num_lt_curr; j++)
+ {
+ ASSERT(((ai4_poc_lt_curr[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_curr[i]) || (j == i));
+ }
+ }
+ }
+
+ for(i = 0; i < u4_num_lt_foll; i++)
+ {
+ int j;
+ if(ai1_foll_delta_poc_msb_present_flag[i])
+ {
+ for(j = 0; j < u4_num_st_curr_before; j++)
+ {
+ ASSERT(ai4_poc_st_curr_before[j] != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_st_curr_after; j++)
+ {
+ ASSERT(ai4_poc_st_curr_after[j] != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_st_foll; j++)
+ {
+ ASSERT(ai4_poc_st_foll[j] != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_lt_curr; j++)
+ {
+ ASSERT(ai4_poc_lt_curr[j] != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_lt_foll; j++)
+ {
+ ASSERT((ai4_poc_lt_foll[j] != ai4_poc_lt_foll[i]) || (j == i));
+ }
+ }
+ else
+ {
+ for(j = 0; j < u4_num_st_curr_before; j++)
+ {
+ ASSERT((ai4_poc_st_curr_before[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_st_curr_after; j++)
+ {
+ ASSERT((ai4_poc_st_curr_after[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_st_foll; j++)
+ {
+ ASSERT((ai4_poc_st_foll[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_lt_curr; j++)
+ {
+ ASSERT((ai4_poc_lt_curr[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]);
+ }
+ for(j = 0; j < u4_num_lt_foll; j++)
+ {
+ ASSERT(((ai4_poc_lt_foll[j] & (u4_max_poc_lsb - 1)) != ai4_poc_lt_foll[i]) || (j == i));
+ }
+ }
+ }
+*/
+
+
+ /* Reference Pic sets creation */
+
+ /* Set all the DPB buffers to UNUSED_FOR_REF */
+ if(0 == ps_codec->i4_pic_present)
+ {
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if(ps_dpb_mgr->as_dpb_info[i].ps_pic_buf)
+ ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref = UNUSED_FOR_REF;
+ }
+ }
+
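+    /* Long-term references signalled without delta_poc_msb_present_flag are matched
+     * on POC LSBs alone; when the MSB cycle is present, the full POC is known and an
+     * exact POC match is required */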
+ for(i = 0; i < (WORD32)u4_num_lt_curr; i++)
+ {
+ if(0 == ai1_curr_delta_poc_msb_present_flag[i])
+ {
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc_lsb(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+ as_ref_pic_lt_curr[i] = ps_pic_buf;
+ }
+ else
+ {
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+ as_ref_pic_lt_curr[i] = ps_pic_buf;
+ }
+ }
+
+ for(i = 0; i < (WORD32)u4_num_lt_foll; i++)
+ {
+ if(0 == ai1_foll_delta_poc_msb_present_flag[i])
+ {
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc_lsb(ps_dpb_mgr, ai4_poc_lt_foll[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+ as_ref_pic_lt_foll[i] = ps_pic_buf;
+ }
+ else
+ {
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_lt_foll[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = LONG_TERM_REF;
+
+ as_ref_pic_lt_foll[i] = ps_pic_buf;
+ }
+ }
+
+
+ for(i = 0; i < (WORD32)u4_num_st_curr_before; i++)
+ {
+
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_st_curr_before[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+ as_ref_pic_st_curr_before[i] = ps_pic_buf;
+ }
+
+ for(i = 0; i < (WORD32)u4_num_st_curr_after; i++)
+ {
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_st_curr_after[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+ as_ref_pic_st_curr_after[i] = ps_pic_buf;
+ }
+
+ for(i = 0; i < (WORD32)u4_num_st_foll; i++)
+ {
+ ps_pic_buf = ihevc_dpb_mgr_get_ref_by_poc(ps_dpb_mgr, ai4_poc_st_foll[i]);
+ if(NULL != ps_pic_buf)
+ ps_pic_buf->u1_used_as_ref = SHORT_TERM_REF;
+
+ as_ref_pic_st_foll[i] = ps_pic_buf;
+ }
+
+ //TODO: Bit stream conformance tests to be included
+
+    u4_num_rps_curr_temp_list0 = ((WORD32)u4_num_total_curr > ps_slice_hdr->i1_num_ref_idx_l0_active) ?
+                    (WORD32)u4_num_total_curr : ps_slice_hdr->i1_num_ref_idx_l0_active;
+
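+    /* Build the initial list0 (RefPicListTemp0 in the HEVC spec): st_curr_before
+     * entries first, then st_curr_after, then lt_curr, cycling until
+     * u4_num_rps_curr_temp_list0 entries are filled */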
+ r_idx = 0;
+ if((PSLICE == ps_slice_hdr->i1_slice_type) ||
+ (BSLICE == ps_slice_hdr->i1_slice_type))
+ {
+ while(r_idx < u4_num_rps_curr_temp_list0)
+ {
+ for(i = 0; (i < (WORD32)u4_num_st_curr_before) && (r_idx < u4_num_rps_curr_temp_list0); r_idx++, i++)
+ {
+ if(NULL == as_ref_pic_st_curr_before[i])
+ {
+ as_ref_pic_st_curr_before[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_before[i]);
+ }
+ as_ref_pic_list_temp0[r_idx] = as_ref_pic_st_curr_before[i];
+ }
+
+ for(i = 0; (i < (WORD32)u4_num_st_curr_after) && (r_idx < u4_num_rps_curr_temp_list0); r_idx++, i++)
+ {
+ if(NULL == as_ref_pic_st_curr_after[i])
+ {
+ as_ref_pic_st_curr_after[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_after[i]);
+ }
+ as_ref_pic_list_temp0[r_idx] = as_ref_pic_st_curr_after[i];
+ }
+
+ for(i = 0; (i < (WORD32)u4_num_lt_curr) && (r_idx < u4_num_rps_curr_temp_list0); r_idx++, i++)
+ {
+ if(NULL == as_ref_pic_lt_curr[i])
+ {
+ as_ref_pic_lt_curr[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+ }
+ as_ref_pic_list_temp0[r_idx] = as_ref_pic_lt_curr[i];
+ }
+ }
+
+ for(r_idx = 0; (WORD32)r_idx < ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx++)
+ {
+ pic_buf_t *ps_pic_buf;
+            ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf =
+                            ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l0 ?
+                                            (void *)as_ref_pic_list_temp0[ps_slice_hdr->s_rplm.i1_list_entry_l0[r_idx]] :
+                                            (void *)as_ref_pic_list_temp0[r_idx];
+ ps_pic_buf = (pic_buf_t *)ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf;
+
+ if(ps_pic_buf == NULL)
+ return IHEVCD_REF_PIC_NOT_FOUND;
+
+ ps_mv_buf = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf->i4_abs_poc);
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = ps_mv_buf;
+ }
+
+
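+        /* For list1 the initial order is reversed: st_curr_after entries first,
+         * then st_curr_before, then lt_curr */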
+ if(ps_slice_hdr->i1_slice_type == BSLICE)
+ {
+            u4_num_rps_curr_temp_list1 = ((WORD32)u4_num_total_curr > ps_slice_hdr->i1_num_ref_idx_l1_active) ?
+                            (WORD32)u4_num_total_curr : ps_slice_hdr->i1_num_ref_idx_l1_active;
+
+ r_idx = 0;
+ while(r_idx < u4_num_rps_curr_temp_list1)
+ {
+ for(i = 0; (i < (WORD32)u4_num_st_curr_after) && (r_idx < u4_num_rps_curr_temp_list1); r_idx++, i++)
+ {
+ if(NULL == as_ref_pic_st_curr_after[i])
+ {
+ as_ref_pic_st_curr_after[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_after[i]);
+ }
+ as_ref_pic_list_temp1[r_idx] = as_ref_pic_st_curr_after[i];
+ }
+
+ for(i = 0; (i < (WORD32)u4_num_st_curr_before) && (r_idx < u4_num_rps_curr_temp_list1); r_idx++, i++)
+ {
+ if(NULL == as_ref_pic_st_curr_before[i])
+ {
+ as_ref_pic_st_curr_before[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_st_curr_before[i]);
+ }
+ as_ref_pic_list_temp1[r_idx] = as_ref_pic_st_curr_before[i];
+ }
+
+ for(i = 0; (i < (WORD32)u4_num_lt_curr) && (r_idx < u4_num_rps_curr_temp_list1); r_idx++, i++)
+ {
+ if(NULL == as_ref_pic_lt_curr[i])
+ {
+ as_ref_pic_lt_curr[i] = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ai4_poc_lt_curr[i]);
+ }
+ as_ref_pic_list_temp1[r_idx] = as_ref_pic_lt_curr[i];
+ }
+ }
+
+ for(r_idx = 0; (WORD32)r_idx < ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx++)
+ {
+ pic_buf_t *ps_pic_buf;
+                ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf =
+                                ps_slice_hdr->s_rplm.i1_ref_pic_list_modification_flag_l1 ?
+                                                (void *)as_ref_pic_list_temp1[ps_slice_hdr->s_rplm.i1_list_entry_l1[r_idx]] :
+                                                (void *)as_ref_pic_list_temp1[r_idx];
+ ps_pic_buf = (pic_buf_t *)ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf;
+
+ if(ps_pic_buf == NULL)
+ return IHEVCD_REF_PIC_NOT_FOUND;
+
+ ps_mv_buf = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf->i4_abs_poc);
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = ps_mv_buf;
+ }
+ }
+ }
+
+    DEBUG_PRINT_REF_LIST_POCS(i4_pic_order_cnt_val, ps_slice_hdr, ps_dpb_mgr,
+                              u4_num_st_curr_before, u4_num_st_curr_after, u4_num_st_foll,
+                              u4_num_lt_curr, u4_num_lt_foll,
+                              ai4_poc_st_curr_before, ai4_poc_st_curr_after, ai4_poc_st_foll,
+                              ai4_poc_lt_curr, ai4_poc_lt_foll);
+#ifndef GPU_BUILD
+ /* Buffers that are still marked as UNUSED_FOR_REF are released from dpb (internally dpb calls release from pic buf manager)*/
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) && (UNUSED_FOR_REF == ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+ {
+ pic_buf_t *ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+ mv_buf_t *ps_mv_buf;
+
+ /* Long term index is set to MAX_DPB_BUFS to ensure it is not added as LT */
+ ihevc_dpb_mgr_del_ref(ps_dpb_mgr, (buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf->i4_abs_poc);
+
+
+            /* Find the buffer id of the MV bank corresponding to the buffer being
+             * freed (the buffer with the same i4_abs_poc). A separate index is used
+             * so that the outer DPB loop counter 'i' is not clobbered */
+            ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
+            {
+                WORD32 mv_buf_id;
+                for(mv_buf_id = 0; mv_buf_id < BUF_MGR_MAX_CNT; mv_buf_id++)
+                {
+                    if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                    {
+                        ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, mv_buf_id, BUF_MGR_REF);
+                        break;
+                    }
+                    ps_mv_buf++;
+                }
+            }
+ }
+
+ }
+#endif
+
+ return IHEVCD_SUCCESS;
+}
+#ifdef GPU_BUILD
+void ihevcd_free_ref_mv_buffers(codec_t *ps_codec)
+{
+ WORD32 i;
+ dpb_mgr_t *ps_dpb_mgr = ps_codec->pv_dpb_mgr;
+ // TODO
+ /* Buffers that are still marked as UNUSED_FOR_REF are released from dpb (internally dpb calls release from pic buf manager)*/
+ for(i = 0; i < MAX_DPB_BUFS; i++)
+ {
+ if((ps_dpb_mgr->as_dpb_info[i].ps_pic_buf) && (UNUSED_FOR_REF == ps_dpb_mgr->as_dpb_info[i].ps_pic_buf->u1_used_as_ref))
+ {
+ pic_buf_t *ps_pic_buf = ps_dpb_mgr->as_dpb_info[i].ps_pic_buf;
+ mv_buf_t *ps_mv_buf;
+
+ /* Long term index is set to MAX_DPB_BUFS to ensure it is not added as LT */
+ ihevc_dpb_mgr_del_ref(ps_dpb_mgr, (buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf->i4_abs_poc);
+
+
+            /* Find the buffer id of the MV bank corresponding to the buffer being
+             * freed (the buffer with the same i4_abs_poc). A separate index is used
+             * so that the outer DPB loop counter 'i' is not clobbered */
+            ps_mv_buf = (mv_buf_t *)ps_codec->ps_mv_buf;
+            {
+                WORD32 mv_buf_id;
+                for(mv_buf_id = 0; mv_buf_id < BUF_MGR_MAX_CNT; mv_buf_id++)
+                {
+                    if(ps_mv_buf->i4_abs_poc == ps_pic_buf->i4_abs_poc)
+                    {
+                        ihevc_buf_mgr_release((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, mv_buf_id, BUF_MGR_REF);
+                        break;
+                    }
+                    ps_mv_buf++;
+                }
+            }
+ }
+
+ }
+
+}
+#endif
diff --git a/decoder/ihevcd_ref_list.h b/decoder/ihevcd_ref_list.h
new file mode 100644
index 0000000..7bc22f7
--- /dev/null
+++ b/decoder/ihevcd_ref_list.h
@@ -0,0 +1,39 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_ref_list.h
+*
+* @brief
+* Contains function declarations for reference list generation
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_REF_LIST_H_
+#define _IHEVCD_REF_LIST_H_
+
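+/* Creates the reference picture sets and the L0/L1 reference lists for the current
+   slice. Returns IHEVCD_SUCCESS on success, or IHEVCD_REF_PIC_NOT_FOUND if a picture
+   referred to by the reference lists is not present in the DPB */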
+WORD32 ihevcd_ref_list(codec_t *ps_codec, pps_t *ps_pps, sps_t *ps_sps, slice_header_t *ps_slice_hdr);
+
+#endif
diff --git a/decoder/ihevcd_sao.c b/decoder/ihevcd_sao.c
new file mode 100644
index 0000000..d8e8f5c
--- /dev/null
+++ b/decoder/ihevcd_sao.c
@@ -0,0 +1,3348 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_sao.c
+ *
+ * @brief
+ * Contains function definitions for sample adaptive offset process
+ *
+ * @author
+ * Srinivas T
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_sao.h"
+#include "ihevc_mem_fns.h"
+
+#include "ihevc_error.h"
+#include "ihevc_common_tables.h"
+
+#include "ihevcd_trace.h"
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_job_queue.h"
+#include "ihevcd_utils.h"
+
+#include "ihevc_deblk.h"
+#include "ihevc_deblk_tables.h"
+#include "ihevcd_profile.h"
+#include "ihevcd_sao.h"
+#include "ihevcd_debug.h"
+
+#define SAO_SHIFT_CTB 8
+
+/**
+ * SAO at CTB level operates on a shifted CTB (shifted up and to the left by
+ * 8 pixels in the x and y directions), since a CTB's right and bottom borders
+ * are fully deblocked only after the neighboring CTBs have been processed
+ */
+void ihevcd_sao_ctb(sao_ctxt_t *ps_sao_ctxt)
+{
+ codec_t *ps_codec = ps_sao_ctxt->ps_codec;
+ UWORD8 *pu1_src_luma;
+ UWORD8 *pu1_src_chroma;
+ WORD32 src_strd;
+ WORD32 ctb_size;
+ WORD32 log2_ctb_size;
+ sps_t *ps_sps;
+ sao_t *ps_sao;
+ WORD32 row, col;
+ UWORD8 au1_avail_luma[8];
+ UWORD8 au1_avail_chroma[8];
+ WORD32 i;
+ UWORD8 *pu1_src_top_luma;
+ UWORD8 *pu1_src_top_chroma;
+ UWORD8 *pu1_src_left_luma;
+ UWORD8 *pu1_src_left_chroma;
+ UWORD8 au1_src_top_right[2];
+ UWORD8 au1_src_bot_left[2];
+ UWORD8 *pu1_no_loop_filter_flag;
+ WORD32 loop_filter_strd;
+
+ WORD8 ai1_offset_y[5];
+ WORD8 ai1_offset_cb[5];
+ WORD8 ai1_offset_cr[5];
+
+ PROFILE_DISABLE_SAO();
+
+ ai1_offset_y[0] = 0;
+ ai1_offset_cb[0] = 0;
+ ai1_offset_cr[0] = 0;
+
+ ps_sps = ps_sao_ctxt->ps_sps;
+ log2_ctb_size = ps_sps->i1_log2_ctb_size;
+ ctb_size = (1 << log2_ctb_size);
+ src_strd = ps_sao_ctxt->ps_codec->i4_strd;
+ pu1_src_luma = ps_sao_ctxt->pu1_cur_pic_luma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd) << (log2_ctb_size));
+ pu1_src_chroma = ps_sao_ctxt->pu1_cur_pic_chroma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd / 2) << (log2_ctb_size));
+
+ ps_sao = ps_sao_ctxt->ps_pic_sao + ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ loop_filter_strd = (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
+
+ /* Current CTB */
+ {
+ WORD32 sao_wd_luma;
+ WORD32 sao_wd_chroma;
+ WORD32 sao_ht_luma;
+ WORD32 sao_ht_chroma;
+
+ WORD32 remaining_rows;
+ WORD32 remaining_cols;
+
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ sao_wd_luma = MIN(ctb_size, remaining_cols);
+ sao_wd_chroma = MIN(ctb_size, remaining_cols);
+
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+ sao_ht_luma = MIN(ctb_size, remaining_rows);
+ sao_ht_chroma = MIN(ctb_size, remaining_rows) / 2;
+
+ pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+ pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+
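+        /* pu1_pic_no_loop_filter_flag holds one bit per 8x8 block of the picture;
+         * loop_filter_strd is the bitmap stride in bytes, i.e. (pic_width + 63) / 64 */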
+ pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+ ((ps_sao_ctxt->i4_ctb_y * ctb_size) / 8) * loop_filter_strd +
+ ((ps_sao_ctxt->i4_ctb_x * ctb_size) / 64);
+
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
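+        /* Neighbor availability flags consumed by the edge-offset kernels, in the
+         * order used here: 0 - left, 1 - right, 2 - top, 3 - bottom,
+         * 4 - top-left, 5 - top-right, 6 - bottom-left, 7 - bottom-right */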
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_luma[i] = 255;
+ au1_avail_chroma[i] = 255;
+ }
+
+
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_luma[0] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[6] = 0;
+
+ au1_avail_chroma[0] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_wd_in_ctb - 1 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_luma[1] = 0;
+ au1_avail_luma[5] = 0;
+ au1_avail_luma[7] = 0;
+
+ au1_avail_chroma[1] = 0;
+ au1_avail_chroma[5] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_luma[2] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[5] = 0;
+
+ au1_avail_chroma[2] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_ht_in_ctb - 1 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_luma[3] = 0;
+ au1_avail_luma[6] = 0;
+ au1_avail_luma[7] = 0;
+
+ au1_avail_chroma[3] = 0;
+ au1_avail_chroma[6] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+
+ if(0 == ps_sao->b3_y_type_idx)
+ {
+ /* Update left, top and top-left */
+ for(row = 0; row < sao_ht_luma; row++)
+ {
+ pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+ }
+ ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+ }
+ else
+ {
+ UWORD8 au1_src_copy[(MAX_CTB_SIZE + 2) * (MAX_CTB_SIZE + 2)];
+ UWORD8 *pu1_src_copy = au1_src_copy + (MAX_CTB_SIZE + 2) + 1;
+ WORD32 tmp_strd = MAX_CTB_SIZE + 2;
+ WORD32 no_loop_filter_enabled = 0;
+
+ /* Check the loop filter flags and copy the original values for back up */
+ {
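+                /* The bitmap is walked in runs using CTZ (count trailing zeros): runs
+                 * of 0-bits (loop filter allowed) are skipped, while for runs of 1-bits
+                 * (loop filter disabled, e.g. PCM or transquant-bypass blocks) the
+                 * original pixels are saved to au1_src_copy and restored after SAO */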
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 min_cu = 8;
+ UWORD8 *pu1_src_tmp = pu1_src_luma;
+
+ for(i = 0; i < (sao_ht_luma + min_cu - 1) / min_cu; i++)
+ {
+ u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+ ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma) / 8) % 8);
+ u4_no_loop_filter_flag &= (1 << ((sao_wd_luma + (min_cu - 1)) / min_cu)) - 1;
+
+ if(u4_no_loop_filter_flag)
+ {
+ WORD32 tmp_wd = sao_wd_luma;
+ no_loop_filter_enabled = 1;
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+ }
+ else
+ {
+ for(row = 0; row < MIN(min_cu, sao_ht_luma - (i - 1) * min_cu); row++)
+ {
+ for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+ {
+                                    pu1_src_copy[row * tmp_strd + col] = pu1_src_tmp[row * src_strd + col];
+ }
+ }
+
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+ }
+ }
+
+ pu1_src_tmp -= sao_wd_luma;
+ }
+
+ pu1_src_tmp += min_cu * src_strd;
+ pu1_src_copy += min_cu * tmp_strd;
+ }
+ }
+
+ if(1 == ps_sao->b3_y_type_idx)
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb,
+ ps_sao->b5_y_band_pos,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma);
+ }
+ else // if(2 <= ps_sao->b3_y_type_idx)
+ {
+ au1_src_top_right[0] = pu1_src_top_luma[sao_wd_luma];
+ au1_src_bot_left[0] = pu1_src_luma[sao_ht_luma * src_strd - 1];
+ ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb,
+ au1_src_top_right,
+ au1_src_bot_left,
+ au1_avail_luma,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma);
+ }
+
+ /* Check the loop filter flags and copy the original values back if they are set */
+ if(no_loop_filter_enabled)
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 min_cu = 8;
+ UWORD8 *pu1_src_tmp = pu1_src_luma;
+
+ for(i = 0; i < (sao_ht_luma + min_cu - 1) / min_cu; i++)
+ {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                    ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma) / 8) % 8);
+ u4_no_loop_filter_flag &= (1 << ((sao_wd_luma + (min_cu - 1)) / min_cu)) - 1;
+
+ if(u4_no_loop_filter_flag)
+ {
+ WORD32 tmp_wd = sao_wd_luma;
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+ }
+ else
+ {
+ for(row = 0; row < MIN(min_cu, sao_ht_luma - (i - 1) * min_cu); row++)
+ {
+ for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+ {
+ pu1_src_tmp[row * src_strd + col] = pu1_src_copy[row * tmp_strd + col];
+ }
+ }
+
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+ }
+ }
+
+ pu1_src_tmp -= sao_wd_luma;
+ }
+
+ pu1_src_tmp += min_cu * src_strd;
+ pu1_src_copy += min_cu * tmp_strd;
+ }
+ }
+
+ }
+
+ if(0 == ps_sao->b3_cb_type_idx)
+ {
+ for(row = 0; row < sao_ht_chroma; row++)
+ {
+ pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+ pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+ }
+ ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+ ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+ }
+ else
+ {
+ UWORD8 au1_src_copy[(MAX_CTB_SIZE + 4) * (MAX_CTB_SIZE + 2)];
+ UWORD8 *pu1_src_copy = au1_src_copy + (MAX_CTB_SIZE + 4) + 2;
+ WORD32 tmp_strd = MAX_CTB_SIZE + 4;
+ WORD32 no_loop_filter_enabled = 0;
+
+ /* Check the loop filter flags and copy the original values for back up */
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 min_cu = 4;
+ UWORD8 *pu1_src_tmp = pu1_src_chroma;
+
+ for(i = 0; i < (sao_ht_chroma + min_cu - 1) / min_cu; i++)
+ {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                    ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma) / 8) % 8);
+ u4_no_loop_filter_flag &= (1 << ((sao_wd_chroma + (min_cu - 1)) / min_cu)) - 1;
+
+ if(u4_no_loop_filter_flag)
+ {
+ WORD32 tmp_wd = sao_wd_chroma;
+ no_loop_filter_enabled = 1;
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+ }
+ else
+ {
+ for(row = 0; row < MIN(min_cu, sao_ht_chroma - (i - 1) * min_cu); row++)
+ {
+ for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+ {
+                                    pu1_src_copy[row * tmp_strd + col] = pu1_src_tmp[row * src_strd + col];
+ }
+ }
+
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+ }
+ }
+
+ pu1_src_tmp -= sao_wd_chroma;
+ }
+
+ pu1_src_tmp += min_cu * src_strd;
+ pu1_src_copy += min_cu * tmp_strd;
+ }
+ }
+
+ if(1 == ps_sao->b3_cb_type_idx)
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb,
+ ps_sao->b5_cb_band_pos,
+ ps_sao->b5_cr_band_pos,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ else // if(2 <= ps_sao->b3_cb_type_idx)
+ {
+ au1_src_top_right[0] = pu1_src_top_chroma[sao_wd_chroma];
+ au1_src_top_right[1] = pu1_src_top_chroma[sao_wd_chroma + 1];
+ au1_src_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+ au1_src_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb,
+ au1_src_top_right,
+ au1_src_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+
+ /* Check the loop filter flags and copy the original values back if they are set */
+ if(no_loop_filter_enabled)
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 min_cu = 4;
+ UWORD8 *pu1_src_tmp = pu1_src_chroma;
+
+ for(i = 0; i < (sao_ht_chroma + min_cu - 1) / min_cu; i++)
+ {
+                    u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+                                    ((((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma) / 8) % 8);
+ u4_no_loop_filter_flag &= (1 << ((sao_wd_chroma + (min_cu - 1)) / min_cu)) - 1;
+
+ if(u4_no_loop_filter_flag)
+ {
+ WORD32 tmp_wd = sao_wd_chroma;
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) * min_cu;
+ }
+ else
+ {
+ for(row = 0; row < MIN(min_cu, sao_ht_chroma - (i - 1) * min_cu); row++)
+ {
+ for(col = 0; col < MIN(CTZ(~u4_no_loop_filter_flag) * min_cu, tmp_wd); col++)
+ {
+ pu1_src_tmp[row * src_strd + col] = pu1_src_copy[row * tmp_strd + col];
+ }
+ }
+
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ pu1_src_tmp += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ pu1_src_copy += MIN(CTZ(~u4_no_loop_filter_flag), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) * min_cu;
+ }
+ }
+
+ pu1_src_tmp -= sao_wd_chroma;
+ }
+
+ pu1_src_tmp += min_cu * src_strd;
+ pu1_src_copy += min_cu * tmp_strd;
+ }
+ }
+
+ }
+
+ }
+}
+
+void ihevcd_sao_shift_ctb(sao_ctxt_t *ps_sao_ctxt)
+{
+ codec_t *ps_codec = ps_sao_ctxt->ps_codec;
+ UWORD8 *pu1_src_luma;
+ UWORD8 *pu1_src_chroma;
+ WORD32 src_strd;
+ WORD32 ctb_size;
+ WORD32 log2_ctb_size;
+ sps_t *ps_sps;
+ sao_t *ps_sao;
+ pps_t *ps_pps;
+ slice_header_t *ps_slice_hdr, *ps_slice_hdr_base;
+ tile_t *ps_tile;
+ UWORD16 *pu1_slice_idx;
+ UWORD16 *pu1_tile_idx;
+ WORD32 row, col;
+ UWORD8 au1_avail_luma[8];
+ UWORD8 au1_avail_chroma[8];
+ UWORD8 au1_tile_slice_boundary[8];
+ UWORD8 au4_ilf_across_tile_slice_enable[8];
+ WORD32 i;
+ UWORD8 *pu1_src_top_luma;
+ UWORD8 *pu1_src_top_chroma;
+ UWORD8 *pu1_src_left_luma;
+ UWORD8 *pu1_src_left_chroma;
+ UWORD8 au1_src_top_right[2];
+ UWORD8 au1_src_bot_left[2];
+ UWORD8 *pu1_no_loop_filter_flag;
+ UWORD8 *pu1_src_backup_luma;
+ UWORD8 *pu1_src_backup_chroma;
+ WORD32 backup_strd;
+ WORD32 loop_filter_strd;
+
+ WORD32 no_loop_filter_enabled_luma = 0;
+ WORD32 no_loop_filter_enabled_chroma = 0;
+ UWORD8 *pu1_sao_src_top_left_chroma_curr_ctb;
+ UWORD8 *pu1_sao_src_top_left_luma_curr_ctb;
+ UWORD8 *pu1_sao_src_luma_top_left_ctb;
+ UWORD8 *pu1_sao_src_chroma_top_left_ctb;
+ UWORD8 *pu1_sao_src_top_left_luma_top_right;
+ UWORD8 *pu1_sao_src_top_left_chroma_top_right;
+ UWORD8 u1_sao_src_top_left_luma_bot_left;
+ UWORD8 *pu1_sao_src_top_left_luma_bot_left;
+ UWORD8 *au1_sao_src_top_left_chroma_bot_left;
+ UWORD8 *pu1_sao_src_top_left_chroma_bot_left;
+
+ WORD8 ai1_offset_y[5];
+ WORD8 ai1_offset_cb[5];
+ WORD8 ai1_offset_cr[5];
+ WORD32 chroma_yuv420sp_vu = ps_sao_ctxt->is_chroma_yuv420sp_vu;
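+    /* For YUV 420SP VU output (Cr/Cb interleaved), the Cb and Cr band positions and
+     * offset arrays are swapped when calling the chroma SAO kernels below */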
+
+ PROFILE_DISABLE_SAO();
+
+ ai1_offset_y[0] = 0;
+ ai1_offset_cb[0] = 0;
+ ai1_offset_cr[0] = 0;
+
+ ps_sps = ps_sao_ctxt->ps_sps;
+ ps_pps = ps_sao_ctxt->ps_pps;
+ ps_tile = ps_sao_ctxt->ps_tile;
+
+ log2_ctb_size = ps_sps->i1_log2_ctb_size;
+ ctb_size = (1 << log2_ctb_size);
+ src_strd = ps_sao_ctxt->ps_codec->i4_strd;
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_slice_hdr_base = ps_sao_ctxt->ps_slice_hdr_base;
+#else
+ ps_slice_hdr_base = ps_sao_ctxt->ps_codec->ps_slice_hdr_base;
+#endif
+ ps_slice_hdr = ps_slice_hdr_base + (ps_sao_ctxt->i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+
+ pu1_slice_idx = ps_sao_ctxt->pu1_slice_idx;
+ pu1_tile_idx = ps_sao_ctxt->pu1_tile_idx;
+ pu1_src_luma = ps_sao_ctxt->pu1_cur_pic_luma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd) << (log2_ctb_size));
+ pu1_src_chroma = ps_sao_ctxt->pu1_cur_pic_chroma + ((ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sao_ctxt->ps_codec->i4_strd / 2) << (log2_ctb_size));
+
+ /*Stores the left value for each row ctbs- Needed for column tiles*/
+ pu1_sao_src_top_left_luma_curr_ctb = ps_sao_ctxt->pu1_sao_src_top_left_luma_curr_ctb + ((ps_sao_ctxt->i4_ctb_y));
+ pu1_sao_src_top_left_chroma_curr_ctb = ps_sao_ctxt->pu1_sao_src_top_left_chroma_curr_ctb + (2 * (ps_sao_ctxt->i4_ctb_y));
+ pu1_sao_src_luma_top_left_ctb = ps_sao_ctxt->pu1_sao_src_luma_top_left_ctb + ((ps_sao_ctxt->i4_ctb_y));
+ pu1_sao_src_chroma_top_left_ctb = ps_sao_ctxt->pu1_sao_src_chroma_top_left_ctb + (2 * ps_sao_ctxt->i4_ctb_y);
+ u1_sao_src_top_left_luma_bot_left = ps_sao_ctxt->u1_sao_src_top_left_luma_bot_left; // + ((ps_sao_ctxt->i4_ctb_y));
+ pu1_sao_src_top_left_luma_bot_left = ps_sao_ctxt->pu1_sao_src_top_left_luma_bot_left + ((ps_sao_ctxt->i4_ctb_y));
+ au1_sao_src_top_left_chroma_bot_left = ps_sao_ctxt->au1_sao_src_top_left_chroma_bot_left; // + (2 * ps_sao_ctxt->i4_ctb_y);
+ pu1_sao_src_top_left_chroma_bot_left = ps_sao_ctxt->pu1_sao_src_top_left_chroma_bot_left + (2 * ps_sao_ctxt->i4_ctb_y);
+ pu1_sao_src_top_left_luma_top_right = ps_sao_ctxt->pu1_sao_src_top_left_luma_top_right + ((ps_sao_ctxt->i4_ctb_x));
+ pu1_sao_src_top_left_chroma_top_right = ps_sao_ctxt->pu1_sao_src_top_left_chroma_top_right + (2 * ps_sao_ctxt->i4_ctb_x);
+
+ ps_sao = ps_sao_ctxt->ps_pic_sao + ps_sao_ctxt->i4_ctb_x + ps_sao_ctxt->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb;
+ loop_filter_strd = (ps_sps->i2_pic_width_in_luma_samples + 63) >> 6;
+ backup_strd = 2 * MAX_CTB_SIZE;
+
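+    /* The shifted SAO window spans the current CTB and its already-processed
+     * neighbors; it is handled as separate sub-blocks (top-left, top, and so on),
+     * with availability and boundary flags derived per sub-block */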
+ DEBUG_INIT_TMP_BUF(ps_sao_ctxt->pu1_tmp_buf_luma, ps_sao_ctxt->pu1_tmp_buf_chroma);
+
+ {
+ /* Check the loop filter flags and copy the original values for back up */
+ /* Luma */
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 loop_filter_bit_pos;
+ WORD32 log2_min_cu = 3;
+ WORD32 min_cu = (1 << log2_min_cu);
+ UWORD8 *pu1_src_tmp_luma = pu1_src_luma;
+ WORD32 sao_blk_ht = ctb_size - SAO_SHIFT_CTB;
+ WORD32 sao_blk_wd = ctb_size;
+ WORD32 remaining_rows;
+ WORD32 remaining_cols;
+
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+ if(remaining_rows <= SAO_SHIFT_CTB)
+ sao_blk_ht += remaining_rows;
+ if(remaining_cols <= SAO_SHIFT_CTB)
+ sao_blk_wd += remaining_cols;
+
+ pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB : 0;
+ pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+ pu1_src_backup_luma = ps_sao_ctxt->pu1_tmp_buf_luma;
+
+ loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+ (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+ if(ps_sao_ctxt->i4_ctb_x > 0)
+ loop_filter_bit_pos -= 1;
+
+ pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+ (loop_filter_bit_pos >> 3);
+
+ for(i = -(ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB : 0) >> log2_min_cu;
+ i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+ {
+ WORD32 tmp_wd = sao_blk_wd;
+
+ u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+ (loop_filter_bit_pos & 7);
+ u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
+ if(u4_no_loop_filter_flag)
+ {
+ no_loop_filter_enabled_luma = 1;
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ pu1_src_tmp_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ }
+ else
+ {
+ for(row = 0; row < min_cu; row++)
+ {
+ for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+ {
+ pu1_src_backup_luma[row * backup_strd + col] = pu1_src_tmp_luma[row * src_strd + col];
+ }
+ }
+ pu1_src_tmp_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ }
+ }
+
+ pu1_src_tmp_luma -= sao_blk_wd;
+ pu1_src_backup_luma -= sao_blk_wd;
+ }
+
+ pu1_src_tmp_luma += (src_strd << log2_min_cu);
+ pu1_src_backup_luma += (backup_strd << log2_min_cu);
+ }
+ }
+
+ /* Chroma */
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 loop_filter_bit_pos;
+ WORD32 log2_min_cu = 3;
+ WORD32 min_cu = (1 << log2_min_cu);
+ UWORD8 *pu1_src_tmp_chroma = pu1_src_chroma;
+ WORD32 sao_blk_ht = ctb_size - 2 * SAO_SHIFT_CTB;
+ WORD32 sao_blk_wd = ctb_size;
+ WORD32 remaining_rows;
+ WORD32 remaining_cols;
+
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+ if(remaining_rows <= 2 * SAO_SHIFT_CTB)
+ sao_blk_ht += remaining_rows;
+ if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+ sao_blk_wd += remaining_cols;
+
+ pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB * 2 : 0;
+ pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+ pu1_src_backup_chroma = ps_sao_ctxt->pu1_tmp_buf_chroma;
+
+ loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+ (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+ if(ps_sao_ctxt->i4_ctb_x > 0)
+ loop_filter_bit_pos -= 2;
+
+ pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+ (loop_filter_bit_pos >> 3);
+
+ for(i = -(ps_sao_ctxt->i4_ctb_y ? 2 * SAO_SHIFT_CTB : 0) >> log2_min_cu;
+ i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+ {
+ WORD32 tmp_wd = sao_blk_wd;
+
+ u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+ (loop_filter_bit_pos & 7);
+ u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
+ if(u4_no_loop_filter_flag)
+ {
+ no_loop_filter_enabled_chroma = 1;
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ pu1_src_tmp_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ }
+ else
+ {
+ for(row = 0; row < min_cu / 2; row++)
+ {
+ for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+ {
+ pu1_src_backup_chroma[row * backup_strd + col] = pu1_src_tmp_chroma[row * src_strd + col];
+ }
+ }
+
+ pu1_src_tmp_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ }
+ }
+
+ pu1_src_tmp_chroma -= sao_blk_wd;
+ pu1_src_backup_chroma -= sao_blk_wd;
+ }
+
+ pu1_src_tmp_chroma += ((src_strd / 2) << log2_min_cu);
+ pu1_src_backup_chroma += ((backup_strd / 2) << log2_min_cu);
+ }
+ }
+ }
+
+ DEBUG_PROCESS_TMP_BUF(ps_sao_ctxt->pu1_tmp_buf_luma, ps_sao_ctxt->pu1_tmp_buf_chroma);
+
+ /* Top-left CTB */
+ if(ps_sao_ctxt->i4_ctb_x > 0 && ps_sao_ctxt->i4_ctb_y > 0)
+ {
+ WORD32 sao_wd_luma = SAO_SHIFT_CTB;
+ WORD32 sao_wd_chroma = 2 * SAO_SHIFT_CTB;
+ WORD32 sao_ht_luma = SAO_SHIFT_CTB;
+ WORD32 sao_ht_chroma = SAO_SHIFT_CTB;
+
+ WORD32 ctbx_tl_t = 0, ctbx_tl_l = 0, ctbx_tl_r = 0, ctbx_tl_d = 0, ctbx_tl = 0;
+ WORD32 ctby_tl_t = 0, ctby_tl_l = 0, ctby_tl_r = 0, ctby_tl_d = 0, ctby_tl = 0;
+ WORD32 au4_idx_tl[8], idx_tl;
+
+
+ pu1_src_luma -= (sao_wd_luma + sao_ht_luma * src_strd);
+ pu1_src_chroma -= (sao_wd_chroma + sao_ht_chroma * src_strd);
+ ps_sao -= (1 + ps_sps->i2_pic_wd_in_ctb);
+ pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma;
+ pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma;
+ pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma;
+ pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - (2 * sao_ht_chroma);
+
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+ {
+ if(0 == ps_sao->b3_y_type_idx)
+ {
+ /* Update left, top and top-left */
+ for(row = 0; row < sao_ht_luma; row++)
+ {
+ pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+ }
+ pu1_sao_src_luma_top_left_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+
+ }
+
+ else if(1 == ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_luma_top_left_ctb,
+ ps_sao->b5_y_band_pos,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma
+ );
+ }
+
+ else // if(2 <= ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_luma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_tl[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+
+ /******************************************************************
+ * Derive the Top-left CTB's neighbor pixel's slice indices.
+ *
+ * TL_T
+ * 4 _2__5________
+ * 0 | | |
+ * TL_L | TL | 1 TL_R|
+ * |____|_______|____
+ * 6|TL_D|7 | |
+ * | 3 | | |
+ * |____|_______| |
+ * | |
+ * | |
+ * |____________|
+ *
+ *****************************************************************/
+
+                    /* Slice/tile boundary handling: needed only for pictures with multiple slices or with tiles enabled */
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ {
+ /*Assuming that sao shift is uniform along x and y directions*/
+ if((0 == (1 << log2_ctb_size) - sao_wd_luma) && (ps_sao_ctxt->i4_ctb_y > 1) && (ps_sao_ctxt->i4_ctb_x > 1))
+ {
+ ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 2;
+ ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 2;
+ }
+ else if(!(0 == (1 << log2_ctb_size) - sao_wd_luma))
+ {
+ ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 1;
+ ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 1;
+ }
+ ctbx_tl_t = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_tl_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_tl_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_tl_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_tl_d = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_tl_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_tl = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_tl = ps_sao_ctxt->i4_ctb_y - 1;
+ }
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ /*Calculate slice indices for neighbor pixels*/
+ idx_tl = pu1_slice_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[2] = au4_idx_tl[4] = *(pu1_slice_idx + ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb));
+ au4_idx_tl[0] = pu1_slice_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[1] = au4_idx_tl[5] = pu1_slice_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[3] = au4_idx_tl[6] = pu1_slice_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[7] = pu1_slice_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ if((0 == (1 << log2_ctb_size) - sao_wd_luma))
+ {
+ if(ps_sao_ctxt->i4_ctb_x == 1)
+ {
+ au4_idx_tl[6] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[6] = pu1_slice_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ if(ps_sao_ctxt->i4_ctb_y == 1)
+ {
+ au4_idx_tl[5] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[5] = pu1_slice_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[4] = pu1_slice_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ au4_idx_tl[7] = pu1_slice_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+                        /* Verify that the neighbor CTBs don't cross the picture boundary.
+                         * Between each neighbor and the current CTB, the
+                         * i1_slice_loop_filter_across_slices_enabled_flag of the CTB with
+                         * the greater address is checked, and the availability flags are
+                         * set accordingly. Hence, for the top and left neighbors the
+                         * current CTB's flag is checked, while for the right and bottom
+                         * neighbors the respective neighbor's flag is checked
+                         */
+
+ if((0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma))
+ {
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[6] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_tl[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+ if((0 == (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma))
+ {
+ au4_ilf_across_tile_slice_enable[5] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[4] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_tl[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_tl[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_tl[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags.
+ * Hence, for top and left pixels, current ctb flag is checked. For right and down pixels,
+ * the respective pixel's flags are checked
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_tl[i] != idx_tl)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr((UWORD8 *)au4_idx_tl, 0, 8 * sizeof(WORD32));
+ }
+
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+                        /* Calculate availability flags at the tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ /*Set the boundary arrays*/
+ /*Calculate tile indices for neighbor pixels*/
+ idx_tl = pu1_tile_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[2] = au4_idx_tl[4] = *(pu1_tile_idx + ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb));
+ au4_idx_tl[0] = pu1_tile_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[1] = au4_idx_tl[5] = pu1_tile_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[3] = au4_idx_tl[6] = pu1_tile_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[7] = pu1_tile_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ if((0 == (1 << log2_ctb_size) - sao_wd_luma))
+ {
+ if(ps_sao_ctxt->i4_ctb_x == 1)
+ {
+ au4_idx_tl[6] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[6] = pu1_tile_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ if(ps_sao_ctxt->i4_ctb_y == 1)
+ {
+ au4_idx_tl[5] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[5] = pu1_tile_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[4] = pu1_tile_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ au4_idx_tl[7] = pu1_tile_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the tile boundary*/
+ if(au4_idx_tl[i] != idx_tl)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; //=0
+ }
+ }
+ }
+ }
+ }
+
+
+ /*Set availability flags based on tile and slice boundaries*/
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_luma[i] = 0;
+ }
+ }
+ }
+ }
+
+ if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma)
+ {
+ au1_avail_luma[0] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_luma[1] = 0;
+ au1_avail_luma[5] = 0;
+ au1_avail_luma[7] = 0;
+ }
+ //y==1 case
+ if((0 == (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma))
+ {
+ au1_avail_luma[2] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[5] = 0;
+ }
+ if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_luma[3] = 0;
+ au1_avail_luma[6] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_src_top_luma[sao_wd_luma];
+ u1_sao_src_top_left_luma_bot_left = pu1_src_left_luma[sao_ht_luma];
+ ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_luma_top_left_ctb,
+ au1_src_top_right,
+ &u1_sao_src_top_left_luma_bot_left,
+ au1_avail_luma,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma);
+ }
+ }
+
+ }
+
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+ {
+ if(0 == ps_sao->b3_cb_type_idx)
+ {
+ for(row = 0; row < sao_ht_chroma; row++)
+ {
+ pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+ pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+ }
+ pu1_sao_src_chroma_top_left_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+ pu1_sao_src_chroma_top_left_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+ }
+
+ else if(1 == ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ ps_sao->b5_cr_band_pos,
+ ps_sao->b5_cb_band_pos,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ ps_sao->b5_cb_band_pos,
+ ps_sao->b5_cr_band_pos,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ }
+
+ else // if(2 <= ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_chroma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_tl[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+                /* Slice/tile boundary handling */
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ if((0 == (1 << log2_ctb_size) - sao_wd_chroma) && (ps_sao_ctxt->i4_ctb_y > 1) && (ps_sao_ctxt->i4_ctb_x > 1))
+ {
+ ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 2;
+ ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 2;
+ }
+ else if(!(0 == (1 << log2_ctb_size) - sao_wd_chroma))
+ {
+ ctby_tl_t = ps_sao_ctxt->i4_ctb_y - 1;
+ ctbx_tl_l = ps_sao_ctxt->i4_ctb_x - 1;
+ }
+ ctbx_tl_t = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_tl_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_tl_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_tl_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_tl_d = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_tl_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_tl = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_tl = ps_sao_ctxt->i4_ctb_y - 1;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+
+ idx_tl = pu1_slice_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[2] = au4_idx_tl[4] = *(pu1_slice_idx + ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb));
+ au4_idx_tl[0] = pu1_slice_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[1] = au4_idx_tl[5] = pu1_slice_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[3] = au4_idx_tl[6] = pu1_slice_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[7] = pu1_slice_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ if((0 == (1 << log2_ctb_size) - sao_wd_chroma))
+ {
+ if(ps_sao_ctxt->i4_ctb_x == 1)
+ {
+ au4_idx_tl[6] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[6] = pu1_slice_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ if(ps_sao_ctxt->i4_ctb_y == 1)
+ {
+ au4_idx_tl[5] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[5] = pu1_slice_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[4] = pu1_slice_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ au4_idx_tl[7] = pu1_slice_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+                            /* Verify that the neighbor CTBs don't cross the picture boundary.
+                             * The ILF flag belonging to the higher CTB address (between the
+                             * neighbor and the current CTB) is the one that applies */
+ if((0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma))
+ {
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[6] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_tl[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+ if((0 == (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) - sao_ht_chroma))
+ {
+ au4_ilf_across_tile_slice_enable[5] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[4] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_tl)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_tl[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_tl[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_tl[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_tl[i] != idx_tl)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_tl[i] = 0;
+ }
+ }
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+                        /* Calculate availability flags at the tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ /*Set the boundary arrays*/
+ /*Calculate tile indices for neighbor pixels*/
+ idx_tl = pu1_tile_idx[ctbx_tl + (ctby_tl * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[2] = au4_idx_tl[4] = *(pu1_tile_idx + ctbx_tl_t + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb));
+ au4_idx_tl[0] = pu1_tile_idx[ctbx_tl_l + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[1] = au4_idx_tl[5] = pu1_tile_idx[ctbx_tl_r + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[3] = au4_idx_tl[6] = pu1_tile_idx[ctbx_tl_d + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[7] = pu1_tile_idx[ctbx_tl_d + 1 + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+
+                                if((0 == (1 << log2_ctb_size) - sao_wd_chroma))
+ {
+ if(ps_sao_ctxt->i4_ctb_x == 1)
+ {
+ au4_idx_tl[6] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[6] = pu1_tile_idx[(ctbx_tl_d - 1) + (ctby_tl_r * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ if(ps_sao_ctxt->i4_ctb_y == 1)
+ {
+ au4_idx_tl[5] = -1;
+ au4_idx_tl[4] = -1;
+ }
+ else
+ {
+ au4_idx_tl[5] = pu1_tile_idx[(ctbx_tl_l + 1) + (ctby_tl_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_tl[4] = pu1_tile_idx[(ctbx_tl_t - 1) + (ctby_tl_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ au4_idx_tl[7] = pu1_tile_idx[(ctbx_tl_d + 1) + (ctby_tl_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the tile boundary*/
+ if(au4_idx_tl[i] != idx_tl)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; //=0
+ }
+ }
+ }
+ }
+ }
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_chroma[i] = 0;
+ }
+ }
+ }
+ }
+
+ if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma)
+ {
+ au1_avail_chroma[0] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[6] = 0;
+ }
+ if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_chroma[1] = 0;
+ au1_avail_chroma[5] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ if(0 == (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) - sao_ht_chroma)
+ {
+ au1_avail_chroma[2] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[5] = 0;
+ }
+ if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_chroma[3] = 0;
+ au1_avail_chroma[6] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_src_top_chroma[sao_wd_chroma];
+ au1_src_top_right[1] = pu1_src_top_chroma[sao_wd_chroma + 1];
+ au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_left_chroma[2 * sao_ht_chroma];
+ au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_left_chroma[2 * sao_ht_chroma + 1];
+ if((ctb_size == 16) && (ps_sao_ctxt->i4_ctb_y != ps_sps->i2_pic_ht_in_ctb - 1))
+ {
+ au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+ au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+ }
+
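+ /*
+ * Note: the SAO chroma kernels appear to assume UV (Cb first) sample
+ * order, so for VU-ordered 420SP the Cr and Cb offset arrays are
+ * swapped before the same kernel is invoked.
+ */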
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ au1_src_top_right,
+ au1_sao_src_top_left_chroma_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ else
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ au1_src_top_right,
+ au1_sao_src_top_left_chroma_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ }
+ }
+ }
+
+ pu1_src_luma += sao_wd_luma + sao_ht_luma * src_strd;
+ pu1_src_chroma += sao_wd_chroma + sao_ht_chroma * src_strd;
+ ps_sao += (1 + ps_sps->i2_pic_wd_in_ctb);
+ }
+
+
+ /* Top CTB */
+ if((ps_sao_ctxt->i4_ctb_y > 0))
+ {
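+ /*
+ * Note (inferred): SAO runs SAO_SHIFT_CTB pixels behind the CTB being
+ * decoded so that deblocking can complete first. This pass filters the
+ * deferred band above the current CTB row: sao_ht_* is the band height
+ * and sao_wd_* its width (chroma widths count interleaved Cb/Cr
+ * samples, hence the 2 * SAO_SHIFT_CTB term).
+ */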
+ WORD32 sao_wd_luma = ctb_size - SAO_SHIFT_CTB;
+ WORD32 sao_wd_chroma = ctb_size - 2 * SAO_SHIFT_CTB;
+ WORD32 sao_ht_luma = SAO_SHIFT_CTB;
+ WORD32 sao_ht_chroma = SAO_SHIFT_CTB;
+
+ WORD32 ctbx_t_t = 0, ctbx_t_l = 0, ctbx_t_r = 0, ctbx_t_d = 0, ctbx_t = 0;
+ WORD32 ctby_t_t = 0, ctby_t_l = 0, ctby_t_r = 0, ctby_t_d = 0, ctby_t = 0;
+ WORD32 au4_idx_t[8], idx_t;
+
+ WORD32 remaining_cols;
+
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_luma);
+ if(remaining_cols <= SAO_SHIFT_CTB)
+ {
+ sao_wd_luma += remaining_cols;
+ }
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_chroma);
+ if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+ {
+ sao_wd_chroma += remaining_cols;
+ }
+
+ pu1_src_luma -= (sao_ht_luma * src_strd);
+ pu1_src_chroma -= (sao_ht_chroma * src_strd);
+ ps_sao -= (ps_sps->i2_pic_wd_in_ctb);
+ pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma;
+ pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - (2 * sao_ht_chroma);
+
+ if(0 != sao_wd_luma)
+ {
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+ {
+ if(0 == ps_sao->b3_y_type_idx)
+ {
+ /* Update left, top and top-left */
+ for(row = 0; row < sao_ht_luma; row++)
+ {
+ pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+ }
+ pu1_sao_src_luma_top_left_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+ }
+
+ else if(1 == ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_luma_top_left_ctb,
+ ps_sao->b5_y_band_pos,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma
+ );
+ }
+
+ else // if(2 <= ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr(au1_avail_luma, 255, 8);
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr(au1_tile_slice_boundary, 0, 8);
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr((UWORD8 *)au4_idx_t, 0, 8 * sizeof(WORD32));
+
+ for(i = 0; i < 8; i++)
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ /******************************************************************
+ * Derive the Top CTB's neighbor pixels' slice indices.
+ *
+ * T_T
+ * ____________
+ * | | |
+ * | T_L| T |T_R
+ * | | ______|____
+ * | | T_D | |
+ * | | | |
+ * |____|_______| |
+ * | |
+ * | |
+ * |____________|
+ *
+ *****************************************************************/
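+ /*
+ * Note (inferred): T is the deferred band above the current CTB, and
+ * T_T/T_L/T_R/T_D are its top/left/right/bottom neighbours; their slice
+ * indices are gathered into au4_idx_t[] using the 0..7 neighbour
+ * convention noted earlier.
+ */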
+
+ /*In case of slices or tiles*/
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+
+ ctbx_t_t = ps_sao_ctxt->i4_ctb_x;
+ ctby_t_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_t_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_t_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_t_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_t_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_t_d = ps_sao_ctxt->i4_ctb_x;
+ ctby_t_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_t = ps_sao_ctxt->i4_ctb_x;
+ ctby_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ /*Calculate neighbor ctb slice indices*/
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_t[0] = -1;
+ au4_idx_t[6] = -1;
+ au4_idx_t[4] = -1;
+ }
+ else
+ {
+ au4_idx_t[0] = au4_idx_t[4] = pu1_slice_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[6] = pu1_slice_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_t = pu1_slice_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[2] = au4_idx_t[5] = pu1_slice_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[1] = pu1_slice_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[3] = au4_idx_t[7] = pu1_slice_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ /*Verify that the neighbor ctbs don't cross pic boundary.*/
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[6] = 0;
+ au4_ilf_across_tile_slice_enable[0] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_t[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+
+
+
+ au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_t[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_t[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_t[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_t[i] != idx_t)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ /*Check for slice flag at such boundaries*/
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_t[i] = 0;
+ }
+ }
+
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Calculate availability flags at tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ /*Calculate neighbor ctb tile indices*/
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_t[0] = -1;
+ au4_idx_t[6] = -1;
+ au4_idx_t[4] = -1;
+ }
+ else
+ {
+ au4_idx_t[0] = au4_idx_t[4] = pu1_tile_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[6] = pu1_tile_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_t = pu1_tile_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[2] = au4_idx_t[5] = pu1_tile_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[1] = pu1_tile_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[3] = au4_idx_t[7] = pu1_tile_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the tile boundary*/
+ if(au4_idx_t[i] != idx_t)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+ }
+ }
+ }
+ }
+ }
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_luma[i] = 0;
+ }
+ }
+ }
+ }
+
+
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_luma[0] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_luma)
+ {
+ au1_avail_luma[1] = 0;
+ au1_avail_luma[5] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ if(0 == (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) - sao_ht_luma)
+ {
+ au1_avail_luma[2] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_luma[3] = 0;
+ au1_avail_luma[6] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_sao_src_top_left_luma_top_right[0];
+ u1_sao_src_top_left_luma_bot_left = pu1_src_luma[sao_ht_luma * src_strd - 1];
+ ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_luma_top_left_ctb,
+ au1_src_top_right,
+ &u1_sao_src_top_left_luma_bot_left,
+ au1_avail_luma,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma);
+ }
+ }
+ }
+ }
+
+ if(0 != sao_wd_chroma)
+ {
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+ {
+ if(0 == ps_sao->b3_cb_type_idx)
+ {
+
+ for(row = 0; row < sao_ht_chroma; row++)
+ {
+ pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+ pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+ }
+ pu1_sao_src_chroma_top_left_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+ pu1_sao_src_chroma_top_left_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+ }
+
+ else if(1 == ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ ps_sao->b5_cr_band_pos,
+ ps_sao->b5_cb_band_pos,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ ps_sao->b5_cb_band_pos,
+ ps_sao->b5_cr_band_pos,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ }
+ else // if(2 <= ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_chroma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_t[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ ctbx_t_t = ps_sao_ctxt->i4_ctb_x;
+ ctby_t_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_t_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_t_l = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_t_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_t_r = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_t_d = ps_sao_ctxt->i4_ctb_x;
+ ctby_t_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_t = ps_sao_ctxt->i4_ctb_x;
+ ctby_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_t[0] = -1;
+ au4_idx_t[6] = -1;
+ au4_idx_t[4] = -1;
+ }
+ else
+ {
+ au4_idx_t[0] = au4_idx_t[4] = pu1_slice_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[6] = pu1_slice_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_t = pu1_slice_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[2] = au4_idx_t[5] = pu1_slice_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[1] = pu1_slice_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[3] = au4_idx_t[7] = pu1_slice_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ /*Verify that the neighbor ctbs don't cross pic boundary.*/
+
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[6] = 0;
+ au4_ilf_across_tile_slice_enable[0] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_t[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+
+ au4_ilf_across_tile_slice_enable[5] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_t)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_t[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_t[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_t[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_t[i] != idx_t)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ /*Indicates that the neighbour belongs to same/dependent slice*/
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_t[i] = 0;
+ }
+ }
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Calculate availability flags at tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ /*Calculate neighbor ctb tile indices*/
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_t[0] = -1;
+ au4_idx_t[6] = -1;
+ au4_idx_t[4] = -1;
+ }
+ else
+ {
+ au4_idx_t[0] = au4_idx_t[4] = pu1_tile_idx[ctbx_t_l + (ctby_t_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[6] = pu1_tile_idx[ctbx_t_d - 1 + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_t = pu1_tile_idx[ctbx_t + (ctby_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[2] = au4_idx_t[5] = pu1_tile_idx[ctbx_t_t + (ctby_t_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[1] = pu1_tile_idx[ctbx_t_r + (ctby_t_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_t[3] = au4_idx_t[7] = pu1_tile_idx[ctbx_t_d + (ctby_t_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the tile boundary*/
+ if(au4_idx_t[i] != idx_t)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+ }
+ }
+ }
+ }
+ }
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_chroma[i] = 0;
+ }
+ }
+
+ }
+ }
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_chroma[0] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_chroma)
+ {
+ au1_avail_chroma[1] = 0;
+ au1_avail_chroma[5] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ if(0 == (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) - sao_ht_chroma)
+ {
+ au1_avail_chroma[2] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_ht_in_ctb == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_chroma[3] = 0;
+ au1_avail_chroma[6] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_sao_src_top_left_chroma_top_right[0];
+ au1_src_top_right[1] = pu1_sao_src_top_left_chroma_top_right[1];
+ au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+ au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ au1_src_top_right,
+ au1_sao_src_top_left_chroma_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ else
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_chroma_top_left_ctb,
+ au1_src_top_right,
+ au1_sao_src_top_left_chroma_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ }
+
+ }
+ }
+ }
+
+ pu1_src_luma += sao_ht_luma * src_strd;
+ pu1_src_chroma += sao_ht_chroma * src_strd;
+ ps_sao += (ps_sps->i2_pic_wd_in_ctb);
+ }
+
+ /* Left CTB */
+ if(ps_sao_ctxt->i4_ctb_x > 0)
+ {
+ WORD32 sao_wd_luma = SAO_SHIFT_CTB;
+ WORD32 sao_wd_chroma = 2 * SAO_SHIFT_CTB;
+ WORD32 sao_ht_luma = ctb_size - SAO_SHIFT_CTB;
+ WORD32 sao_ht_chroma = ctb_size / 2 - SAO_SHIFT_CTB;
+
+ WORD32 ctbx_l_t = 0, ctbx_l_l = 0, ctbx_l_r = 0, ctbx_l_d = 0, ctbx_l = 0;
+ WORD32 ctby_l_t = 0, ctby_l_l = 0, ctby_l_r = 0, ctby_l_d = 0, ctby_l = 0;
+ WORD32 au4_idx_l[8], idx_l;
+
+ WORD32 remaining_rows;
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + sao_ht_luma);
+ if(remaining_rows <= SAO_SHIFT_CTB)
+ {
+ sao_ht_luma += remaining_rows;
+ }
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples / 2 - ((ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) + sao_ht_chroma);
+ if(remaining_rows <= SAO_SHIFT_CTB)
+ {
+ sao_ht_chroma += remaining_rows;
+ }
+
+ pu1_src_luma -= sao_wd_luma;
+ pu1_src_chroma -= sao_wd_chroma;
+ ps_sao -= 1;
+ pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma;
+ pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma;
+ pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+ pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+
+
+ if(0 != sao_ht_luma)
+ {
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+ {
+ if(0 == ps_sao->b3_y_type_idx)
+ {
+ /* Update left, top and top-left */
+ for(row = 0; row < sao_ht_luma; row++)
+ {
+ pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+ }
+ /*Update in next location*/
+ pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+ }
+
+ else if(1 == ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_top_left_luma_curr_ctb,
+ ps_sao->b5_y_band_pos,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma
+ );
+ }
+
+ else // if(2 <= ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_luma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_l[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ /******************************************************************
+ * Derive the Left CTB's neighbour pixels' slice indices.
+ *
+ *
+ * ____________
+ * | | |
+ * | L_T| |
+ * |____|_______|____
+ * | | | |
+ * L_L | L | L_R | |
+ * |____|_______| |
+ * | |
+ * L_D | |
+ * |____________|
+ *
+ *****************************************************************/
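+ /*
+ * Note (inferred): L is the deferred band to the left of the current
+ * CTB; L_T/L_L/L_R/L_D are its top/left/right/bottom neighbours,
+ * indexed into au4_idx_l[] with the same 0..7 neighbour convention.
+ */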
+
+ /*In case of slices or tiles*/
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ ctbx_l_t = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_l_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l_l = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_l_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_l_r = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_l_d = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l = ps_sao_ctxt->i4_ctb_y;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_l[2] = -1;
+ au4_idx_l[4] = -1;
+ au4_idx_l[5] = -1;
+ }
+ else
+ {
+ au4_idx_l[2] = au4_idx_l[4] = pu1_slice_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[5] = pu1_slice_idx[ctbx_l_t + 1 + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_l = au4_idx_l[6] = pu1_slice_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[0] = pu1_slice_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[1] = au4_idx_l[7] = pu1_slice_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[3] = pu1_slice_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ /*Verify that the neighbor ctbs don't cross pic boundary.*/
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_ilf_across_tile_slice_enable[2] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[5] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_l)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+
+ }
+ //TODO: ILF flag checks for [0] and [6] are missing.
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_l[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_l[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_l[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_l[i] != idx_l)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_l[i] = 0;
+ }
+ }
+
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Calculate availability flags at tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_l[2] = -1;
+ au4_idx_l[4] = -1;
+ au4_idx_l[5] = -1;
+ }
+ else
+ {
+ au4_idx_l[2] = au4_idx_l[4] = pu1_tile_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[5] = pu1_tile_idx[ctbx_l_t + 1 + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+ idx_l = au4_idx_l[6] = pu1_tile_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[0] = pu1_tile_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[1] = au4_idx_l[7] = pu1_tile_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[3] = pu1_tile_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_l[i] != idx_l)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag;
+ }
+ }
+ }
+ }
+ }
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_luma[i] = 0;
+ }
+ }
+ }
+ }
+ if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_luma)
+ {
+ au1_avail_luma[0] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[6] = 0;
+ }
+ if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_luma[1] = 0;
+ au1_avail_luma[5] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_luma[2] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_height_in_luma_samples - (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) <= sao_ht_luma)
+ {
+ au1_avail_luma[3] = 0;
+ au1_avail_luma[6] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_src_top_luma[sao_wd_luma];
+ u1_sao_src_top_left_luma_bot_left = pu1_sao_src_top_left_luma_bot_left[0];
+ ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_top_left_luma_curr_ctb,
+ au1_src_top_right,
+ &u1_sao_src_top_left_luma_bot_left,
+ au1_avail_luma,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma);
+ }
+
+ }
+ }
+ }
+
+ if(0 != sao_ht_chroma)
+ {
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+ {
+ if(0 == ps_sao->b3_cb_type_idx)
+ {
+ for(row = 0; row < sao_ht_chroma; row++)
+ {
+ pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+ pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+ }
+ pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+ pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+ }
+
+ else if(1 == ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ ps_sao->b5_cr_band_pos,
+ ps_sao->b5_cb_band_pos,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ ps_sao->b5_cb_band_pos,
+ ps_sao->b5_cr_band_pos,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ }
+
+ else // if(2 <= ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_chroma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_l[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ /*In case of slices or tiles*/
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ ctbx_l_t = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_l_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l_l = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_l_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_l_r = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_l_d = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_l = ps_sao_ctxt->i4_ctb_y;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_l[2] = -1;
+ au4_idx_l[4] = -1;
+ au4_idx_l[5] = -1;
+ }
+ else
+ {
+ au4_idx_l[2] = au4_idx_l[4] = pu1_slice_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[5] = pu1_slice_idx[ctbx_l_t + 1 + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_l = au4_idx_l[6] = pu1_slice_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[0] = pu1_slice_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[1] = au4_idx_l[7] = pu1_slice_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[3] = pu1_slice_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ /*Verify that the neighbour ctbs don't cross pic boundary.*/
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_ilf_across_tile_slice_enable[2] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[5] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_l)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+ }
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_l[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_l[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_l[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_l[i] != idx_l)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_l[i] = 0;
+ }
+ }
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Calculate availability flags at tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_l[2] = -1;
+ au4_idx_l[4] = -1;
+ au4_idx_l[5] = -1;
+ }
+ else
+ {
+ au4_idx_l[2] = au4_idx_l[4] = pu1_tile_idx[ctbx_l_t + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[5] = pu1_tile_idx[ctbx_l_t + 1 + (ctby_l_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+ idx_l = au4_idx_l[6] = pu1_tile_idx[ctbx_l + (ctby_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[0] = pu1_tile_idx[ctbx_l_l + (ctby_l_l * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[1] = au4_idx_l[7] = pu1_tile_idx[ctbx_l_r + (ctby_l_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_l[3] = pu1_tile_idx[ctbx_l_d + (ctby_l_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_l[i] != idx_l)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; /* 0 in this branch */
+ }
+ }
+ }
+ }
+ }
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_chroma[i] = 0;
+ }
+ }
+ }
+ }
+ if(0 == (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) - sao_wd_chroma)
+ {
+ au1_avail_chroma[0] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_wd_in_ctb == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_chroma[1] = 0;
+ au1_avail_chroma[5] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_chroma[2] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_height_in_luma_samples / 2 - (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) <= sao_ht_chroma)
+ {
+ au1_avail_chroma[3] = 0;
+ au1_avail_chroma[6] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_src_top_chroma[sao_wd_chroma];
+ au1_src_top_right[1] = pu1_src_top_chroma[sao_wd_chroma + 1];
+ au1_src_bot_left[0] = pu1_sao_src_top_left_chroma_bot_left[0];
+ au1_src_bot_left[1] = pu1_sao_src_top_left_chroma_bot_left[1];
+ if((ctb_size == 16) && (ps_sao_ctxt->i4_ctb_x != ps_sps->i2_pic_wd_in_ctb - 1))
+ {
+ au1_src_top_right[0] = pu1_src_chroma[sao_wd_chroma - src_strd];
+ au1_src_top_right[1] = pu1_src_chroma[sao_wd_chroma - src_strd + 1];
+ }
+
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ au1_src_top_right,
+ au1_src_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ else
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ au1_src_top_right,
+ au1_src_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ }
+
+ }
+ }
+
+ }
+ pu1_src_luma += sao_wd_luma;
+ pu1_src_chroma += sao_wd_chroma;
+ ps_sao += 1;
+ }
+
+
+ /* Current CTB */
+ {
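+ /*
+ * Note (inferred): this pass filters the interior of the current CTB,
+ * excluding the right and bottom SAO_SHIFT_CTB bands, which stay
+ * pending until the Top/Left passes of the neighbouring CTBs pick
+ * them up.
+ */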
+ WORD32 sao_wd_luma = ctb_size - SAO_SHIFT_CTB;
+ WORD32 sao_wd_chroma = ctb_size - SAO_SHIFT_CTB * 2;
+ WORD32 sao_ht_luma = ctb_size - SAO_SHIFT_CTB;
+ WORD32 sao_ht_chroma = ctb_size / 2 - SAO_SHIFT_CTB;
+ WORD32 ctbx_c_t = 0, ctbx_c_l = 0, ctbx_c_r = 0, ctbx_c_d = 0, ctbx_c = 0;
+ WORD32 ctby_c_t = 0, ctby_c_l = 0, ctby_c_r = 0, ctby_c_d = 0, ctby_c = 0;
+ WORD32 au4_idx_c[8], idx_c;
+
+ WORD32 remaining_rows;
+ WORD32 remaining_cols;
+
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_luma);
+ if(remaining_cols <= SAO_SHIFT_CTB)
+ {
+ sao_wd_luma += remaining_cols;
+ }
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + sao_wd_chroma);
+ if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+ {
+ sao_wd_chroma += remaining_cols;
+ }
+
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + sao_ht_luma);
+ if(remaining_rows <= SAO_SHIFT_CTB)
+ {
+ sao_ht_luma += remaining_rows;
+ }
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples / 2 - ((ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) + sao_ht_chroma);
+ if(remaining_rows <= SAO_SHIFT_CTB)
+ {
+ sao_ht_chroma += remaining_rows;
+ }
+
+ pu1_src_top_luma = ps_sao_ctxt->pu1_sao_src_top_luma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ pu1_src_top_chroma = ps_sao_ctxt->pu1_sao_src_top_chroma + (ps_sao_ctxt->i4_ctb_x << log2_ctb_size);
+ pu1_src_left_luma = ps_sao_ctxt->pu1_sao_src_left_luma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+ pu1_src_left_chroma = ps_sao_ctxt->pu1_sao_src_left_chroma + (ps_sao_ctxt->i4_ctb_y << log2_ctb_size);
+
+ if((0 != sao_wd_luma) && (0 != sao_ht_luma))
+ {
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag)
+ {
+ if(0 == ps_sao->b3_y_type_idx)
+ {
+ /* Update left, top and top-left */
+ for(row = 0; row < sao_ht_luma; row++)
+ {
+ pu1_src_left_luma[row] = pu1_src_luma[row * src_strd + (sao_wd_luma - 1)];
+ }
+ pu1_sao_src_top_left_luma_curr_ctb[0] = pu1_src_top_luma[sao_wd_luma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_luma, &pu1_src_luma[(sao_ht_luma - 1) * src_strd], sao_wd_luma);
+
+ pu1_sao_src_top_left_luma_top_right[0] = pu1_src_luma[(sao_ht_luma - 1) * src_strd + sao_wd_luma];
+
+ }
+
+ else if(1 == ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr(pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_top_left_luma_curr_ctb,
+ ps_sao->b5_y_band_pos,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma
+ );
+ }
+
+ else // if(2 <= ps_sao->b3_y_type_idx)
+ {
+ ai1_offset_y[1] = ps_sao->b4_y_offset_1;
+ ai1_offset_y[2] = ps_sao->b4_y_offset_2;
+ ai1_offset_y[3] = ps_sao->b4_y_offset_3;
+ ai1_offset_y[4] = ps_sao->b4_y_offset_4;
+
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_luma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_c[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ /******************************************************************
+ * Derive the Current CTB's neighbour pixels' slice indices.
+ *
+ *
+ * ____________
+ * | | |
+ * | | C_T |
+ * |____|_______|____
+ * | | | |
+ * | C_L| C | C_R|
+ * |____|_______| |
+ * | C_D |
+ * | |
+ * |____________|
+ *
+ *****************************************************************/
+
+ /*In case of slices or tiles*/
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ ctbx_c_t = ps_sao_ctxt->i4_ctb_x;
+ ctby_c_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_c_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_c_l = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_c_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_c_r = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_c_d = ps_sao_ctxt->i4_ctb_x;
+ ctby_c_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_c = ps_sao_ctxt->i4_ctb_x;
+ ctby_c = ps_sao_ctxt->i4_ctb_y;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_c[6] = -1;
+ au4_idx_c[0] = -1;
+ au4_idx_c[4] = -1;
+ }
+ else
+ {
+ au4_idx_c[0] = au4_idx_c[6] = pu1_slice_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_c[2] = -1;
+ au4_idx_c[5] = -1;
+ au4_idx_c[4] = -1;
+ }
+ else
+ {
+ au4_idx_c[4] = pu1_slice_idx[ctbx_c_t - 1 + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[2] = au4_idx_c[5] = pu1_slice_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_c = pu1_slice_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[1] = au4_idx_c[7] = pu1_slice_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[3] = pu1_slice_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_ilf_across_tile_slice_enable[6] = 0;
+ au4_ilf_across_tile_slice_enable[0] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[6] = (ps_slice_hdr_base + au4_idx_c[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[0] = (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_ilf_across_tile_slice_enable[2] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[5] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[2] = (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+ }
+ au4_ilf_across_tile_slice_enable[1] = (ps_slice_hdr_base + au4_idx_c[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] = (ps_slice_hdr_base + au4_idx_c[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] = (ps_slice_hdr_base + au4_idx_c[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_c[i] != idx_c)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_c[i] = 0;
+ }
+ }
+
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Calculate availability flags at tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_c[6] = -1;
+ au4_idx_c[0] = -1;
+ au4_idx_c[4] = -1;
+ }
+ else
+ {
+ au4_idx_c[0] = au4_idx_c[6] = pu1_tile_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_c[2] = -1;
+ au4_idx_c[5] = -1;
+ au4_idx_c[4] = -1;
+ }
+ else
+ {
+ au4_idx_c[4] = pu1_tile_idx[ctbx_c_t - 1 + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[2] = au4_idx_c[5] = pu1_tile_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_c = pu1_tile_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[1] = au4_idx_c[7] = pu1_tile_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[3] = pu1_tile_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_c[i] != idx_c)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; /* 0 in this branch */
+ }
+ }
+ }
+ }
+ }
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_luma[i] = 0;
+ }
+ }
+
+ }
+ }
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_luma[0] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_luma)
+ {
+ au1_avail_luma[1] = 0;
+ au1_avail_luma[5] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_luma[2] = 0;
+ au1_avail_luma[4] = 0;
+ au1_avail_luma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_height_in_luma_samples - (ps_sao_ctxt->i4_ctb_y << log2_ctb_size) <= sao_ht_luma)
+ {
+ au1_avail_luma[3] = 0;
+ au1_avail_luma[6] = 0;
+ au1_avail_luma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_src_luma[sao_wd_luma - src_strd];
+ u1_sao_src_top_left_luma_bot_left = pu1_src_luma[sao_ht_luma * src_strd - 1];
+
+ ps_codec->apf_sao_luma[ps_sao->b3_y_type_idx - 2](pu1_src_luma,
+ src_strd,
+ pu1_src_left_luma,
+ pu1_src_top_luma,
+ pu1_sao_src_top_left_luma_curr_ctb,
+ au1_src_top_right,
+ &u1_sao_src_top_left_luma_bot_left,
+ au1_avail_luma,
+ ai1_offset_y,
+ sao_wd_luma,
+ sao_ht_luma);
+ }
+ pu1_sao_src_top_left_luma_top_right[0] = pu1_src_luma[(sao_ht_luma - 1) * src_strd + sao_wd_luma];
+ pu1_sao_src_top_left_luma_bot_left[0] = pu1_src_luma[sao_ht_luma * src_strd + sao_wd_luma - 1];
+ }
+ }
+ }
+
+ if((0 != sao_wd_chroma) && (0 != sao_ht_chroma))
+ {
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag)
+ {
+ if(0 == ps_sao->b3_cb_type_idx)
+ {
+ for(row = 0; row < sao_ht_chroma; row++)
+ {
+ pu1_src_left_chroma[2 * row] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 2)];
+ pu1_src_left_chroma[2 * row + 1] = pu1_src_chroma[row * src_strd + (sao_wd_chroma - 1)];
+ }
+ pu1_sao_src_top_left_chroma_curr_ctb[0] = pu1_src_top_chroma[sao_wd_chroma - 2];
+ pu1_sao_src_top_left_chroma_curr_ctb[1] = pu1_src_top_chroma[sao_wd_chroma - 1];
+
+ ps_codec->s_func_selector.ihevc_memcpy_fptr(pu1_src_top_chroma, &pu1_src_chroma[(sao_ht_chroma - 1) * src_strd], sao_wd_chroma);
+
+ pu1_sao_src_top_left_chroma_top_right[0] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma];
+ pu1_sao_src_top_left_chroma_top_right[1] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma + 1];
+ }
+
+ else if(1 == ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ ps_sao->b5_cr_band_pos,
+ ps_sao->b5_cb_band_pos,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ else
+ {
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr(pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ ps_sao->b5_cb_band_pos,
+ ps_sao->b5_cr_band_pos,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma
+ );
+ }
+ }
+
+ else // if(2 <= ps_sao->b3_cb_type_idx)
+ {
+ ai1_offset_cb[1] = ps_sao->b4_cb_offset_1;
+ ai1_offset_cb[2] = ps_sao->b4_cb_offset_2;
+ ai1_offset_cb[3] = ps_sao->b4_cb_offset_3;
+ ai1_offset_cb[4] = ps_sao->b4_cb_offset_4;
+
+ ai1_offset_cr[1] = ps_sao->b4_cr_offset_1;
+ ai1_offset_cr[2] = ps_sao->b4_cr_offset_2;
+ ai1_offset_cr[3] = ps_sao->b4_cr_offset_3;
+ ai1_offset_cr[4] = ps_sao->b4_cr_offset_4;
+
+ for(i = 0; i < 8; i++)
+ {
+ au1_avail_chroma[i] = 255;
+ au1_tile_slice_boundary[i] = 0;
+ au4_idx_c[i] = 0;
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ {
+ if((!ps_slice_hdr->i1_first_slice_in_pic_flag) || (ps_pps->i1_tiles_enabled_flag))
+ {
+ ctbx_c_t = ps_sao_ctxt->i4_ctb_x;
+ ctby_c_t = ps_sao_ctxt->i4_ctb_y - 1;
+
+ ctbx_c_l = ps_sao_ctxt->i4_ctb_x - 1;
+ ctby_c_l = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_c_r = ps_sao_ctxt->i4_ctb_x;
+ ctby_c_r = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_c_d = ps_sao_ctxt->i4_ctb_x;
+ ctby_c_d = ps_sao_ctxt->i4_ctb_y;
+
+ ctbx_c = ps_sao_ctxt->i4_ctb_x;
+ ctby_c = ps_sao_ctxt->i4_ctb_y;
+
+ if(!ps_slice_hdr->i1_first_slice_in_pic_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_c[0] = -1;
+ au4_idx_c[4] = -1;
+ au4_idx_c[6] = -1;
+ }
+ else
+ {
+ au4_idx_c[0] = au4_idx_c[6] = pu1_slice_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_c[2] = -1;
+ au4_idx_c[4] = -1;
+ au4_idx_c[5] = -1;
+ }
+ else
+ {
+ au4_idx_c[2] = au4_idx_c[5] = pu1_slice_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[4] = pu1_slice_idx[ctbx_c_t - 1 + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_c = pu1_slice_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[1] = au4_idx_c[7] = pu1_slice_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[3] = pu1_slice_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_ilf_across_tile_slice_enable[0] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[6] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[6] &= (ps_slice_hdr_base + au4_idx_c[6])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[0] &= (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_ilf_across_tile_slice_enable[2] = 0;
+ au4_ilf_across_tile_slice_enable[4] = 0;
+ au4_ilf_across_tile_slice_enable[5] = 0;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[2] &= (ps_slice_hdr_base + idx_c)->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[5] = au4_ilf_across_tile_slice_enable[4] = au4_ilf_across_tile_slice_enable[2];
+ }
+
+ au4_ilf_across_tile_slice_enable[1] &= (ps_slice_hdr_base + au4_idx_c[1])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[3] &= (ps_slice_hdr_base + au4_idx_c[3])->i1_slice_loop_filter_across_slices_enabled_flag;
+ au4_ilf_across_tile_slice_enable[7] &= (ps_slice_hdr_base + au4_idx_c[7])->i1_slice_loop_filter_across_slices_enabled_flag;
+
+ /*
+ * Between each neighbor and the current CTB, the i1_slice_loop_filter_across_slices_enabled_flag
+ * of the pixel having a greater address is checked. Accordingly, set the availability flags
+ */
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_c[i] != idx_c)
+ {
+ au1_tile_slice_boundary[i] = 1;
+ }
+ else
+ {
+ au4_ilf_across_tile_slice_enable[i] = 1;
+ }
+ }
+ /*Reset indices*/
+ for(i = 0; i < 8; i++)
+ {
+ au4_idx_c[i] = 0;
+ }
+ }
+
+ if(ps_pps->i1_tiles_enabled_flag)
+ {
+ /* Calculate availability flags at tile boundary */
+ if(((ps_tile->u1_pos_x == ps_sao_ctxt->i4_ctb_x) || (ps_tile->u1_pos_y == ps_sao_ctxt->i4_ctb_y)) && (!((0 == ps_tile->u1_pos_x) && (0 == ps_tile->u1_pos_y))))
+ {
+ /*If ilf across tiles is enabled, boundary availability for tiles is not checked. */
+ if(!ps_pps->i1_loop_filter_across_tiles_enabled_flag)
+ {
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au4_idx_c[6] = -1;
+ au4_idx_c[0] = -1;
+ au4_idx_c[4] = -1;
+ }
+ else
+ {
+ au4_idx_c[0] = au4_idx_c[6] = pu1_tile_idx[ctbx_c_l + (ctby_c_l * ps_sps->i2_pic_wd_in_ctb)];
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au4_idx_c[2] = -1;
+ au4_idx_c[5] = -1;
+ au4_idx_c[4] = -1;
+ }
+ else
+ {
+ au4_idx_c[4] = pu1_tile_idx[ctbx_c_t - 1 + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[2] = au4_idx_c[5] = pu1_tile_idx[ctbx_c_t + (ctby_c_t * ps_sps->i2_pic_wd_in_ctb)];
+ }
+ idx_c = pu1_tile_idx[ctbx_c + (ctby_c * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[1] = au4_idx_c[7] = pu1_tile_idx[ctbx_c_r + (ctby_c_r * ps_sps->i2_pic_wd_in_ctb)];
+ au4_idx_c[3] = pu1_tile_idx[ctbx_c_d + (ctby_c_d * ps_sps->i2_pic_wd_in_ctb)];
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if(au4_idx_c[i] != idx_c)
+ {
+ au1_tile_slice_boundary[i] |= 1;
+ au4_ilf_across_tile_slice_enable[i] &= ps_pps->i1_loop_filter_across_tiles_enabled_flag; /* 0 in this branch */
+ }
+ }
+ }
+ }
+ }
+
+ for(i = 0; i < 8; i++)
+ {
+ /*Sets the edges that lie on the slice/tile boundary*/
+ if((au1_tile_slice_boundary[i]) && !(au4_ilf_across_tile_slice_enable[i]))
+ {
+ au1_avail_chroma[i] = 0;
+ }
+ }
+ }
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_x)
+ {
+ au1_avail_chroma[0] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[6] = 0;
+ }
+
+ if(ps_sps->i2_pic_width_in_luma_samples - (ps_sao_ctxt->i4_ctb_x << log2_ctb_size) <= sao_wd_chroma)
+ {
+ au1_avail_chroma[1] = 0;
+ au1_avail_chroma[5] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ if(0 == ps_sao_ctxt->i4_ctb_y)
+ {
+ au1_avail_chroma[2] = 0;
+ au1_avail_chroma[4] = 0;
+ au1_avail_chroma[5] = 0;
+ }
+
+ if(ps_sps->i2_pic_height_in_luma_samples / 2 - (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 1)) <= sao_ht_chroma)
+ {
+ au1_avail_chroma[3] = 0;
+ au1_avail_chroma[6] = 0;
+ au1_avail_chroma[7] = 0;
+ }
+
+ {
+ au1_src_top_right[0] = pu1_src_chroma[sao_wd_chroma - src_strd];
+ au1_src_top_right[1] = pu1_src_chroma[sao_wd_chroma - src_strd + 1];
+
+ au1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd - 2];
+ au1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd - 1];
+
+ if(chroma_yuv420sp_vu)
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ au1_src_top_right,
+ au1_sao_src_top_left_chroma_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cr,
+ ai1_offset_cb,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ else
+ {
+ ps_codec->apf_sao_chroma[ps_sao->b3_cb_type_idx - 2](pu1_src_chroma,
+ src_strd,
+ pu1_src_left_chroma,
+ pu1_src_top_chroma,
+ pu1_sao_src_top_left_chroma_curr_ctb,
+ au1_src_top_right,
+ au1_sao_src_top_left_chroma_bot_left,
+ au1_avail_chroma,
+ ai1_offset_cb,
+ ai1_offset_cr,
+ sao_wd_chroma,
+ sao_ht_chroma);
+ }
+ }
+
+ }
+ pu1_sao_src_top_left_chroma_top_right[0] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma];
+ pu1_sao_src_top_left_chroma_top_right[1] = pu1_src_chroma[(sao_ht_chroma - 1) * src_strd + sao_wd_chroma + 1];
+
+ pu1_sao_src_top_left_chroma_bot_left[0] = pu1_src_chroma[sao_ht_chroma * src_strd + sao_wd_chroma - 2];
+ pu1_sao_src_top_left_chroma_bot_left[1] = pu1_src_chroma[sao_ht_chroma * src_strd + sao_wd_chroma - 1];
+ }
+
+ }
+ }
+
+ /* If no loop filter is enabled, copy the backed-up values */
+ {
+ /* Luma */
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_luma_flag && no_loop_filter_enabled_luma)
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 loop_filter_bit_pos;
+ WORD32 log2_min_cu = 3;
+ WORD32 min_cu = (1 << log2_min_cu);
+ UWORD8 *pu1_src_tmp_luma = pu1_src_luma;
+ WORD32 sao_blk_ht = ctb_size - SAO_SHIFT_CTB;
+ WORD32 sao_blk_wd = ctb_size;
+ WORD32 remaining_rows;
+ WORD32 remaining_cols;
+
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - SAO_SHIFT_CTB);
+ if(remaining_rows <= SAO_SHIFT_CTB)
+ sao_blk_ht += remaining_rows;
+ if(remaining_cols <= SAO_SHIFT_CTB)
+ sao_blk_wd += remaining_cols;
+
+ pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB : 0;
+ pu1_src_tmp_luma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+ pu1_src_backup_luma = ps_sao_ctxt->pu1_tmp_buf_luma;
+
+ loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+ (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+ if(ps_sao_ctxt->i4_ctb_x > 0)
+ loop_filter_bit_pos -= 1;
+
+ pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+ (loop_filter_bit_pos >> 3);
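+
+ /*
+ * Illustrative example (assumed layout): the bitmap holds one bit per
+ * 8x8 block, loop_filter_strd bytes per block row. With
+ * log2_ctb_size = 6 and loop_filter_strd = 16, CTB (2, 1) starts at bit
+ * 2*8 + 1*8 * (16*8) = 1040, i.e. byte 130, bit 0 (minus one 8x8
+ * column when ctb_x > 0, to cover the shifted SAO window).
+ */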
+
+ for(i = -(ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB : 0) >> log2_min_cu;
+ i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+ {
+ WORD32 tmp_wd = sao_blk_wd;
+
+ u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+ (loop_filter_bit_pos & 7);
+ u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
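+ /*
+ * Worked example (illustrative): with min_cu = 8 and
+ * u4_no_loop_filter_flag = 0b0110 over four 8-pixel units, the loop
+ * skips 8 pixels (trailing zero bit: loop filter was applied),
+ * restores the next 16 pixels from the backup (two set bits: no loop
+ * filter), then skips the remainder. CTZ yields the run length of
+ * each phase.
+ */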
+ if(u4_no_loop_filter_flag)
+ {
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ pu1_src_tmp_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_luma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ }
+ else
+ {
+ for(row = 0; row < min_cu; row++)
+ {
+ for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+ {
+ pu1_src_tmp_luma[row * src_strd + col] = pu1_src_backup_luma[row * backup_strd + col];
+ }
+ }
+ pu1_src_tmp_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_luma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ }
+ }
+
+ pu1_src_tmp_luma -= sao_blk_wd;
+ pu1_src_backup_luma -= sao_blk_wd;
+ }
+
+ pu1_src_tmp_luma += (src_strd << log2_min_cu);
+ pu1_src_backup_luma += (backup_strd << log2_min_cu);
+ }
+ }
+
+ /* Chroma */
+ if(ps_sao_ctxt->ps_slice_hdr->i1_slice_sao_chroma_flag && no_loop_filter_enabled_chroma)
+ {
+ UWORD32 u4_no_loop_filter_flag;
+ WORD32 loop_filter_bit_pos;
+ WORD32 log2_min_cu = 3;
+ WORD32 min_cu = (1 << log2_min_cu);
+ UWORD8 *pu1_src_tmp_chroma = pu1_src_chroma;
+ WORD32 sao_blk_ht = ctb_size - 2 * SAO_SHIFT_CTB;
+ WORD32 sao_blk_wd = ctb_size;
+ WORD32 remaining_rows;
+ WORD32 remaining_cols;
+
+ remaining_rows = ps_sps->i2_pic_height_in_luma_samples - ((ps_sao_ctxt->i4_ctb_y << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+ remaining_cols = ps_sps->i2_pic_width_in_luma_samples - ((ps_sao_ctxt->i4_ctb_x << log2_ctb_size) + ctb_size - 2 * SAO_SHIFT_CTB);
+ if(remaining_rows <= 2 * SAO_SHIFT_CTB)
+ sao_blk_ht += remaining_rows;
+ if(remaining_cols <= 2 * SAO_SHIFT_CTB)
+ sao_blk_wd += remaining_cols;
+
+ pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_x ? SAO_SHIFT_CTB * 2 : 0;
+ pu1_src_tmp_chroma -= ps_sao_ctxt->i4_ctb_y ? SAO_SHIFT_CTB * src_strd : 0;
+
+ pu1_src_backup_chroma = ps_sao_ctxt->pu1_tmp_buf_chroma;
+
+ loop_filter_bit_pos = (ps_sao_ctxt->i4_ctb_x << (log2_ctb_size - 3)) +
+ (ps_sao_ctxt->i4_ctb_y << (log2_ctb_size - 3)) * (loop_filter_strd << 3);
+ if(ps_sao_ctxt->i4_ctb_x > 0)
+ loop_filter_bit_pos -= 2;
+
+ pu1_no_loop_filter_flag = ps_sao_ctxt->pu1_pic_no_loop_filter_flag +
+ (loop_filter_bit_pos >> 3);
+
+ for(i = -(ps_sao_ctxt->i4_ctb_y ? 2 * SAO_SHIFT_CTB : 0) >> log2_min_cu;
+ i < (sao_blk_ht + (min_cu - 1)) >> log2_min_cu; i++)
+ {
+ WORD32 tmp_wd = sao_blk_wd;
+
+ u4_no_loop_filter_flag = (*(UWORD32 *)(pu1_no_loop_filter_flag + i * loop_filter_strd)) >>
+ (loop_filter_bit_pos & 7);
+ u4_no_loop_filter_flag &= (1 << ((tmp_wd + (min_cu - 1)) >> log2_min_cu)) - 1;
+
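+            /* Same run-length restore as the luma path above, but on interleaved CbCr
+               samples with half the number of rows (4:2:0) */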
+ if(u4_no_loop_filter_flag)
+ {
+ while(tmp_wd > 0)
+ {
+ if(CTZ(u4_no_loop_filter_flag))
+ {
+ pu1_src_tmp_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_chroma += MIN((CTZ(u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(u4_no_loop_filter_flag));
+ }
+ else
+ {
+ for(row = 0; row < min_cu / 2; row++)
+ {
+ for(col = 0; col < MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd); col++)
+ {
+ pu1_src_tmp_chroma[row * src_strd + col] = pu1_src_backup_chroma[row * backup_strd + col];
+ }
+ }
+
+ pu1_src_tmp_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ pu1_src_backup_chroma += MIN((CTZ(~u4_no_loop_filter_flag) << log2_min_cu), tmp_wd);
+ tmp_wd -= CTZ(~u4_no_loop_filter_flag) << log2_min_cu;
+ u4_no_loop_filter_flag >>= (CTZ(~u4_no_loop_filter_flag));
+ }
+ }
+
+ pu1_src_tmp_chroma -= sao_blk_wd;
+ pu1_src_backup_chroma -= sao_blk_wd;
+ }
+
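+                /* Advance by min_cu / 2 chroma rows: each 8x8 luma unit spans 4 rows of
+                   interleaved chroma in 4:2:0 */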
+ pu1_src_tmp_chroma += ((src_strd / 2) << log2_min_cu);
+ pu1_src_backup_chroma += ((backup_strd / 2) << log2_min_cu);
+ }
+ }
+ }
+
+}
+
diff --git a/decoder/ihevcd_sao.h b/decoder/ihevcd_sao.h
new file mode 100644
index 0000000..e549682
--- /dev/null
+++ b/decoder/ihevcd_sao.h
@@ -0,0 +1,40 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_sao.h
+*
+* @brief
+* Function declarations for SAO (Sample Adaptive Offset) processing
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_SAO_H_
+#define _IHEVCD_SAO_H_
+
+void ihevcd_sao_ctb(sao_ctxt_t *ps_sao_ctxt);
+void ihevcd_sao_shift_ctb(sao_ctxt_t *ps_sao_ctxt);
+
+#endif /*_IHEVCD_SAO_H_*/
diff --git a/decoder/ihevcd_statistics.c b/decoder/ihevcd_statistics.c
new file mode 100644
index 0000000..f4e5242
--- /dev/null
+++ b/decoder/ihevcd_statistics.c
@@ -0,0 +1,688 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_statistics.c
+*
+* @brief
+* Contains functions for generating statistics about the HEVC decoder
+*
+* @author
+* Naveen SR
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_defs.h"
+
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_statistics.h"
+
+#if STATISTICS_ENABLE
+statistics_t gs_ihevcd_stat;
+
+void ihevcd_init_trans_stat(stat_trans_t *ps_stat_trans)
+{
+ ps_stat_trans->num_4x4_dst = 0;
+ ps_stat_trans->num_4x4 = 0;
+ ps_stat_trans->num_8x8 = 0;
+ ps_stat_trans->num_16x16 = 0;
+ ps_stat_trans->num_32x32 = 0;
+ ps_stat_trans->num_64x64 = 0;
+}
+
+void ihevcd_sblk_pos_init()
+{
+ gs_ihevcd_stat.last_sblk_pos_x = 0;
+ gs_ihevcd_stat.last_sblk_pos_y = 0;
+ gs_ihevcd_stat.num_coded_sblk = 0;
+ gs_ihevcd_stat.num_coded_coeffs = 0;
+}
+void ihevcd_init_sblk_histogram(stat_sblk_histogram_t *ps_last_sblk_pos_histogram_t)
+{
+ memset(ps_last_sblk_pos_histogram_t->trans_4x4_dst, 0, 1 * sizeof(UWORD32));
+ memset(ps_last_sblk_pos_histogram_t->trans_4x4, 0, 1 * sizeof(UWORD32));
+ memset(ps_last_sblk_pos_histogram_t->trans_8x8, 0, 4 * sizeof(UWORD32));
+ memset(ps_last_sblk_pos_histogram_t->trans_16x16, 0, 16 * sizeof(UWORD32));
+ memset(ps_last_sblk_pos_histogram_t->trans_32x32, 0, 64 * sizeof(UWORD32));
+}
+void ihevcd_init_coeff_histogram(stat_coeff_histogram_t *ps_coeff_histogram)
+{
+ memset(ps_coeff_histogram->trans_4x4_dst, 0, 16 * sizeof(UWORD32));
+ memset(ps_coeff_histogram->trans_4x4, 0, 16 * sizeof(UWORD32));
+ memset(ps_coeff_histogram->trans_8x8, 0, 64 * sizeof(UWORD32));
+ memset(ps_coeff_histogram->trans_16x16, 0, 256 * sizeof(UWORD32));
+ memset(ps_coeff_histogram->trans_32x32, 0, 1024 * sizeof(UWORD32));
+}
+void ihevcd_init_statistics()
+{
+
+ memset(&gs_ihevcd_stat, 0, sizeof(statistics_t));
+ /* Number of transform block init */
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_all_trans_block[0]);
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_all_trans_block[1]);
+ /* Number of coded transform block init */
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_trans_block[0]);
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_trans_block[1]);
+ /* Number of coded DC transform block init */
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_dc_block[0]);
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_dc_block[1]);
+ /* Number of coded one coeff transform block init */
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_one_coeff_block[0]);
+ ihevcd_init_trans_stat(&gs_ihevcd_stat.stat_num_coded_one_coeff_block[1]);
+ /* Last sblk histogram init */
+ ihevcd_init_sblk_histogram(&gs_ihevcd_stat.stat_last_sblk_pos_histogram);
+ /* Num Coded sblk histogram init */
+ ihevcd_init_sblk_histogram(&gs_ihevcd_stat.stat_num_coded_sblk_histogram);
+ /* Num Coded coeffs histogram init */
+ ihevcd_init_coeff_histogram(&gs_ihevcd_stat.stat_num_coded_coeff_histogram);
+ /* Last sblk position init */
+ ihevcd_sblk_pos_init();
+
+}
+
+void ihevcd_print_stat_trans(stat_trans_t *ps_stat_trans)
+{
+ WORD32 total_pixels_y, total_pixels_uv;
+ double y_ratio, y_ratio_total, uv_ratio, uv_ratio_total;
+ stat_trans_t *ps_stat_trans_all;
+ total_pixels_y = ps_stat_trans[0].num_4x4_dst * 4 * 4 +
+ ps_stat_trans[0].num_4x4 * 4 * 4 +
+ ps_stat_trans[0].num_8x8 * 8 * 8 +
+ ps_stat_trans[0].num_16x16 * 16 * 16 +
+ ps_stat_trans[0].num_32x32 * 32 * 32 +
+ ps_stat_trans[0].num_64x64 * 64 * 64;
+
+ total_pixels_uv = ps_stat_trans[1].num_4x4_dst * 4 * 4 +
+ ps_stat_trans[1].num_4x4 * 4 * 4 +
+ ps_stat_trans[1].num_8x8 * 8 * 8 +
+ ps_stat_trans[1].num_16x16 * 16 * 16 +
+ ps_stat_trans[1].num_32x32 * 32 * 32 +
+ ps_stat_trans[1].num_64x64 * 64 * 64;
+
+ ps_stat_trans_all = &gs_ihevcd_stat.stat_num_all_trans_block[0];
+
+ printf("\n_ Y Y Y U+V U+V U+V");
+ printf("\nTransform_Type Num_Blocks Percentage %%wrt_total Num_Blocks Percentage %%wrt_total ");
+
+ y_ratio = ps_stat_trans[0].num_4x4_dst * 4 * 4 * 100.0 / total_pixels_y;
+ y_ratio_total = ps_stat_trans[0].num_4x4_dst * 100.0 / ps_stat_trans_all[0].num_4x4_dst;
+ uv_ratio = ps_stat_trans[1].num_4x4_dst * 4 * 4 * 100.0 / total_pixels_uv;
+ uv_ratio_total = ps_stat_trans[1].num_4x4_dst * 100.0 / ps_stat_trans_all[1].num_4x4_dst;
+ printf("\nDST_4x4 %6d %6.2f %6.2f %6d %6.2f %6.2f ", ps_stat_trans[0].num_4x4_dst, y_ratio, y_ratio_total, ps_stat_trans[1].num_4x4_dst, uv_ratio, uv_ratio_total);
+
+ y_ratio = ps_stat_trans[0].num_4x4 * 4 * 4 * 100.0 / total_pixels_y;
+ y_ratio_total = ps_stat_trans[0].num_4x4 * 100.0 / ps_stat_trans_all[0].num_4x4;
+ uv_ratio = ps_stat_trans[1].num_4x4 * 4 * 4 * 100.0 / total_pixels_uv;
+ uv_ratio_total = ps_stat_trans[1].num_4x4 * 100.0 / ps_stat_trans_all[1].num_4x4;
+ printf("\nDCT_4x4 %6d %6.2f %6.2f %6d %6.2f %6.2f ", ps_stat_trans[0].num_4x4, y_ratio, y_ratio_total, ps_stat_trans[1].num_4x4, uv_ratio, uv_ratio_total);
+
+
+ y_ratio = ps_stat_trans[0].num_8x8 * 8 * 8 * 100.0 / total_pixels_y;
+ y_ratio_total = ps_stat_trans[0].num_8x8 * 100.0 / ps_stat_trans_all[0].num_8x8;
+ uv_ratio = ps_stat_trans[1].num_8x8 * 8 * 8 * 100.0 / total_pixels_uv;
+ uv_ratio_total = ps_stat_trans[1].num_8x8 * 100.0 / ps_stat_trans_all[1].num_8x8;
+ printf("\nDCT_8x8 %6d %6.2f %6.2f %6d %6.2f %6.2f ", ps_stat_trans[0].num_8x8, y_ratio, y_ratio_total, ps_stat_trans[1].num_8x8, uv_ratio, uv_ratio_total);
+
+ y_ratio = ps_stat_trans[0].num_16x16 * 16 * 16 * 100.0 / total_pixels_y;
+ y_ratio_total = ps_stat_trans[0].num_16x16 * 100.0 / ps_stat_trans_all[0].num_16x16;
+ uv_ratio = ps_stat_trans[1].num_16x16 * 16 * 16 * 100.0 / total_pixels_uv;
+ uv_ratio_total = ps_stat_trans[1].num_16x16 * 100.0 / ps_stat_trans_all[1].num_16x16;
+ printf("\nDCT_16x16 %6d %6.2f %6.2f %6d %6.2f %6.2f ", ps_stat_trans[0].num_16x16, y_ratio, y_ratio_total, ps_stat_trans[1].num_16x16, uv_ratio, uv_ratio_total);
+
+
+ y_ratio = ps_stat_trans[0].num_32x32 * 32 * 32 * 100.0 / total_pixels_y;
+ y_ratio_total = ps_stat_trans[0].num_32x32 * 100.0 / ps_stat_trans_all[0].num_32x32;
+ uv_ratio = ps_stat_trans[1].num_32x32 * 32 * 32 * 100.0 / total_pixels_uv;
+ uv_ratio_total = ps_stat_trans[1].num_32x32 * 100.0 / ps_stat_trans_all[1].num_32x32;
+ printf("\nDCT_32x32 %6d %6.2f %6.2f %6d %6.2f %6.2f ", ps_stat_trans[0].num_32x32, y_ratio, y_ratio_total, ps_stat_trans[1].num_32x32, uv_ratio, uv_ratio_total);
+
+
+ y_ratio = ps_stat_trans[0].num_64x64 * 64 * 64 * 100.0 / total_pixels_y;
+ y_ratio_total = ps_stat_trans[0].num_64x64 * 100.0 / ps_stat_trans_all[0].num_64x64;
+ uv_ratio = ps_stat_trans[1].num_64x64 * 64 * 64 * 100.0 / total_pixels_uv;
+ uv_ratio_total = ps_stat_trans[1].num_64x64 * 100.0 / ps_stat_trans_all[1].num_64x64;
+ printf("\nDCT_64x64 %6d %6.2f %6.2f %6d %6.2f %6.2f ", ps_stat_trans[0].num_64x64, y_ratio, y_ratio_total, ps_stat_trans[1].num_64x64, uv_ratio, uv_ratio_total);
+
+}
+
+void ihevcd_update_stat_num_trans(stat_trans_t *ps_stat_trans, TRANSFORM_TYPE e_trans_type)
+{
+ switch(e_trans_type)
+ {
+ case DST_4x4:
+ ps_stat_trans->num_4x4_dst++;
+ break;
+ case DCT_4x4:
+ ps_stat_trans->num_4x4++;
+ break;
+ case DCT_8x8:
+ ps_stat_trans->num_8x8++;
+ break;
+ case DCT_16x16:
+ ps_stat_trans->num_16x16++;
+ break;
+ case DCT_32x32:
+ ps_stat_trans->num_32x32++;
+ break;
+ case SKIP_64x64:
+ ps_stat_trans->num_64x64++;
+ break;
+ default:
+ break;
+ }
+}
+
+void ihevcd_update_num_all_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx)
+{
+ stat_trans_t *ps_stat_trans;
+
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_all_trans_block[0];
+
+ if(c_idx != 0)
+ {
+ ps_stat_trans++;
+ }
+ ihevcd_update_stat_num_trans(ps_stat_trans, e_trans_type);
+}
+
+void ihevcd_update_num_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx, WORD32 update_type)
+{
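+    /* update_type: 0 - coded block count, 1 - DC-only block count,
+       anything else - single-coefficient block count */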
+ stat_trans_t *ps_stat_trans;
+
+ if(0 == update_type)
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_trans_block[0];
+ else if(1 == update_type)
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_dc_block[0];
+ else
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_one_coeff_block[0];
+
+ if(c_idx != 0)
+ {
+ ps_stat_trans++;
+ }
+ ihevcd_update_stat_num_trans(ps_stat_trans, e_trans_type);
+}
+
+void ihevcd_print_sblk_histogram_per_transform(UWORD32 *pu4_stat, UWORD32 wd, UWORD32 ht, WORD32 is_2d)
+{
+ UWORD32 i, j, total = 0, val;
+
+ for(i = 0; i < ht; i++)
+ {
+ for(j = 0; j < wd; j++)
+ {
+            val = pu4_stat[j + i * wd]; /* row stride is the histogram width */
+ printf("%d\t\t", val);
+ total += val;
+ }
+ if(1 == is_2d)
+ printf("\n");
+ }
+
+ {
+ printf("\n");
+ for(i = 0; i < ht; i++)
+ {
+ for(j = 0; j < wd; j++)
+ {
+                val = pu4_stat[j + i * wd];
+
+ printf("%.2f%%\t\t", val * 100.0 / total);
+ }
+ if(1 == is_2d)
+ printf("\n");
+ }
+ }
+}
+
+void ihevcd_print_sblk_histogram(stat_sblk_histogram_t *ps_stat_sblk_pos_histogram, WORD32 is_2d)
+{
+ printf("\nhistogram_4x4_DST\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_4x4_dst, 1, 1, is_2d);
+ printf("\nhistogram_4x4\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_4x4, 1, 1, is_2d);
+ printf("\nhistogram_8x8\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_8x8, 2, 2, is_2d);
+ printf("\nhistogram_16x16\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_16x16, 4, 4, is_2d);
+ printf("\nhistogram_32x32\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_sblk_pos_histogram->trans_32x32, 8, 8, is_2d);
+}
+
+void ihevcd_print_coeff_histogram(stat_coeff_histogram_t *ps_stat_coeff_histogram, WORD32 is_2d)
+{
+ printf("\nhistogram_4x4_DST\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_4x4_dst, 4, 4, is_2d);
+ printf("\nhistogram_4x4\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_4x4, 4, 4, is_2d);
+ printf("\nhistogram_8x8\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_8x8, 8, 8, is_2d);
+ printf("\nhistogram_16x16\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_16x16, 16, 16, is_2d);
+ printf("\nhistogram_32x32\n");
+ ihevcd_print_sblk_histogram_per_transform(ps_stat_coeff_histogram->trans_32x32, 32, 32, is_2d);
+}
+void ihevcd_print_transform_statistics()
+{
+ stat_trans_t *ps_stat_trans;
+ WORD32 total_blocks;
+
+ /* Num coded_transform blocks */
+ printf("\nNUM_ALL_TRANSFORM_BLOCKS\n");
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_all_trans_block[0];
+ {
+        /* Update the chroma block counts here, since chroma blocks are not counted
+           when the CBFs of Y, U and V are all zero */
+        ps_stat_trans[1].num_4x4 = (ps_stat_trans[0].num_4x4_dst + ps_stat_trans[0].num_4x4) / 4 + ps_stat_trans[0].num_8x8;
+        ps_stat_trans[1].num_8x8 = ps_stat_trans[0].num_16x16;
+        ps_stat_trans[1].num_16x16 = ps_stat_trans[0].num_32x32;
+        ps_stat_trans[1].num_32x32 = ps_stat_trans[0].num_64x64;
+ }
+ ihevcd_print_stat_trans(ps_stat_trans);
+
+ /* Num coded_transform blocks */
+ printf("\nNUM_CODED_TRANSFORM_BLOCKS(excluding_trans_skip_and_trans_quant_bypass)\n");
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_trans_block[0];
+ ihevcd_print_stat_trans(ps_stat_trans);
+
+ /* Num DC transform blocks */
+ printf("\nNUM_DC_TRANSFORM_BLOCKS(excluding_trans_skip_and_trans_quant_bypass)\n");
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_dc_block[0];
+ ihevcd_print_stat_trans(ps_stat_trans);
+
+ /* Num one coeff transform blocks */
+ printf("\nNUM_ONE_COEFF_TRANSFORM_BLOCKS(excluding_trans_skip_and_trans_quant_bypass)\n");
+ ps_stat_trans = &gs_ihevcd_stat.stat_num_coded_one_coeff_block[0];
+ ihevcd_print_stat_trans(ps_stat_trans);
+
+ /* Last sblk histogram */
+ printf("\nLAST_CODED_SBLK_HISTOGRAM\n");
+ ihevcd_print_sblk_histogram(&gs_ihevcd_stat.stat_last_sblk_pos_histogram, 1);
+
+ /* Num Coded sblks histogram */
+ printf("\nNUM_CODED_SBLK_HISTOGRAM\n");
+ ihevcd_print_sblk_histogram(&gs_ihevcd_stat.stat_num_coded_sblk_histogram, 1);
+
+ /* Num Coded coeff histogram */
+ printf("\nNUM_CODED_COEFF_HISTOGRAM\n");
+ ihevcd_print_coeff_histogram(&gs_ihevcd_stat.stat_num_coded_coeff_histogram, 1);
+}
+
+void ihevcd_update_sblk_histogram(stat_sblk_histogram_t *ps_last_sblk_pos_histogram, TRANSFORM_TYPE e_trans_type, WORD32 last_sblk_x, WORD32 last_sblk_y)
+{
+ switch(e_trans_type)
+ {
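+        /* 4x4 transforms contain a single subblock, so the histogram index is always
+           zero; hence the zero row stride below */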
+ case DST_4x4:
+ ps_last_sblk_pos_histogram->trans_4x4_dst[last_sblk_x + last_sblk_y * 0]++;
+ break;
+ case DCT_4x4:
+ ps_last_sblk_pos_histogram->trans_4x4[last_sblk_x + last_sblk_y * 0]++;
+ break;
+ case DCT_8x8:
+ ps_last_sblk_pos_histogram->trans_8x8[last_sblk_x + last_sblk_y * 2]++;
+ break;
+ case DCT_16x16:
+ ps_last_sblk_pos_histogram->trans_16x16[last_sblk_x + last_sblk_y * 4]++;
+ break;
+ case DCT_32x32:
+ ps_last_sblk_pos_histogram->trans_32x32[last_sblk_x + last_sblk_y * 8]++;
+ break;
+ default:
+ break;
+ }
+}
+
+void ihevcd_update_num_coded_sblk_histogram(stat_sblk_histogram_t *ps_sblk_histogram, TRANSFORM_TYPE e_trans_type, WORD32 num_coded_blks)
+{
+ switch(e_trans_type)
+ {
+ case DST_4x4:
+ ps_sblk_histogram->trans_4x4_dst[num_coded_blks - 1]++;
+ break;
+ case DCT_4x4:
+ ps_sblk_histogram->trans_4x4[num_coded_blks - 1]++;
+ break;
+ case DCT_8x8:
+ ps_sblk_histogram->trans_8x8[num_coded_blks - 1]++;
+ break;
+ case DCT_16x16:
+ ps_sblk_histogram->trans_16x16[num_coded_blks - 1]++;
+ break;
+ case DCT_32x32:
+ ps_sblk_histogram->trans_32x32[num_coded_blks - 1]++;
+ break;
+ default:
+ break;
+ }
+}
+
+void ihevcd_update_num_coded_coeff_histogram(stat_coeff_histogram_t *ps_coeff_histogram, TRANSFORM_TYPE e_trans_type, WORD32 num_coded_blks)
+{
+ switch(e_trans_type)
+ {
+ case DST_4x4:
+ ps_coeff_histogram->trans_4x4_dst[num_coded_blks - 1]++;
+ break;
+ case DCT_4x4:
+ ps_coeff_histogram->trans_4x4[num_coded_blks - 1]++;
+ break;
+ case DCT_8x8:
+ ps_coeff_histogram->trans_8x8[num_coded_blks - 1]++;
+ break;
+ case DCT_16x16:
+ ps_coeff_histogram->trans_16x16[num_coded_blks - 1]++;
+ break;
+ case DCT_32x32:
+ ps_coeff_histogram->trans_32x32[num_coded_blks - 1]++;
+ break;
+ default:
+ break;
+ }
+}
+
+void ihevcd_sblk_pos_update(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass, UWORD32 sblk_x, UWORD32 sblk_y)
+{
+ if(1 == t_skip_or_tq_bypass)
+ return;
+
+ gs_ihevcd_stat.num_coded_sblk++;
+
+ /* Updating the last coded sblk pos */
+ if(gs_ihevcd_stat.last_sblk_pos_y < sblk_y)
+ gs_ihevcd_stat.last_sblk_pos_y = sblk_y;
+
+ if(gs_ihevcd_stat.last_sblk_pos_x < sblk_x)
+ gs_ihevcd_stat.last_sblk_pos_x = sblk_x;
+}
+
+void ihevcd_update_coeff_count()
+{
+ gs_ihevcd_stat.num_coded_coeffs++;
+}
+
+void ihevcd_update_sblk_and_coeff_histogram(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass)
+{
+ if(0 == t_skip_or_tq_bypass)
+ {
+ ihevcd_update_sblk_histogram(&gs_ihevcd_stat.stat_last_sblk_pos_histogram, e_trans_type, gs_ihevcd_stat.last_sblk_pos_x, gs_ihevcd_stat.last_sblk_pos_y);
+ ihevcd_update_num_coded_sblk_histogram(&gs_ihevcd_stat.stat_num_coded_sblk_histogram, e_trans_type, gs_ihevcd_stat.num_coded_sblk);
+ ihevcd_update_num_coded_coeff_histogram(&gs_ihevcd_stat.stat_num_coded_coeff_histogram, e_trans_type, gs_ihevcd_stat.num_coded_coeffs);
+ }
+}
+
+void ihevcd_update_pu_skip_size(pu_t *ps_pu)
+{
+ WORD32 wd, ht;
+
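+    /* b4_wd and b4_ht hold the PU width and height in units of 4 pixels minus 1,
+       so they index the [16][16] size histograms directly */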
+ wd = (ps_pu->b4_wd);
+ ht = (ps_pu->b4_ht);
+
+ gs_ihevcd_stat.stat_pu_skip_size_hist[wd][ht]++;
+}
+void ihevcd_update_pu_size(pu_t *ps_pu)
+{
+ WORD32 wd, ht;
+
+ wd = (ps_pu->b4_wd);
+ ht = (ps_pu->b4_ht);
+ gs_ihevcd_stat.stat_pu_all_size_hist[wd][ht]++;
+ if(ps_pu->b1_intra_flag)
+ {
+ gs_ihevcd_stat.stat_pu_intra_size_hist[wd][ht]++;
+ }
+ else
+ {
+ gs_ihevcd_stat.stat_pu_inter_size_hist[wd][ht]++;
+
+
+
+ if(ps_pu->b1_merge_flag)
+ gs_ihevcd_stat.stat_pu_merge_size_hist[wd][ht]++;
+
+ if(ps_pu->b2_pred_mode == PRED_BI)
+ gs_ihevcd_stat.stat_pu_bipred_size_hist[wd][ht]++;
+
+ switch(ps_pu->b2_pred_mode)
+ {
+ case PRED_L0:
+ if((ps_pu->mv.s_l0_mv.i2_mvx == 0) &&
+ (ps_pu->mv.s_l0_mv.i2_mvy == 0))
+ {
+ gs_ihevcd_stat.stat_pu_zeromv_size_hist[wd][ht]++;
+ }
+
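+            /* MVs are in quarter-pel units, so |mv| < 4 means the MV is within
+               +/- 1 full pel of zero */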
+ if((ABS(ps_pu->mv.s_l0_mv.i2_mvx) < 4) &&
+ (ABS(ps_pu->mv.s_l0_mv.i2_mvy) < 4))
+ {
+ gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[wd][ht]++;
+ }
+
+ break;
+
+ case PRED_L1:
+ if((ps_pu->mv.s_l1_mv.i2_mvx == 0) &&
+ (ps_pu->mv.s_l1_mv.i2_mvy == 0))
+ {
+ gs_ihevcd_stat.stat_pu_zeromv_size_hist[wd][ht]++;
+ }
+
+ if((ABS(ps_pu->mv.s_l1_mv.i2_mvx) < 4) &&
+ (ABS(ps_pu->mv.s_l1_mv.i2_mvy) < 4))
+ {
+ gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[wd][ht]++;
+ }
+ break;
+
+
+ case PRED_BI:
+ if((ps_pu->mv.s_l0_mv.i2_mvx == 0) &&
+ (ps_pu->mv.s_l0_mv.i2_mvy == 0) &&
+ (ps_pu->mv.s_l1_mv.i2_mvx == 0) &&
+ (ps_pu->mv.s_l1_mv.i2_mvy == 0))
+ {
+ gs_ihevcd_stat.stat_pu_zeromv_size_hist[wd][ht]++;
+ }
+ if((ABS(ps_pu->mv.s_l0_mv.i2_mvx) < 4) &&
+ (ABS(ps_pu->mv.s_l0_mv.i2_mvy) < 4) &&
+ (ABS(ps_pu->mv.s_l1_mv.i2_mvx) < 4) &&
+ (ABS(ps_pu->mv.s_l1_mv.i2_mvy) < 4))
+ {
+ gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[wd][ht]++;
+ }
+
+ break;
+
+ }
+ }
+}
+
+
+void ihevcd_print_pu_size_hist(UWORD32 *pu4_buf)
+{
+ WORD32 i, j;
+
+
+ for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+ {
+ for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+ {
+ printf("%12d ", pu4_buf[j]);
+ }
+ pu4_buf += (MAX_CTB_SIZE / MIN_PU_SIZE);
+ printf("\n");
+ }
+}
+
+void ihevcd_print_pu_size_hist_normalized(UWORD32 *pu4_buf)
+{
+ WORD32 i, j;
+ WORD32 sum;
+ UWORD32 *pu4_buf_orig = pu4_buf;
+ sum = 0;
+
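+    /* Weight each bin by its PU area in pixels:
+       width ((j + 1) * 4) times height ((i + 1) * 4) = (i + 1) * (j + 1) * 16 */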
+ for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+ {
+ for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+ {
+ sum += pu4_buf[j] * (i + 1) * (j + 1) * 16;
+ }
+ pu4_buf += (MAX_CTB_SIZE / MIN_PU_SIZE);
+ }
+
+ pu4_buf = pu4_buf_orig;
+ for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+ {
+ for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+ {
+ double num = pu4_buf[j] * (i + 1) * (j + 1) * 16 * 100.0;
+ printf("%6.2f ", num / sum);
+ }
+ pu4_buf += (MAX_CTB_SIZE / MIN_PU_SIZE);
+ printf("\n");
+ }
+}
+
+void ihevcd_print_pu_size_hist_percentage(UWORD32 *pu4_num, UWORD32 *pu4_denom)
+{
+ WORD32 i, j;
+
+
+ for(i = 0; i < (MAX_CTB_SIZE / MIN_PU_SIZE); i++)
+ {
+ for(j = 0; j < (MAX_CTB_SIZE / MIN_PU_SIZE); j++)
+ {
+ double val;
+ val = 0;
+ if(pu4_denom[j])
+ {
+ val = (pu4_num[j] * 100.0) / pu4_denom[j];
+ printf("%6.2f ", val);
+ }
+ else
+ {
+ if(0 == pu4_num[j])
+ printf("%6.2f ", 0.0);
+ else
+ printf("NaN ");
+ }
+ }
+ pu4_num += (MAX_CTB_SIZE / MIN_PU_SIZE);
+ pu4_denom += (MAX_CTB_SIZE / MIN_PU_SIZE);
+ printf("\n");
+ }
+}
+
+void ihevcd_print_pu_statistics()
+{
+
+ printf("\n\nPU Sizes\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Intra\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_intra_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Inter\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_inter_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Skip\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_skip_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Merge\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_merge_size_hist[0][0]);
+
+ printf("\n\nPU Sizes BiPred\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_bipred_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Zero MV\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_zeromv_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Zero MV including subpel MV less than +/- 1 in fullpel units\n\n");
+ ihevcd_print_pu_size_hist(&gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[0][0]);
+
+ printf("\n\nPU Sizes percentage \n\n");
+ ihevcd_print_pu_size_hist_normalized(&gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Intra ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_intra_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Inter ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_inter_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Skip ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_skip_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Merge ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_merge_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes BiPred ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_bipred_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Zero MV ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_zeromv_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+ printf("\n\nPU Sizes Zero MV including subpel MV less than +/- 1 in fullpel units ratio w.r.t all PUs\n\n");
+ ihevcd_print_pu_size_hist_percentage(&gs_ihevcd_stat.stat_pu_zeromvfpel_size_hist[0][0], &gs_ihevcd_stat.stat_pu_all_size_hist[0][0]);
+
+}
+
+void ihevcd_print_statistics()
+{
+ ihevcd_print_transform_statistics();
+ ihevcd_print_pu_statistics();
+}
+#endif
diff --git a/decoder/ihevcd_statistics.h b/decoder/ihevcd_statistics.h
new file mode 100644
index 0000000..58f35d6
--- /dev/null
+++ b/decoder/ihevcd_statistics.h
@@ -0,0 +1,149 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_statistics.h
+*
+* @brief
+* Contains macros and function declarations for generating statistics about the HEVC decoder
+*
+* @author
+* Naveen SR
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_STATISTICS_H_
+#define _IHEVCD_STATISTICS_H_
+
+#include <stdio.h>
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_structs.h"
+
+#include "ihevc_cabac_tables.h"
+#include "ihevcd_defs.h"
+
+#include "ihevcd_structs.h"
+#include "ihevcd_iquant_itrans_recon_ctb.h"
+#include "ihevcd_statistics.h"
+
+#define STATISTICS_ENABLE 0
+
+#if STATISTICS_ENABLE
+
+typedef struct
+{
+ UWORD32 num_4x4_dst;
+ UWORD32 num_4x4;
+ UWORD32 num_8x8;
+ UWORD32 num_16x16;
+ UWORD32 num_32x32;
+ UWORD32 num_64x64;
+}stat_trans_t;
+
+typedef struct
+{
+ /* 4x4 Subblock count */
+ UWORD32 trans_4x4_dst[1];
+ UWORD32 trans_4x4[1];
+ UWORD32 trans_8x8[4];
+ UWORD32 trans_16x16[16];
+ UWORD32 trans_32x32[64];
+}stat_sblk_histogram_t;
+
+typedef struct
+{
+ /* 4x4 Subblock count */
+ UWORD32 trans_4x4_dst[16];
+ UWORD32 trans_4x4[16];
+ UWORD32 trans_8x8[64];
+ UWORD32 trans_16x16[256];
+ UWORD32 trans_32x32[1024];
+}stat_coeff_histogram_t;
+
+typedef struct
+{
+ stat_trans_t stat_num_all_trans_block[2]; /* Y and UV */
+ stat_trans_t stat_num_coded_trans_block[2]; /* Y and UV */
+ stat_trans_t stat_num_coded_dc_block[2]; /* Y and UV */
+ stat_trans_t stat_num_coded_one_coeff_block[2]; /* Y and UV */
+ stat_sblk_histogram_t stat_last_sblk_pos_histogram; /* Y + UV */
+ stat_sblk_histogram_t stat_num_coded_sblk_histogram; /* Y + UV */
+ stat_coeff_histogram_t stat_num_coded_coeff_histogram; /* Y + UV */
+ UWORD32 stat_pu_all_size_hist[16][16]; /* PU Sizes [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_skip_size_hist[16][16]; /* PU sizes for skip [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_inter_size_hist[16][16]; /* PU sizes for inter [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_intra_size_hist[16][16]; /* PU sizes for intra [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_bipred_size_hist[16][16]; /* PU sizes for bipred [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_merge_size_hist[16][16]; /* PU sizes for merge [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_zeromv_size_hist[16][16]; /* PU sizes for Zero MV [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+ UWORD32 stat_pu_zeromvfpel_size_hist[16][16]; /* PU sizes for Zero MV (includes subpel less than +/- 1 full pel units [Width from 4 to 64 in steps of 4] [Height from 4 to 64 in steps of 4]*/
+    UWORD32 last_sblk_pos_x; /* Last coded subblock position of the transform block currently being processed */
+ UWORD32 last_sblk_pos_y;
+ UWORD32 num_coded_sblk;
+ UWORD32 num_coded_coeffs;
+}statistics_t;
+
+void ihevcd_update_num_all_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx);
+void ihevcd_update_num_trans_blocks(TRANSFORM_TYPE e_trans_type, WORD32 c_idx, WORD32 update_type);
+void ihevcd_update_sblk_and_coeff_histogram(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass);
+void ihevcd_sblk_pos_init();
+void ihevcd_sblk_pos_update(TRANSFORM_TYPE e_trans_type, WORD32 t_skip_or_tq_bypass, UWORD32 sblk_x, UWORD32 sblk_y);
+void ihevcd_init_statistics();
+void ihevcd_print_transform_statistics();
+void ihevcd_print_statistics();
+void ihevcd_update_coeff_count();
+void ihevcd_update_pu_size(pu_t *ps_pu);
+void ihevcd_update_pu_skip_size(pu_t *ps_pu);
+#endif //STATISTICS_ENABLE
+
+#if STATISTICS_ENABLE
+#define STATS_INIT() ihevcd_init_statistics();
+#define STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx) ihevcd_update_num_all_trans_blocks(e_trans_type, c_idx);
+#define STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, update_type) ihevcd_update_num_trans_blocks(e_trans_type, c_idx, update_type);
+#define STATS_PRINT() ihevcd_print_statistics();
+#define STATS_INIT_SBLK_AND_COEFF_POS() ihevcd_sblk_pos_init();
+#define STATS_LAST_SBLK_POS_UPDATE(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y) ihevcd_sblk_pos_update(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y);
+#define STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, t_skip_or_tq_bypass) ihevcd_update_sblk_and_coeff_histogram(e_trans_type, t_skip_or_tq_bypass);
+#define STATS_UPDATE_COEFF_COUNT() ihevcd_update_coeff_count();
+#define STATS_UPDATE_PU_SIZE(ps_pu) ihevcd_update_pu_size(ps_pu);
+#define STATS_UPDATE_PU_SKIP_SIZE(ps_pu) ihevcd_update_pu_skip_size(ps_pu);
+#else
+#define STATS_INIT() ;
+#define STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx) ;
+#define STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, update_type) ;
+#define STATS_PRINT() ;
+#define STATS_INIT_SBLK_AND_COEFF_POS() ;
+#define STATS_LAST_SBLK_POS_UPDATE(e_trans_type, t_skip_or_tq_bypass, sblk_x, sblk_y) ;
+#define STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, t_skip_or_tq_bypass) ;
+#define STATS_UPDATE_COEFF_COUNT() ;
+#define STATS_UPDATE_PU_SIZE(ps_pu) ;
+#define STATS_UPDATE_PU_SKIP_SIZE(ps_pu) ;
+#endif
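+/* Usage (illustrative): STATS_INIT(); ... STATS_UPDATE_ALL_TRANS(DCT_8x8, 0); each macro expands
+   to the corresponding function call when STATISTICS_ENABLE is 1 and to an empty statement otherwise */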
+
+#endif /* _IHEVCD_STATISTICS_H_ */
diff --git a/decoder/ihevcd_structs.h b/decoder/ihevcd_structs.h
new file mode 100644
index 0000000..00e9a49
--- /dev/null
+++ b/decoder/ihevcd_structs.h
@@ -0,0 +1,2286 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+
+/**
+ *******************************************************************************
+ * @file
+ * ihevcd_structs.h
+ *
+ * @brief
+ * Structure definitions used in the decoder
+ *
+ * @author
+ * Harish
+ *
+ * @par List of Functions:
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+#ifndef _IHEVCD_STRUCTS_H_
+#define _IHEVCD_STRUCTS_H_
+typedef enum
+{
+ INIT_DONE, HEADER_DONE, FIRST_FRAME_DONE,
+}CODEC_STATE_T;
+
+
+
+typedef struct _codec_t codec_t;
+
+/** Structure to hold format conversion context */
+typedef struct
+{
+ /** Current row for which format conversion should be done */
+ WORD32 i4_cur_row;
+
+ /** Number of rows for which format conversion should be done */
+ WORD32 i4_num_rows;
+}fmt_conv_t;
+
+/**
+ * Bitstream structure
+ */
+typedef struct
+{
+ /**
+ * Bitstream buffer base pointer
+ */
+ UWORD8 *pu1_buf_base;
+
+ /**
+ * Bitstream bit offset in current word. Value between 0 and 31
+ */
+ UWORD32 u4_bit_ofst;
+
+ /**
+ * Current bitstream buffer pointer
+ */
+ UWORD32 *pu4_buf;
+
+ /**
+ * Current word
+ */
+ UWORD32 u4_cur_word;
+
+ /**
+ * Next word
+ */
+ UWORD32 u4_nxt_word;
+
+ /**
+ * Max address for bitstream
+ */
+ UWORD8 *pu1_buf_max;
+}bitstrm_t;
+
+/**
+******************************************************************************
+ * @brief Cabac context for decoder
+******************************************************************************
+ */
+typedef struct cab_ctxt
+{
+ /*********************************************************************/
+ /* CABAC ENGINE related fields */
+ /*********************************************************************/
+ /** cabac interval range R */
+ UWORD32 u4_range;
+
+ /** cabac interval offset O */
+ UWORD32 u4_ofst;
+
+ /*********************************************************************/
+ /* CABAC context models */
+ /*********************************************************************/
+    /** All Context models stored in packed form pState[bits6-1] | MPS[bit0] */
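+    /* e.g. state = model >> 1 and MPS = model & 1 when unpacking a context model byte */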
+ UWORD8 au1_ctxt_models[IHEVC_CAB_CTXT_END];
+
+ /** Context models memorized after decoding 2nd CTB in a row to be used
+ * during entropy sync cases
+ */
+ UWORD8 au1_ctxt_models_sync[IHEVC_CAB_CTXT_END];
+
+}cab_ctxt_t;
+
+typedef enum
+{
+ CMD_PROCESS,
+ CMD_FMTCONV,
+}JOBQ_CMD_T;
+
+/**
+ * Structure to represent a processing job entry
+ */
+typedef struct
+{
+ /**
+ * Command
+ * Currently: PROCESS, FMTCONV are the only two jobs
+ */
+ WORD32 i4_cmd;
+ /**
+ * CTB x of the starting CTB
+ */
+ WORD16 i2_ctb_x;
+
+ /**
+ * CTB y of the starting CTB
+ */
+
+ WORD16 i2_ctb_y;
+
+ /**
+ * Number of CTBs that need to be processed in this job
+ */
+ WORD16 i2_ctb_cnt;
+
+ /**
+ * Slice index for the current CTB
+ */
+ WORD16 i2_slice_idx;
+
+ /**
+ * TU coefficient data offset for the current job
+ */
+ WORD32 i4_tu_coeff_data_ofst;
+#ifdef GPU_BUILD
+ /**
+ * OpenCL Granularity
+ */
+ WORD16 i2_granularity_idx;
+
+ /**
+ * Index to the process context
+ */
+ //WORD16 i2_proc_idx;
+
+ /**
+ * GPU Wait or NOT
+ */
+ WORD16 i2_wait;
+#endif
+}proc_job_t;
+/**
+ * Structure to represent a MV Bank buffer
+ */
+typedef struct
+{
+ /**
+ * Pointer to hold PU index for each CTB in a picture
+ */
+ UWORD32 *pu4_pic_pu_idx;
+
+ /**
+ * Pointer to hold pu_t for each PU in a picture
+ */
+ pu_t *ps_pic_pu;
+
+ /**
+ * Pointer to hold PU map for each CTB in a picture
+ */
+ UWORD8 *pu1_pic_pu_map;
+
+ /**
+ * Pointer to hold the Slice map
+ */
+ UWORD16 *pu1_pic_slice_map;
+
+ /**
+ * Absolute POC for the current MV Bank
+ */
+ WORD32 i4_abs_poc;
+
+ /**
+ * Absolute POCs of reference List 0 for all slices in the frame from which this frame is reconstructed
+ */
+ WORD32 l0_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+
+ /**
+ * Flag to indicate Long Term reference for POCs of reference List 0 for all slices in the frame from which this frame is reconstructed
+ */
+ WORD8 u1_l0_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+
+ /**
+ * Absolute POCs of reference List 1 for all slices in the frame from which this frame is reconstructed
+ */
+ WORD32 l1_collocated_poc[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+ /**
+ * Flag to indicate Long Term reference for POCs of reference List 1 for all slices in the frame from which this frame is reconstructed
+ */
+    WORD8 u1_l1_collocated_poc_lt[MAX_SLICE_SEGMENTS_IN_FRAME][MAX_DPB_SIZE];
+
+}mv_buf_t;
+
+typedef struct
+{
+ /**
+ * Pointer to current PPS
+ */
+ pps_t *ps_pps;
+
+ /**
+ * Pointer to current SPS
+ */
+ sps_t *ps_sps;
+
+ /**
+ * Pointer to current slice header structure
+ */
+ slice_header_t *ps_slice_hdr;
+
+ /**
+ * CTB's x position within a picture in raster scan in CTB units
+ */
+ WORD32 i4_ctb_x;
+
+ /**
+ * CTB's y position within a picture in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_y;
+
+ /**
+ * Current PU structure - set to CTB pu_t pointer at the start of CTB processing and incremented
+ * for every TU
+ */
+ pu_t *ps_pu;
+
+ /**
+ * Pointer to frame level pu_t for the current frame being parsed
+ * where MVs and Intra pred modes will be updated
+ */
+ pu_t *ps_pic_pu;
+
+ /**
+ * Store the current tile's information. This is needed for the computation of mvs.
+ */
+ tile_t *ps_tile;
+
+ /**
+ * Points to an array of PU indices which is used to identify
+ * start index of pu_t in ps_pic_pu and also to identify number of
+ * PUs in the current CTB by subtracting current idx from next CTB's
+ * PU idx
+ */
+ UWORD32 *pu4_pic_pu_idx;
+
+ /** PU Index map per CTB. The indices in this map are w.r.t picture pu array and not
+ * w.r.t CTB pu array.
+ * This will be used during mv prediction and since neighbours will have different CTB pu map
+ * it will be easier if they all have indices w.r.t picture level PU array rather than CTB level
+ * PU array.
+ * pu1_pic_pu_map is map w.r.t CTB's pu_t array
+ */
+ UWORD32 *pu4_pic_pu_idx_map;
+
+ /**
+ * Pointer to pu_map for the current frame being parsed
+ * where MVs and Intra pred modes will be updated
+ */
+ UWORD8 *pu1_pic_pu_map;
+
+ /**
+ * PU count in current CTB
+ */
+ WORD32 i4_ctb_pu_cnt;
+
+ /**
+     * Start index of the current CTB's PUs in the picture level PU array
+ */
+ WORD32 i4_ctb_start_pu_idx;
+
+ /**
+ * Top availability for current CTB level
+ */
+ UWORD8 u1_top_ctb_avail;
+
+ /**
+ * Top right availability for current CTB level
+ */
+ UWORD8 u1_top_rt_ctb_avail;
+ /**
+ * Top left availability for current CTB level
+ */
+ UWORD8 u1_top_lt_ctb_avail;
+ /**
+ * left availability for current CTB level
+ */
+ UWORD8 u1_left_ctb_avail;
+
+}mv_ctxt_t;
+
+typedef struct
+{
+ /**
+ * Pointer to current PPS
+ */
+ pps_t *ps_pps;
+
+ /**
+ * Pointer to current SPS
+ */
+ sps_t *ps_sps;
+
+ /*
+ * Pointer to codec context
+ */
+ codec_t *ps_codec;
+
+ /**
+ * Index of the current Tile being parsed
+ */
+ tile_t *ps_tile;
+
+ /**
+ * Pointer to the current slice header
+ */
+ slice_header_t *ps_slice_hdr;
+
+ /**
+ * TU count in current CTB
+ */
+ WORD32 i4_ctb_tu_cnt;
+
+ /**
+ * CTB's x position within a picture in raster scan in CTB units
+ */
+ WORD32 i4_ctb_x;
+
+ /**
+ * CTB's y position within a picture in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_y;
+
+ /**
+ * CTB's x position within a Tile in raster scan in CTB units
+ */
+ WORD32 i4_ctb_tile_x;
+
+ /**
+ * CTB's y position within a Tile in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_tile_y;
+
+ /**
+ * CTB's x position within a Slice in raster scan in CTB units
+ */
+ WORD32 i4_ctb_slice_x;
+
+ /**
+ * CTB's y position within a Slice in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_slice_y;
+
+    /**
+     * Vertical Boundary strength
+     */
+
+    /* Two bits per edge.
+    Stored in format. BS[15] | BS[14] | .. |BS[0]*/
+ UWORD32 *pu4_pic_vert_bs;
+
+ /**
+ * Horizontal Boundary strength
+ */
+
+ /* Two bits per edge.
+ Stored in format. BS[15] | BS[14] | .. |BS[0]*/
+ UWORD32 *pu4_pic_horz_bs;
+
+ /**
+ * Flags to indicate if QP is constant through out a CTB - 1 bit for each CTB
+ * The bits are packed from LSB to MSB
+ * To get the flag corresponding to CTB with (ctb_x, ctb_y), use
+     * pu1_pic_qp_const_in_ctb[(ctb_x + pic_wd_in_ctb * ctb_y) >> 3] & (1 << ((ctb_x + pic_wd_in_ctb * ctb_y) & 7))
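+     * e.g. ctb_x = 3, ctb_y = 2, pic_wd_in_ctb = 10 gives bit 23: byte index 2, bit mask (1 << 7)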
+ */
+ UWORD8 *pu1_pic_qp_const_in_ctb;
+
+ /**
+ * Qp array stored for each 8x8 pixels
+ */
+ UWORD8 *pu1_pic_qp;
+
+ /**
+ * Current TU structure - set to CTB tu_t pointer at the start of CTB processing and incremented
+ * for every TU
+ */
+ tu_t *ps_tu;
+
+ /**
+ * Points to an array of TU indices which is used to identify
+ * start index of tu_t in ps_pic_tu and also to identify number of
+ * TUs in the current CTB by subtracting current idx from next CTB's
+ * TU idx
+ */
+ UWORD32 *pu4_pic_tu_idx;
+
+ /**
+ * Points to an array of PU indices which is used to identify
+ * start index of pu_t in ps_pic_pu and also to identify number of
+ * PUs in the current CTB by subtracting current idx from next CTB's
+ * PU idx
+ */
+ UWORD32 *pu4_pic_pu_idx;
+
+ /**
+ * Current PU structure - set to CTB pu_t pointer at the start of CTB processing and incremented
+ * for every TU
+ */
+ pu_t *ps_pu;
+
+ /**
+ * Pointer to frame level pu_t for the current frame being parsed
+ * where MVs and Intra pred modes will be updated
+ */
+ pu_t *ps_pic_pu;
+
+ /** PU Index map per CTB. The indices in this map are w.r.t picture pu array and not
+ * w.r.t CTB pu array.
+ * This will be used during mv prediction and since neighbours will have different CTB pu map
+ * it will be easier if they all have indices w.r.t picture level PU array rather than CTB level
+ * PU array.
+ * pu1_pic_pu_map is map w.r.t CTB's pu_t array
+ */
+ UWORD32 *pu4_pic_pu_idx_map;
+
+ /**
+ * Variable to store the next ctb count to compute pu idx
+ */
+ WORD32 i4_next_pu_ctb_cnt;
+
+ /**
+ * Variable to store the next ctb count to compute tu idx
+ */
+ WORD32 i4_next_tu_ctb_cnt;
+ /**
+ * Points to the array of slice indices which is used to identify the slice
+ * to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_slice_idx;
+}bs_ctxt_t;
+
+typedef struct
+{
+ /**
+ * Pointer to current PPS
+ */
+ pps_t *ps_pps;
+
+ /**
+ * Pointer to current SPS
+ */
+ sps_t *ps_sps;
+
+ /*
+ * Pointer to codec context
+ */
+ codec_t *ps_codec;
+
+ /**
+ * Pointer to current slice header structure
+ */
+ slice_header_t *ps_slice_hdr;
+
+ /**
+ * Pointer to the structure that contains BS and QP frame level arrays
+ */
+ bs_ctxt_t s_bs_ctxt;
+
+ /**
+ * CTB's x position within a picture in raster scan in CTB units
+ */
+ WORD32 i4_ctb_x;
+
+ /**
+ * CTB's y position within a picture in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_y;
+
+ /**
+     * Current picture's loop filter flag map at 8x8 level
+ */
+ UWORD8 *pu1_pic_no_loop_filter_flag;
+
+ /**
+ * Current CTB's no_loop_filter_flags
+ * each element corresponds to one row - including the left CTB's last 8x8
+ */
+ UWORD16 au2_ctb_no_loop_filter_flag[9];
+
+ /*
+ * Pointer to 0th luma pixel in current pic
+ */
+ UWORD8 *pu1_cur_pic_luma;
+
+ /*
+ * Pointer to 0th chroma pixel in current pic
+ */
+ UWORD8 *pu1_cur_pic_chroma;
+
+ /* Points to the array of slice indices which is used to identify the slice
+ * to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_slice_idx;
+
+ /* Specifies if the chroma format is yuv420sp_vu */
+ WORD32 is_chroma_yuv420sp_vu;
+
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ /**
+ * Pointer to base slice header structure
+ */
+ slice_header_t *ps_slice_hdr_base;
+#endif
+}deblk_ctxt_t;
+
+typedef struct
+{
+ /**
+ * Pointer to current PPS
+ */
+ pps_t *ps_pps;
+
+ /**
+ * Pointer to current SPS
+ */
+ sps_t *ps_sps;
+
+ /* Pointer to codec context
+ *
+ */
+ codec_t *ps_codec;
+
+ /**
+ * Pointer to base slice header structure
+ */
+ slice_header_t *ps_slice_hdr_base;
+
+ /**
+ * Pointer to current slice header structure
+ */
+ slice_header_t *ps_slice_hdr;
+
+ /**
+ * Pointer to current tile structure
+ */
+ tile_t *ps_tile;
+ /**
+ * CTB's x position within a picture in raster scan in CTB units
+ */
+ WORD32 i4_ctb_x;
+
+ /**
+ * CTB's y position within a picture in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_y;
+
+ /**
+     * Current picture's loop filter flag map at 8x8 level
+ */
+ UWORD8 *pu1_pic_no_loop_filter_flag;
+
+ /*
+ * Pointer to 0th luma pixel in current pic
+ */
+ UWORD8 *pu1_cur_pic_luma;
+
+ /*
+ * Pointer to 0th chroma pixel in current pic
+ */
+ UWORD8 *pu1_cur_pic_chroma;
+
+ /**
+ * Pointer to frame level sao_t for the current frame being parsed
+ */
+ sao_t *ps_pic_sao;
+
+ /**
+ * Temporary buffer needed during SAO processing
+ */
+ UWORD8 *pu1_tmp_buf_luma;
+
+ /**
+ * Temporary buffer needed during SAO processing
+ */
+ UWORD8 *pu1_tmp_buf_chroma;
+
+ /**
+ * Left column of luma pixels - used by SAO
+ */
+ UWORD8 *pu1_sao_src_left_luma;
+
+ /**
+ * Top row of luma pixels - used by SAO
+ */
+ UWORD8 *pu1_sao_src_top_luma;
+
+ /**
+ * Left column of chroma pixels(interleaved) - used by SAO
+ */
+ UWORD8 *pu1_sao_src_left_chroma;
+
+ /**
+ * Top row of chroma pixels(interleaved) - used by SAO
+ */
+ UWORD8 *pu1_sao_src_top_chroma;
+
+ /**
+ * Top-left luma pixel - used by SAO (for the top CTB row)
+ */
+ UWORD8 *pu1_sao_src_luma_top_left_ctb;
+
+ /**
+ * Top-left chroma pixel(interleaved) - used by SAO (for the top CTB row)
+ */
+ UWORD8 *pu1_sao_src_chroma_top_left_ctb;
+
+ /**
+ * Top-left luma pixel - used by SAO (for the current CTB row)
+ */
+ UWORD8 *pu1_sao_src_top_left_luma_curr_ctb;
+
+ /**
+ * Top-left chroma pixel(interleaved) - used by SAO (for the current CTB row)
+ */
+ UWORD8 *pu1_sao_src_top_left_chroma_curr_ctb;
+
+ /**
+ * Top-right luma pixel - used by SAO (for the top CTB row)
+ */
+ UWORD8 *pu1_sao_src_top_left_luma_top_right;
+
+ /**
+ * Top-right chroma pixel(interleaved) - used by SAO (for the top CTB row)
+ */
+ UWORD8 *pu1_sao_src_top_left_chroma_top_right;
+
+ /**
+ * Bottom-left luma pixel - used by SAO
+ */
+ UWORD8 u1_sao_src_top_left_luma_bot_left;
+ /**
+ * Pointer to array that stores bottom left luma pixel per row(interleaved) - used by SAO
+ */
+ UWORD8 *pu1_sao_src_top_left_luma_bot_left;
+
+ /**
+ * Bottom left chroma pixel(interleaved) - used by SAO
+ */
+ UWORD8 au1_sao_src_top_left_chroma_bot_left[2];
+ /**
+ * Pointer to array that stores bottom left chroma pixel per row(interleaved) - used by SAO
+ */
+ UWORD8 *pu1_sao_src_top_left_chroma_bot_left;
+
+ /*
+ * Slice counter in a picture.
+ */
+ UWORD32 i4_cur_slice_idx;
+ /**
+ * Points to the array of slice indices which is used to identify the slice
+ * to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_slice_idx;
+ /**
+     * Points to the array of tile indices which is used to identify the tile
+     * to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_tile_idx;
+
+ /* Specifies if the chroma format is yuv420sp_vu */
+ WORD32 is_chroma_yuv420sp_vu;
+
+}sao_ctxt_t;
+
+typedef struct
+{
+ /** Log2 CU's size */
+ WORD32 i4_log2_cb_size;
+
+ /** CU's x position */
+ WORD32 i4_pos_x;
+
+ /** CU's y position */
+ WORD32 i4_pos_y;
+ /**
+ * Transquant Bypass enable flag at CU level - To be replicated at TU level
+ */
+ WORD32 i4_cu_transquant_bypass;
+ /**
+ * Prediction mode
+ */
+ WORD32 i4_pred_mode;
+
+ /**
+ * Partition mode
+ */
+ WORD32 i4_part_mode;
+
+ /**
+ * Intra luma pred mode for current CU. In case of PART2Nx2N
+ * the first value is replicated to avoid checks later
+ */
+ WORD32 ai4_intra_luma_pred_mode[4];
+
+ /**
+ * Previous intra luma pred flag used for intra pred mode computation
+ */
+ WORD32 ai4_prev_intra_luma_pred_flag[4];
+
+ /**
+ * mpm index used in intra prediction mode computation
+ */
+ WORD32 ai4_mpm_idx[4];
+ /**
+ * Remaining intra pred mode
+ */
+ WORD32 ai4_rem_intra_luma_pred_mode[4];
+ /**
+ * Chroma pred mode index to be used to compute intra pred mode for chroma
+ */
+ WORD32 i4_intra_chroma_pred_mode_idx;
+ /**
+ * Maximum transform depth
+ */
+ WORD32 i4_max_trafo_depth;
+
+ /**
+ * Luma CBF for current TU
+ */
+ UWORD8 i1_cbf_luma;
+
+ /**
+ * Cb CBF
+ */
+ UWORD8 ai1_cbf_cb[MAX_TRAFO_DEPTH];
+
+ /**
+ * Cr CBF
+ */
+ UWORD8 ai1_cbf_cr[MAX_TRAFO_DEPTH];
+
+ /**
+ * Intra split flag
+ */
+ WORD32 i4_intra_split_flag;
+
+ /**
+ * Current QP
+ */
+ WORD32 i4_qp;
+
+ /**
+ * Number of TUs in CU parsed before a change in QP is signaled
+ */
+ WORD32 i4_tu_cnt;
+
+ /**
+ * Cu QP delta
+ */
+ WORD32 i4_cu_qp_delta;
+
+}parse_cu_t;
+/**
+ * Structure containing a few common state variables such as CTB positions, current SPS and PPS ids etc. which are to be
+ * used in the parsing thread. Keeping these in a separate structure explicitly signals that they are
+ * specific to the parsing thread's context and that other threads should not update these elements
+ */
+typedef struct
+{
+ /**
+ * CTB's x position within a picture in raster scan in CTB units
+ */
+ WORD32 i4_ctb_x;
+
+ /**
+ * CTB's y position within a picture in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_y;
+
+ /**
+ * CTB's x position within a Tile in raster scan in CTB units
+ */
+ WORD32 i4_ctb_tile_x;
+
+ /**
+ * CTB's y position within a Tile in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_tile_y;
+
+ /**
+ * CTB's x position within a Slice in raster scan in CTB units
+ */
+ WORD32 i4_ctb_slice_x;
+
+ /**
+ * CTB's y position within a Slice in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_slice_y;
+
+ /**
+ * Index of the current Tile being parsed
+ */
+ tile_t *ps_tile;
+
+ /**
+ * Current slice idx - Used in multi-core cases to ensure slice header is
+ * preserved till the last CB of the slice is decoded
+ */
+ WORD32 i4_cur_slice_idx;
+ /**
+     * Current independent slice idx - Used in multi-core cases to ensure slice header is
+ * preserved till the last CB of the slice is decoded
+ */
+ WORD32 i4_cur_independent_slice_idx;
+
+ /**
+     * Current tile idx - Used in multi-core cases to ensure the tile information is
+     * preserved till the last CB of the tile is decoded
+ */
+ WORD32 i4_cur_tile_idx;
+
+ /**
+ * Pointer to current PPS
+ */
+ pps_t *ps_pps;
+
+ /**
+ * Pointer to current SPS
+ */
+ sps_t *ps_sps;
+
+ /**
+ * Signal that pic_init is called first time
+ */
+ WORD32 i4_first_pic_init;
+
+ /**
+ * Flag to indicate if CU QP delta is coded.
+ * By default it is set to 0 at the beginning of coding quad tree
+ */
+ WORD32 i4_is_cu_qp_delta_coded;
+
+ /**
+ * CU Qp delta
+ * By default it is set to 0 at the beginning of coding quad tree
+ */
+ WORD32 i4_cu_qp_delta;
+
+ /**
+ * Bitstream structure
+ */
+ bitstrm_t s_bitstrm;
+
+ /**
+ * Pointer frame level TU subblock coeff data
+ */
+ void *pv_pic_tu_coeff_data;
+
+ /**
+ * Pointer to TU subblock coeff data and number of coded subblocks and scan idx
+ * Incremented each time a coded subblock is parsed
+ *
+ */
+ void *pv_tu_coeff_data;
+
+ /**
+ * Current TU structure - set to CTB tu_t pointer at the start of CTB parsing and incremented
+ * for every TU
+ */
+ tu_t *ps_tu;
+
+ /**
+ * Current ctb's TU map
+ */
+ UWORD8 *pu1_tu_map;
+
+ /**
+ * Current PU structure - set to CTB pu_t pointer at the start of CTB parsing and incremented
+ * for every TU
+ */
+ pu_t *ps_pu;
+
+ /**
+ * Points to the array of slice indices which is used to identify the independent slice
+ * to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_slice_idx;
+
+ /**
+ * Current PU index in a frame
+ */
+ WORD32 i4_pic_pu_idx;
+
+ /**
+ * Current TU index in a frame
+ */
+ WORD32 i4_pic_tu_idx;
+
+ /**
+ * Current PU structure - set to CTB pu_map pointer at the start of CTB parsing
+ */
+ UWORD8 *pu1_pu_map;
+
+ /**
+ * Current QP
+ */
+ WORD32 u4_qp;
+
+ /**
+ * Current Group's QP
+ */
+ WORD32 u4_qpg;
+
+ /**
+ * Number of PCM blocks in current CTB - Needed only during parsing
+ * If needed during recon then move it to ctb_t
+ */
+ WORD32 i4_ctb_num_pcm_blks;
+
+ /**
+ * PCM flag for the current CU
+ */
+ WORD32 i4_cu_pcm_flag;
+
+ /**
+ * CU related information to be used to populate tu_t and pu_t during
+ * pred unit and transform tree parsing.
+ */
+ parse_cu_t s_cu;
+
+ /**
+ * Pointer to pu_map for the current frame being parsed
+ */
+ UWORD8 *pu1_pic_pu_map;
+
+ /**
+ * Pointer to frame level pu_t for the current frame being parsed
+ * where MVs and Intra pred modes will be updated
+ */
+ pu_t *ps_pic_pu;
+
+ /**
+ * Pointer to tu_map for the current frame being parsed
+ */
+ UWORD8 *pu1_pic_tu_map;
+
+ /**
+ * Pointer to frame level tu_t for the current frame being parsed
+ * where transform unit related info will be updated
+ */
+ tu_t *ps_pic_tu;
+
+ /**
+ * Points to an array of TU indices which is used to identify
+ * start index of tu_t in ps_pic_tu and also to identify number of
+ * TUs in the current CTB by subtracting current idx from next CTB's
+ * TU idx
+ */
+ UWORD32 *pu4_pic_tu_idx;
+
+ /**
+ * Points to an array of PU indices which is used to identify
+ * start index of pu_t in ps_pic_pu and also to identify number of
+ * PUs in the current CTB by subtracting current idx from next CTB's
+ * PU idx
+ */
+ UWORD32 *pu4_pic_pu_idx;
+
+
+ /**
+     * Current picture's intra mode map at 8x8 level
+ */
+ UWORD8 *pu1_pic_intra_flag;
+
+ /**
+     * Current picture's loop filter flag map at 8x8 level
+ */
+ UWORD8 *pu1_pic_no_loop_filter_flag;
+
+ /**
+     * Array to hold one row (top) of skip_flag stored at (8x8) level
+ * 1 bit per (8x8)
+ * read and written as a UWORD32
+ * LSB gives skip_flag for 0th 8x8 and MSB gives skip_flag for 31st 8x8 and so on
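+     * e.g. flag for the n-th 8x8 in the row: (pu4_skip_cu_top[n >> 5] >> (n & 31)) & 1 (illustrative)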
+ * This is independent of CTB size or minCU size
+ * Packed format requires extra calculations in extracting required bits but makes it easier
+ * to store skip data for larger sizes such as 32 x 32 where 4 bits need to be set instead of
+ * 4 bytes or for 64 x 64 where 8 bits need to be set instead of 8 bytes.
+ */
+ UWORD32 *pu4_skip_cu_top;
+
+ /**
+     * Array to hold one 64 pixel column (left) of skip_flag stored at (8x8) level
+ * 1 bit per (8x8)
+ * read and written as a UWORD32
+ * LSB gives skip_flag for 0th 8x8 and MSB gives skip for 31st 8x8 and so on
+ * This is independent of CTB size and allocated to store data for 64 pixels, of
+ * this only first ctb_size number of bits (starting from MSB) will have valid data
+ * This is also independent of min CU size and data is stored at 8x8 level.
+ * Since only 8 bits are needed to represent left 64 pixels at 8x8 level, this is not an array
+ */
+ UWORD32 u4_skip_cu_left;
+
+ /**
+ * Array to hold one row (top) of coding_tree_depth stored at (8x8) level
+ * 2 bits per (8x8) pixels
+ * read and written as a UWORD32
+ * 2 LSBits give coding_tree_depth for 0th 8x8 and 2 MSBits give coding_tree_depth for 15th 8x8 and so on
+ * This is independent of CTB size or minCU size
+ */
+ UWORD32 *pu4_ct_depth_top;
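+
+ /* Illustrative (hypothetical) extraction from the packed depth map above:
+  * the coding tree depth of the n-th 8x8 block in the row is
+  * (pu4_ct_depth_top[n >> 4] >> (2 * (n & 15))) & 3
+  */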
+
+ /**
+ * Variable to hold one 64 pixel column (left) of coding_tree_depth stored at (8x8) level
+ * 2 bits per (8x8) pixels
+ * read and written as a UWORD32
+ * 2 LSBits give coding_tree_depth for 0th 8x8 and 2 MSBits give coding_tree_depth for 15th 8x8 and so on
+ * This is independent of CTB size and allocated to store data for 64 pixels, of
+ * this only the first (ctb_size / 8) * 2 bits (starting from MSB) will have valid data
+ * This is also independent of min CU size and data is stored at 8x8 level.
+ * Since only 16 bits are needed to represent left 64 pixels at 8x8 level, this is not an array
+ */
+ UWORD32 u4_ct_depth_left;
+
+ /**
+ * Array to hold top (one row) luma_intra_pred_mode stored at (4x4) level for a CTB
+ * 8 bits per (4x4) pixels
+ * read and written as a UWORD8
+ * This is independent of CTB size and minCU size; it is allocated to store data for 64 pixels
+ * Note this data is used only within a CTB; there are no inter CTB dependencies for this
+ */
+ UWORD8 *pu1_luma_intra_pred_mode_top;
+
+ /**
+ * Array to hold left (one column) luma_intra_pred_mode stored at (4x4) level for a CTB
+ * 8 bits per (4x4) pixels
+ * read and written as a UWORD8
+ * This is independent of CTB size and allocated to store data for 64 pixels
+ * This is also independent of min CU size and data is stored at 4x4 level.
+ * This is used for prediction of next CTB within a row in a slice or tile
+ */
+ UWORD8 *pu1_luma_intra_pred_mode_left;
+
+
+ /**
+ * Pointer to base of Video parameter set structure array
+ */
+ vps_t *ps_vps_base;
+
+ /**
+ * Pointer to base of Sequence parameter set structure array
+ */
+ sps_t *ps_sps_base;
+
+ /**
+ * Pointer to base of Picture parameter set structure array
+ */
+ pps_t *ps_pps_base;
+
+ /**
+ * Pointer to base of slice header structure array
+ */
+ slice_header_t *ps_slice_hdr_base;
+
+ /**
+ * Pointer to current slice header structure
+ */
+ slice_header_t *ps_slice_hdr;
+
+
+ /**
+ * Error code during parse stage
+ */
+ WORD32 i4_error_code;
+
+ /**
+ * Void pointer to process job context
+ */
+ void *pv_proc_jobq;
+
+ /* Cabac context */
+ cab_ctxt_t s_cabac;
+
+ /* Current Coding tree depth */
+ WORD32 i4_ct_depth;
+
+ /** Flag to signal end of frame */
+ WORD32 i4_end_of_frame;
+
+ /**
+ * Index of the next CTB parsed
+ */
+ WORD32 i4_next_ctb_indx;
+
+ /**
+ * Pointer to the structure that contains BS and QP frame level arrays
+ */
+ bs_ctxt_t s_bs_ctxt;
+
+ /**
+ * Pointer to the structure that contains deblock context
+ */
+ deblk_ctxt_t s_deblk_ctxt;
+
+ /**
+ * Pointer to the structure that contains sao context
+ */
+ sao_ctxt_t s_sao_ctxt;
+
+ /**
+ * QP Array for the current CTB
+ * Used in QP prediction
+ */
+ WORD8 ai1_8x8_cu_qp[MAX_CU_IN_CTB];
+
+
+ /**
+ * Pointer to frame level sao_t for the current frame being parsed
+ */
+ sao_t *ps_pic_sao;
+
+ /**
+ * Abs POC count of the frame
+ */
+ WORD32 i4_abs_pic_order_cnt;
+
+ /**
+ * Pointer points to mv_buffer of current frame
+ */
+ mv_buf_t *ps_cur_mv_buf;
+
+ /**
+ * Variable to store the next ctb count to compute pu idx
+ */
+ WORD32 i4_next_pu_ctb_cnt;
+
+ /**
+ * Variable to store the next ctb count to compute tu idx
+ */
+ WORD32 i4_next_tu_ctb_cnt;
+
+
+}parse_ctxt_t;
+
+/**
+ * Pixel processing thread context
+ */
+
+typedef struct
+{
+ /**
+ * Pointer to codec context
+ */
+ codec_t *ps_codec;
+
+ /**
+ * CTB's x position within a picture in raster scan in CTB units
+ */
+ WORD32 i4_ctb_x;
+
+ /**
+ * CTB's y position within a picture in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_y;
+
+ /**
+ * CTB's x position within a Tile in raster scan in CTB units
+ */
+ WORD32 i4_ctb_tile_x;
+
+ /**
+ * CTB's y position within a Tile in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_tile_y;
+
+ /**
+ * CTB's x position within a Slice in raster scan in CTB units
+ */
+ WORD32 i4_ctb_slice_x;
+
+ /**
+ * CTB's y position within a Slice in raster scan in CTB units
+ */
+
+ WORD32 i4_ctb_slice_y;
+
+ /**
+ * Current tile being processed
+ */
+ tile_t *ps_tile;
+
+ /**
+ * Current slice idx - Used in multi-core cases to store slice index for
+ * each ctb for sao filtering.
+ */
+ WORD32 i4_cur_slice_idx;
+
+ /**
+ * Current tile idx - Used in multi-core cases to store tile index for
+ * each ctb for sao filtering.
+ */
+ WORD32 i4_cur_tile_idx;
+ /**
+ * Pointer to current PPS
+ */
+ pps_t *ps_pps;
+
+ /**
+ * Pointer to current SPS
+ */
+ sps_t *ps_sps;
+
+ /**
+ * Pointer to current slice header structure
+ */
+ slice_header_t *ps_slice_hdr;
+
+ /**
+ * Error code during parse stage
+ */
+ WORD32 i4_error_code;
+
+ /**
+ * Signal that pic_init is called first time
+ */
+ WORD32 i4_first_pic_init;
+
+ /**
+ * Pointer to frame level TU subblock coeff data
+ */
+ void *pv_pic_tu_coeff_data;
+
+ /**
+ * Pointer to TU subblock coeff data and number of subblocks and scan idx
+ * Incremented each time a coded subblock is processed
+ *
+ */
+ void *pv_tu_coeff_data;
+
+ /**
+ * Current TU structure - set to CTB tu_t pointer at the start of CTB processing and incremented
+ * for every TU
+ */
+ tu_t *ps_tu;
+
+ /**
+ * Current ctb's TU map
+ */
+ UWORD8 *pu1_tu_map;
+
+ /**
+ * Current PU structure - set to CTB pu_t pointer at the start of CTB processing and incremented
+ * for every PU
+ */
+ pu_t *ps_pu;
+
+ /**
+ * Points to an array of TU indices which is used to identify
+ * start index of tu_t in ps_pic_tu and also to identify number of
+ * TUs in the current CTB by subtracting current idx from next CTB's
+ * TU idx
+ */
+ UWORD32 *pu4_pic_tu_idx;
+
+ /**
+ * Points to an array of PU indices which is used to identify
+ * start index of pu_t in ps_pic_pu and also to identify number of
+ * PUs in the current CTB by subtracting current idx from next CTB's
+ * PU idx
+ */
+ UWORD32 *pu4_pic_pu_idx;
+
+ /**
+ * Pointer to tu_map for the current frame being parsed
+ */
+ UWORD8 *pu1_pic_tu_map;
+
+ /**
+ * Pointer to pu_map for the current frame being parsed
+ * where MVs and Intra pred modes will be updated
+ */
+ UWORD8 *pu1_pic_pu_map;
+
+ /**
+ * Pointer to frame level pu_t for the current frame being parsed
+ * where MVs and Intra pred modes will be updated
+ */
+ pu_t *ps_pic_pu;
+
+ /** PU Index map per CTB. The indices in this map are w.r.t picture pu array and not
+ * w.r.t CTB pu array.
+ * This is used during MV prediction; since neighbours will have different CTB pu maps,
+ * it is easier if they all have indices w.r.t the picture level PU array rather than the
+ * CTB level PU array.
+ * pu1_pic_pu_map, in contrast, is a map w.r.t the CTB's pu_t array
+ */
+ UWORD32 *pu4_pic_pu_idx_map;
+
+ /**
+ * PU Index of top 4x4 neighbors stored for an entire row
+ */
+ UWORD32 *pu4_pic_pu_idx_top;
+
+ /**
+ * PU Index of left 4x4 neighbors stored for 64 pixels
+ */
+ UWORD32 *pu4_pic_pu_idx_left;
+
+ /**
+ * Holds top left PU index at CTB level - top left gets overwritten
+ * by left CTB while updating top array. Before updating top at CTB
+ * level required top-left index is backed up in the following
+ */
+ UWORD32 u4_ctb_top_left_pu_idx;
+
+ /**
+ * Pointer to frame level tu_t for the current frame being parsed
+ * where transform unit related info will be updated
+ */
+ tu_t *ps_pic_tu;
+
+
+ /**
+ * Current CTB's PU map - set to CTB pu_map pointer at the start of CTB processing
+ */
+ UWORD8 *pu1_pu_map;
+
+ /** Current MV Bank's buffer ID */
+ WORD32 i4_cur_mv_bank_buf_id;
+
+ /**
+ * Current picture's intra mode map at 8x8 level
+ */
+ UWORD8 *pu1_pic_intra_flag;
+
+ /**
+ * Current picture's loop filter flag map at 8x8 level
+ */
+ UWORD8 *pu1_pic_no_loop_filter_flag;
+
+ /**
+ * Void pointer to process job context
+ */
+
+ void *pv_proc_jobq;
+
+ /**
+ * Number of CTBs to be processed in the current Job
+ */
+ WORD32 i4_ctb_cnt;
+ /**
+ * ID for the current context - Used for debugging
+ */
+ WORD32 i4_id;
+
+ /**
+ * Flag to indicate if parsing status has to be checked
+ * Needed when parsing and processing are done in different threads
+ */
+ WORD32 i4_check_parse_status;
+
+ /**
+ * Flag to indicate if processing status of top row CTBs has to be checked
+ * Needed when processing of different rows is done in different threads
+ */
+ WORD32 i4_check_proc_status;
+
+ /**
+ * Holds Intra dequantization matrices
+ */
+ WORD16 *api2_dequant_intra_matrix[4];
+
+ /**
+ * Holds Inter dequantization matrices
+ */
+ WORD16 *api2_dequant_inter_matrix[4];
+
+
+ /**
+ * Temporary buffer 1 - Used as a scratch in inter_pred_ctb()
+ */
+ WORD16 *pi2_inter_pred_tmp_buf1;
+
+ /**
+ * Temporary buffer 2 - Used as a scratch in inter_pred_ctb()
+ */
+ WORD16 *pi2_inter_pred_tmp_buf2;
+
+ /**
+ * Temporary buffer 3 - Used as a scratch in inter_pred_ctb()
+ */
+ WORD16 *pi2_inter_pred_tmp_buf3;
+
+ /**
+ * The above temporary buffers' stride
+ */
+ WORD32 i4_inter_pred_tmp_buf_strd;
+ /**
+ * Picture stride
+ * Used as prediction stride, destination stride while computing inverse transform
+ */
+ WORD32 i4_pic_strd;
+
+ /**
+ * Picture qp offset for U
+ */
+ WORD8 i1_pic_cb_qp_offset;
+
+ /**
+ * Slice qp offset for U
+ */
+ WORD32 i1_slice_cb_qp_offset;
+
+ /**
+ * Picture qp offset for V
+ */
+ WORD8 i1_pic_cr_qp_offset;
+
+ /**
+ * Slice qp offset for V
+ */
+ WORD32 i1_slice_cr_qp_offset;
+
+ /** Pointer to current picture buffer structure */
+ pic_buf_t *ps_cur_pic;
+
+ /** Current pic_buf's picture buffer id */
+ WORD32 i4_cur_pic_buf_id;
+
+ /** Pointer to 0th luma pixel in current pic */
+ UWORD8 *pu1_cur_pic_luma;
+
+ /** Pointer to 0th chroma pixel in current pic */
+ UWORD8 *pu1_cur_pic_chroma;
+
+ /** Intermediate buffer to be used during inverse transform */
+ WORD16 *pi2_itrans_intrmd_buf;
+
+ /** Buffer to hold output of inverse scan */
+ WORD16 *pi2_invscan_out;
+
+ /**
+ * Top availability for current CTB level
+ */
+ UWORD8 u1_top_ctb_avail;
+
+ /**
+ * Top right availability for current CTB level
+ */
+ UWORD8 u1_top_rt_ctb_avail;
+ /**
+ * Top left availability for current CTB level
+ */
+ UWORD8 u1_top_lt_ctb_avail;
+ /**
+ * left availability for current CTB level
+ */
+ UWORD8 u1_left_ctb_avail;
+ /**
+ * TU count in current CTB
+ */
+ WORD32 i4_ctb_tu_cnt;
+
+ /**
+ * Recon pointer to current CTB luma
+ */
+ UWORD8 *pu1_cur_ctb_luma;
+ /**
+ * Recon pointer to current CTB chroma
+ */
+ UWORD8 *pu1_cur_ctb_chroma;
+
+ /**
+ * PU count in current CTB
+ */
+ WORD32 i4_ctb_pu_cnt;
+
+ /**
+ * Index of the first PU of the current CTB in the frame level PU array
+ */
+ WORD32 i4_ctb_start_pu_idx;
+
+ /* Pointer to a structure describing output display buffer */
+ ivd_out_bufdesc_t *ps_out_buffer;
+
+ /** Flag to indicate if ps_proc was initialized at least once in a frame.
+ * This is needed to handle cases where a core starts to handle format conversion jobs directly
+ */
+ WORD32 i4_init_done;
+
+ /**
+ * Pointer to the structure that contains BS and QP frame level arrays
+ */
+ bs_ctxt_t s_bs_ctxt;
+
+ /**
+ * Pointer to the structure that contains deblock context
+ */
+ deblk_ctxt_t s_deblk_ctxt;
+
+ /**
+ * Pointer to the structure that contains sao context
+ */
+ sao_ctxt_t s_sao_ctxt;
+
+ /**
+ * Points to the array of slice indices which is used to identify the independent
+ * slice to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_slice_idx;
+
+ /**
+ * Points to the array of tile indices which is used to identify the tile
+ * to which each CTB in a frame belongs.
+ */
+ UWORD16 *pu1_tile_idx;
+ /**
+ * Variable to store the next ctb count to compute pu idx
+ */
+ WORD32 i4_next_pu_ctb_cnt;
+
+ /**
+ * Variable to store the next ctb count to compute tu idx
+ */
+ WORD32 i4_next_tu_ctb_cnt;
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ /** Process status: one byte per CTB */
+ UWORD8 *pu1_proc_map;
+#endif
+#ifdef GPU_BUILD
+ UWORD32 u4_gpu_inter_flag;
+#endif
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ /**
+ * Pointer to base slice header structure
+ */
+ slice_header_t *ps_slice_hdr_base;
+#endif
+ /**
+ * Number of ctb's to process in one loop
+ */
+ WORD32 i4_nctb;
+}process_ctxt_t;
+#ifdef GPU_BUILD
+typedef struct
+{
+ /** Pointer to private GPU memory */
+ void *pv_gpu_priv;
+
+ /**
+ * Array that contains the number of CTBs in each grain of the frame.
+ * Right now the maximum number of grains in a frame is hardcoded to 16.
+ */
+ WORD32 ai4_ctbs_in_grain[16];
+
+ /**
+ * Array that contains the height of each grain of the frame in CTBs.
+ * Right now the maximum number of grains in a frame is hardcoded to 16.
+ */
+ WORD32 ai4_grain_ht_in_ctb[16];
+
+ /**
+ * Array that contains the Y position of each grain in the current
+ * frame in CTB units
+ */
+ WORD32 ai4_grain_pos_y[16];
+
+ /**
+ * Variables to store the maximum extent of motion vectors for the current grain.
+ */
+ //WORD32 i4_max_pu_y;
+ //WORD32 i4_max_pu_x;
+
+ /**
+ * Parameter that holds current grain index.
+ */
+ WORD32 i4_curr_grain_idx;
+
+ /**
+ * Array to store coefficient offsets for each CTB row.
+ * Currently allocated for a frame width of 4096 (CTB size 16 * 256 entries).
+ */
+ WORD32 ai4_tu_coeff_data_ofst[256];
+
+ /**
+ * Array to store the slice id at the beginning of each CTB row.
+ * Currently allocated for a frame width of 4096 (CTB size 16 * 256 entries).
+ */
+ WORD32 ai4_cur_slice_idx[256];
+
+ /**
+ * Variable to keep track of the number of CTBs parsed in the current frame grain.
+ */
+ WORD32 i4_curr_grain_ctb_cnt;
+}gpu_ctxt_t;
+#endif
+
+typedef void (*pf_inter_pred)(void *,
+ void *,
+ WORD32,
+ WORD32,
+ WORD8 *,
+ WORD32,
+ WORD32);
+
+
+typedef void (*pf_intra_pred)(UWORD8 *pu1_ref,
+ WORD32 src_strd,
+ UWORD8 *pu1_dst,
+ WORD32 dst_strd,
+ WORD32 nt,
+ WORD32 mode);
+
+typedef void (*pf_itrans_recon)(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows);
+
+typedef void (*pf_recon)(WORD16 *pi2_src,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols);
+
+typedef void (*pf_itrans_recon_dc)(UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 log2_trans_size,
+ WORD16 i2_coeff_value);
+
+
+typedef void (*pf_sao_luma)(UWORD8 *,
+ WORD32,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ WORD8 *,
+ WORD32,
+ WORD32);
+
+typedef void (*pf_sao_chroma)(UWORD8 *,
+ WORD32,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ UWORD8 *,
+ WORD8 *,
+ WORD8 *,
+ WORD32,
+ WORD32);
+
+/**
+ * Codec context
+ */
+
+struct _codec_t
+{
+ /**
+ * Max width the codec can support
+ */
+ WORD32 i4_max_wd;
+
+ /**
+ * Max height the codec can support
+ */
+ WORD32 i4_max_ht;
+
+ /**
+ * Width : pic_width_in_luma_samples
+ */
+ WORD32 i4_wd;
+
+ /**
+ * Height : pic_height_in_luma_samples
+ */
+ WORD32 i4_ht;
+
+ /**
+ * Display width after cropping
+ */
+ WORD32 i4_disp_wd;
+
+ /**
+ * Display height after cropping
+ */
+ WORD32 i4_disp_ht;
+
+ /**
+ * Display stride
+ */
+ WORD32 i4_disp_strd;
+
+ /**
+ * Stride of reference buffers.
+ * For shared mode even display buffer will use the same stride
+ */
+ WORD32 i4_strd;
+
+ /**
+ * Level specified during init
+ */
+ WORD32 i4_init_level;
+
+ /**
+ * number of reference frames specified during init
+ */
+ WORD32 i4_init_num_ref;
+
+ /**
+ * number of reorder frames specified during init
+ */
+ WORD32 i4_init_num_reorder;
+
+ /**
+ * Number of extra display buffers allocated by application
+ */
+ WORD32 i4_init_num_extra_disp_buf;
+
+ /**
+ * Number of cores to be used
+ */
+ WORD32 i4_num_cores;
+
+ /**
+ * RASL output flag
+ */
+ WORD32 i4_rasl_output_flag;
+
+ /**
+ * Pictures that are degraded
+ * 0 : No degrade
+ * 1 : Only on non-reference frames
+ * 2 : Use interval specified by u4_nondegrade_interval
+ * 3 : All non-key frames
+ * 4 : All frames
+ */
+ WORD32 i4_degrade_pics;
+
+ /**
+ * Interval for pictures which are completely decoded without any degradation
+ */
+ WORD32 i4_nondegrade_interval;
+
+ /**
+ * bit position (lsb is zero): Type of degradation
+ * 0 : Disable SAO
+ * 1 : Disable deblocking
+ * 2 : Faster inter prediction filters
+ * 3 : Fastest inter prediction filters
+ */
+ WORD32 i4_degrade_type;
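+
+ /* Illustrative (assumed) combinations of the bits above: i4_degrade_type = 1
+  * disables only SAO, 3 disables both SAO and deblocking, and 7 additionally
+  * selects the faster inter prediction filters
+  */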
+
+ /** Degrade pic count - used to maintain the interval between non-degraded pics
+ *
+ */
+ WORD32 i4_degrade_pic_cnt;
+
+ /**
+ * Total number of display buffers to be used
+ * In case of shared mode, this will be number of reference frames
+ */
+ WORD32 i4_num_disp_bufs;
+
+ /**
+ * Flag to enable shared display buffer mode
+ */
+ WORD32 i4_share_disp_buf;
+
+ /**
+ * Chroma format of display buffers.
+ * In shared mode only 420SP_UV and 420SP_VU are supported
+ */
+ IV_COLOR_FORMAT_T e_chroma_fmt;
+
+ /**
+ * Chroma format of reference buffers.
+ * In non-shared mode it will be 420SP_UV
+ * In shared mode only 420SP_UV and 420SP_VU are supported
+ */
+ IV_COLOR_FORMAT_T e_ref_chroma_fmt;
+
+ /**
+ * Frame skip mode
+ */
+ IVD_FRAME_SKIP_MODE_T e_pic_skip_mode;
+
+ /**
+ * Display or decode order dump of output
+ */
+ IVD_DISPLAY_FRAME_OUT_MODE_T e_pic_out_order;
+
+ /**
+ * Coding type of the picture that is decoded
+ */
+ IV_PICTURE_CODING_TYPE_T e_dec_pic_type;
+
+ /**
+ * Flag to signal if a frame was decoded in this call
+ */
+ WORD32 i4_pic_decoded;
+
+ /**
+ * Flag to signal if picture data is present in the current input bitstream
+ */
+ WORD32 i4_pic_present;
+
+ /**
+ * Flag to disable deblocking of a frame
+ */
+ WORD32 i4_disable_deblk_pic;
+
+ /**
+ * Flag to disable sao of a frame
+ */
+ WORD32 i4_disable_sao_pic;
+
+ /**
+ * Flag to use full pel MC
+ */
+ WORD32 i4_fullpel_inter_pred;
+ /**
+ * Flush mode
+ */
+ WORD32 i4_flush_mode;
+
+ /**
+ * Decode header mode
+ */
+ WORD32 i4_header_mode;
+
+ /**
+ * Header in slice mode
+ */
+ WORD32 i4_header_in_slice_mode;
+
+ /**
+ * Flag to signal sps done
+ */
+ WORD32 i4_sps_done;
+
+ /**
+ * Flag to signal pps done
+ */
+ WORD32 i4_pps_done;
+
+ /**
+ * To signal successful completion of init
+ */
+ WORD32 i4_init_done;
+
+ /**
+ * To signal that at least one picture was decoded
+ */
+ WORD32 i4_first_pic_done;
+
+ /**
+ * To signal error in slice
+ */
+ WORD32 i4_slice_error;
+
+ /**
+ * Reset flag - Codec is reset if this flag is set
+ */
+ WORD32 i4_reset_flag;
+
+ /**
+ * Number of pictures decoded till now
+ */
+ UWORD32 u4_pic_cnt;
+
+ /**
+ * Number of pictures displayed till now
+ */
+ UWORD32 u4_disp_cnt;
+
+ /**
+ * Current error code
+ */
+ WORD32 i4_error_code;
+
+ /**
+ * Pointer to input bitstream. This is incremented every time a NAL is processed
+ */
+ UWORD8 *pu1_inp_bitsbuf;
+
+ /**
+ * Offset to first byte after the start code in current NAL
+ */
+ WORD32 i4_nal_ofst;
+
+ /**
+ * Length of the NAL unit including the emulation bytes
+ */
+ WORD32 i4_nal_len;
+
+ /**
+ * Number of emulation prevention bytes present in the current NAL
+ */
+ WORD32 i4_num_emln_bytes;
+
+ /**
+ * Number of bytes remaining in the input bitstream
+ * Decremented every time a NAL is processed
+ */
+ WORD32 i4_bytes_remaining;
+
+ /**
+ * Pointer to bitstream after emulation prevention
+ */
+ UWORD8 *pu1_bitsbuf;
+
+ /**
+ * Size of intermediate bitstream buffer
+ */
+ UWORD32 u4_bitsbuf_size;
+
+ /**
+ * Pointer to hold TU data for a set of CTBs or a picture
+ */
+#ifndef GPU_BUILD
+ void *pv_tu_data;
+#else
+ void *apv_tu_data[2];
+#endif
+ /**
+ * Holds mem records passed during init.
+ * This will be used to return the mem records during retrieve call
+ */
+ iv_mem_rec_t *ps_mem_rec_backup;
+
+ /**
+ * Process Job queue buffer base
+ */
+ void *pv_proc_jobq_buf;
+
+ /**
+ * Process Job Queue mem tab size
+ */
+ WORD32 i4_proc_jobq_buf_size;
+
+ /** Parse status: one byte per CTB */
+ UWORD8 *pu1_parse_map;
+
+ /** Process status: one byte per CTB */
+#ifndef GPU_BUILD
+ UWORD8 *pu1_proc_map;
+#else
+ UWORD8 *apu1_proc_map[2];
+#endif
+ /**
+ * Current picture's intra mode map at 8x8 level
+ */
+#ifndef GPU_BUILD
+ UWORD8 *pu1_pic_intra_flag;
+#else
+ UWORD8 *apu1_pic_intra_flag[2];
+#endif
+ /**
+ * Current picture's loop filter flag map at 8x8 level
+ */
+#ifndef GPU_BUILD
+ UWORD8 *pu1_pic_no_loop_filter_flag;
+#else
+ UWORD8 *apu1_pic_no_loop_filter_flag[2];
+#endif
+ /**
+ * MV Bank buffer manager
+ */
+ void *pv_mv_buf_mgr;
+
+ /**
+ * Pointer to MV Buf structure array
+ */
+ void *ps_mv_buf;
+
+ /**
+ * Base address for Motion Vector bank buffer
+ */
+ void *pv_mv_bank_buf_base;
+
+ /**
+ * MV Bank size allocated
+ */
+ WORD32 i4_total_mv_bank_size;
+
+ /**
+ * Picture buffer manager
+ */
+ void *pv_pic_buf_mgr;
+
+ /**
+ * Pointer to Pic Buf structure array
+ */
+ void *ps_pic_buf;
+
+ /**
+ * Base address for Picture buffer
+ */
+ void *pv_pic_buf_base;
+
+ /**
+ * Total pic buffer size allocated
+ */
+ WORD32 i4_total_pic_buf_size;
+
+
+ /**
+ * Picture buffer manager
+ */
+ void *pv_disp_buf_mgr;
+
+ /**
+ * Current display buffer's buffer ID
+ */
+ WORD32 i4_disp_buf_id;
+
+ /**
+ * Current display buffer
+ */
+ pic_buf_t *ps_disp_buf;
+
+ /**
+ * Pointer to dpb manager structure
+ */
+ void *pv_dpb_mgr;
+
+ /**
+ * Scaling matrices for each PPS
+ */
+ WORD16 *pi2_scaling_mat;
+
+ /**
+ * Array containing Tile information for each PPS
+ */
+ tile_t *ps_tile;
+
+ /**
+ * Timestamp associated with the current display output
+ */
+ UWORD32 u4_ts;
+
+ /**
+ * Pointer to base of Video parameter set structure array
+ */
+ vps_t *ps_vps_base;
+
+ /**
+ * Pointer to base of Sequence parameter set structure array
+ */
+ sps_t *ps_sps_base;
+
+ /**
+ * Pointer to base of Picture parameter set structure array
+ */
+ pps_t *ps_pps_base;
+
+ /**
+ * Pointer to base of slice header structure array
+ */
+#ifndef GPU_BUILD
+ slice_header_t *ps_slice_hdr_base;
+#else
+ slice_header_t *aps_slice_hdr_base[2];
+#endif
+ /**
+ * Pointer to base of entry point offsets in a frame
+ */
+ WORD32 *pi4_entry_ofst;
+
+ /**
+ * Current offset in pi4_entry_ofst
+ */
+ WORD32 i4_cur_entry_ofst;
+
+ /**
+ * Parsing context
+ */
+ parse_ctxt_t s_parse;
+
+ /**
+ * Processing context - One for each processing thread
+ */
+ process_ctxt_t as_process[MAX_PROCESS_THREADS];
+
+ /**
+ * Thread handle for each of the processing threads
+ */
+ void *apv_process_thread_handle[MAX_PROCESS_THREADS];
+
+ /**
+ * Thread created flag for each of the processing threads
+ */
+ WORD32 ai4_process_thread_created[MAX_PROCESS_THREADS];
+
+ /**
+ * Void pointer to process job context
+ */
+ void *pv_proc_jobq;
+
+ /* Number of CTBs processed together for better instruction cache handling */
+ WORD32 i4_proc_nctb;
+
+ /**
+ * Previous POC lsb
+ */
+ WORD32 i4_prev_poc_lsb;
+
+ /**
+ * Previous POC msb
+ */
+ WORD32 i4_prev_poc_msb;
+
+ /**
+ * Max POC lsb that has arrived till now
+ */
+ WORD32 i4_max_prev_poc_lsb;
+
+ /** Context for format conversion */
+ fmt_conv_t s_fmt_conv;
+
+ /** Pointer to a structure describing output display buffer */
+ ivd_out_bufdesc_t *ps_out_buffer;
+ /**
+ * Variable to store the next ctb count to compute pu idx
+ */
+ WORD32 i4_next_pu_ctb_cnt;
+
+ /**
+ * Variable to store the next ctb count to compute tu idx
+ */
+ WORD32 i4_next_tu_ctb_cnt;
+
+ /** Active SPS id - mainly to be used during codec initializations in shared mode */
+ WORD32 i4_sps_id;
+
+ /** Number of ctbs to be decoded in one process call */
+ UWORD32 u4_nctb;
+
+ /** Flag to enable scheduling of format conversion jobs ahead of processing jobs */
+ UWORD32 u4_enable_fmt_conv_ahead;
+
+ /** Mask used to change MVs to full pel when configured to run in reduced complexity mode */
+ WORD32 i4_mv_frac_mask;
+#ifdef GPU_BUILD
+ /**
+ * Vertical Boundary strength
+ */
+
+ /* Two bits per edge.
+ Stored in format BS[15] | BS[14] | .. | BS[0] */
+ UWORD32 *apu4_pic_vert_bs[2];
+
+ /**
+ * Horizontal Boundary strength
+ */
+
+ /* Two bits per edge.
+ Stored in format BS[15] | BS[14] | .. | BS[0] */
+ UWORD32 *apu4_pic_horz_bs[2];
+
+ /**
+ * Flags to indicate if QP is constant through out a CTB - 1 bit for each CTB
+ * The bits are packed from LSB to MSB
+ * To get the flag corresponding to CTB with (ctb_x, ctb_y), use
+ * apu1_pic_qp_const_in_ctb[view][(ctb_x + pic_wd_in_ctb * ctb_y) >> 3] & (1 << ((ctb_x + pic_wd_in_ctb * ctb_y) & 7))
+ */
+ UWORD8 *apu1_pic_qp_const_in_ctb[2];
+
+ /**
+ * Qp array stored for each 8x8 pixels
+ */
+ UWORD8 *apu1_pic_qp[2];
+
+ /**
+ * Pointer to frame level sao_t for the current frame being parsed
+ */
+ sao_t *aps_pic_sao[2];
+
+ /* GPU context structure */
+ gpu_ctxt_t s_gpu_ctxt;
+
+ /* Flag to switch between MC on GPU and CPU dynamically */
+ UWORD32 u4_gpu_enabled;
+
+ /* Variable to store the view (ping or pong) for parsing */
+ UWORD32 u4_parsing_view;
+
+ /*
+ * Flag to remember to add the last frame for flushing when the next
+ * call is a flush call.
+ */
+ UWORD32 u4_add_last_frame;
+#endif
+ /** Function pointers for inter_pred leaf level functions */
+ pf_inter_pred apf_inter_pred[22];
+
+ /** Function pointers for intra_pred_luma leaf level functions */
+ pf_intra_pred apf_intra_pred_luma[11];
+
+ /** Function pointers for intra_pred_chroma leaf level functions */
+ pf_intra_pred apf_intra_pred_chroma[11];
+
+ /** Function pointers for itrans_recon leaf level functions */
+ pf_itrans_recon apf_itrans_recon[8];
+
+ /** Function pointers for recon leaf level functions */
+ pf_recon apf_recon[8];
+
+ /** Function pointers for itrans_recon_dc leaf level functions */
+ pf_itrans_recon_dc apf_itrans_recon_dc[2];
+
+ /** Function pointers for sao_luma leaf level functions */
+ pf_sao_luma apf_sao_luma[4];
+
+ /** Function pointers for sao_chroma leaf level functions */
+ pf_sao_chroma apf_sao_chroma[4];
+
+ /** Function pointers for all the leaf level functions */
+ func_selector_t s_func_selector;
+ /** Processor architecture */
+ IVD_ARCH_T e_processor_arch;
+ /** Processor soc */
+ IVD_SOC_T e_processor_soc;
+};
+
+#endif /* _IHEVCD_STRUCTS_H_ */
diff --git a/decoder/ihevcd_trace.c b/decoder/ihevcd_trace.c
new file mode 100644
index 0000000..7811bc8
--- /dev/null
+++ b/decoder/ihevcd_trace.c
@@ -0,0 +1,144 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_trace.c
+*
+* @brief
+* Contains trace related functions
+*
+* @author
+* Ittiam
+*
+* @par List of Functions:
+* - ihevcd_trace_init()
+* - ihevcd_trace_deinit()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#ifdef TRACE
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_trace.h"
+
+
+
+/*****************************************************************************/
+/* Declare globals */
+/*****************************************************************************/
+/**
+ * Trace context
+ */
+trace_t g_trace;
+/**
+ * Trace file name
+ */
+CHAR ac_trace_fname[] = "trace.txt";
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Function used for initialization of trace parameters
+*
+* @par Description:
+* Initialize trace structure elements
+*
+* @param[in] pc_fname
+* File name for trace dumps
+*
+* @returns none
+*
+* @remarks
+* Uses global hence not thread safe
+*
+*******************************************************************************
+*/
+
+void ihevcd_trace_init(CHAR *pc_fname)
+{
+ trace_t *ps_trace = &g_trace;
+
+ if(pc_fname == NULL)
+ pc_fname = ac_trace_fname;
+
+ ps_trace->fp = fopen(pc_fname, "w");
+
+ if(NULL == ps_trace->fp)
+ {
+ exit(-1);
+ }
+ return;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Function used for deinitialization of trace parameters
+*
+* @par Description:
+* Close the trace file and deinitialize trace structure elements
+*
+* @param[in] ps_trace
+* Pointer to trace context
+*
+* @returns none
+*
+* @remarks
+* Uses global hence not thread safe
+*
+*******************************************************************************
+*/
+void ihevcd_trace_deinit(trace_t *ps_trace)
+{
+ if(NULL != ps_trace->fp)
+ {
+ fclose(ps_trace->fp);
+ }
+ return;
+}
+
+#endif /* TRACE */
diff --git a/decoder/ihevcd_trace.h b/decoder/ihevcd_trace.h
new file mode 100644
index 0000000..09aa7d8
--- /dev/null
+++ b/decoder/ihevcd_trace.h
@@ -0,0 +1,175 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_trace.h
+*
+* @brief
+* Header for codec trace messages
+*
+* @author
+* Ittiam
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IHEVCD_TRACE_H_
+#define _IHEVCD_TRACE_H_
+
+#define FULLRANGE 1
+
+
+#define RANGE_NUMBITS 31
+#define RANGE_SHIFT (RANGE_NUMBITS - 9)
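+
+/*
+ * With FULLRANGE, the CABAC range is maintained with RANGE_NUMBITS (31) bits;
+ * the trace macros below shift the range down by RANGE_SHIFT so that the
+ * dumped value is the conventional 9 bit range (presumably to keep traces
+ * comparable with reference decoder dumps).
+ */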
+
+#ifdef TRACE
+/**
+ * Context for trace
+ */
+typedef struct
+{
+ /**
+ * fp
+ */
+ FILE *fp;
+
+ /**
+ * u8_cnt
+ */
+ ULWORD64 u8_cnt;
+}trace_t;
+
+/**
+ * Global context for trace info
+ */
+extern trace_t g_trace;
+
+/**
+ * Calls ihevcd_bits_get() to read from the bitstream and dumps the data to the trace file
+ */
+#define BITS_PARSE(m_str, m_value, m_ps_bitstrm, m_numbits) \
+{ \
+ m_value = ihevcd_bits_get(m_ps_bitstrm, m_numbits); \
+ fprintf( g_trace.fp, "%-40s u(%d) : %d\n", m_str, m_numbits, m_value ); \
+ fflush ( g_trace.fp); \
+}
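+
+/*
+ * Example usage (illustrative; the syntax element name is only a label):
+ *   BITS_PARSE("forbidden_zero_bit", value, ps_bitstrm, 1);
+ * reads one bit into value and appends a line such as
+ *   "forbidden_zero_bit  u(1) : 0"
+ * to the trace file.
+ */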
+
+/**
+ * Calls ihevcd_uev() to read from the bitstream and dumps the data to the trace file
+ */
+
+#define UEV_PARSE(m_str, m_value, m_ps_bitstrm) \
+{ \
+ m_value = ihevcd_uev(m_ps_bitstrm); \
+ fprintf( g_trace.fp, "%-40s ue(v) : %d\n", m_str, m_value ); \
+ fflush ( g_trace.fp); \
+}
+/**
+ * Calls ihevcd_sev() to read from the bitstream and dumps the data to the trace file
+ */
+#define SEV_PARSE(m_str, m_value, m_ps_bitstrm) \
+{ \
+ m_value = ihevcd_sev(m_ps_bitstrm); \
+ fprintf( g_trace.fp, "%-40s se(v) : %d\n", m_str, m_value ); \
+ fflush ( g_trace.fp); \
+}
+
+
+#if FULLRANGE
+#define TRACE_CABAC_CTXT(m_string, m_range, m_ctxt_idx) \
+{ \
+ UWORD32 m_clz, m_range_shift, m_state_mps; \
+ m_state_mps = ps_cabac->au1_ctxt_models[m_ctxt_idx]; \
+ m_clz = CLZ(m_range); \
+ m_clz -= (32 - RANGE_NUMBITS); \
+ m_range_shift = m_range << m_clz; \
+ m_range_shift = m_range_shift >> RANGE_SHIFT; \
+ fprintf( g_trace.fp, "%-40s: Range:%3d State:%3d MPS:%1d\n", \
+ m_string, m_range_shift, m_state_mps >> 1, m_state_mps & 1); \
+ fflush ( g_trace.fp); \
+}
+#define AEV_TRACE(m_str, m_value, m_range) \
+{ \
+ UWORD32 m_clz, m_range_shift; \
+ m_clz = CLZ(m_range); \
+ m_clz -= (32 - RANGE_NUMBITS); \
+ m_range_shift = m_range << m_clz; \
+ m_range_shift = m_range_shift >> RANGE_SHIFT; \
+ fprintf( g_trace.fp, "%-40s:%8d R:%d\n", m_str, m_value, m_range_shift);\
+ fflush ( g_trace.fp); \
+}
+#else
+#define TRACE_CABAC_CTXT(m_string, m_range, m_ctxt_idx) \
+{ \
+ UWORD32 m_state_mps; \
+ m_state_mps = ps_cabac->au1_ctxt_models[m_ctxt_idx]; \
+ fprintf( g_trace.fp, "%-40s: Range:%3d State:%3d MPS:%1d\n", \
+ m_string, m_range, m_state_mps >> 1, m_state_mps & 1); \
+ fflush ( g_trace.fp); \
+}
+
+#define AEV_TRACE(m_str, m_value, m_range) \
+{ \
+ fprintf( g_trace.fp, "%-40s:%8d R:%d\n", m_str, m_value, m_range); \
+ fflush ( g_trace.fp); \
+}
+#endif
+
+#define TUV_PARSE(m_str, m_value, m_ps_bitstrm) \
+ m_value = ihevcd_bits_get(m_ps_bitstrm, 1);
+
+#define TRACE_INIT(a) ihevcd_trace_init(a)
+#define TRACE_DEINIT(a) ihevcd_trace_deinit(a)
+
+#else /* TRACE */
+/**
+ * Call ihevcd_bits_get() to read from bitstream
+ */
+
+#define BITS_PARSE(m_str, m_value, m_ps_bitstrm, m_numbits) \
+ m_value = ihevcd_bits_get(m_ps_bitstrm, m_numbits);
+
+/**
+ * Call ihevcd_uev() to read from bitstream
+ */
+
+#define UEV_PARSE(m_str, m_value, m_ps_bitstrm) \
+ m_value = ihevcd_uev(m_ps_bitstrm);
+
+/**
+ * Call ihevcd_sev() to read from bitstream
+ */
+
+#define SEV_PARSE(m_str, m_value, m_ps_bitstrm) \
+ m_value = ihevcd_sev(m_ps_bitstrm);
+
+#define TUV_PARSE(m_str, m_value, m_ps_bitstrm) \
+ m_value = ihevcd_bits_get(m_ps_bitstrm, 1);
+
+#define TRACE_CABAC_CTXT(m_string, m_range, m_state_mps)
+
+#define AEV_TRACE(m_str, m_value, m_range)
+
+
+#define TRACE_INIT(a)
+#define TRACE_DEINIT(a)
+#endif /* TRACE */
+#endif /* _IHEVCD_TRACE_H_ */
diff --git a/decoder/ihevcd_utils.c b/decoder/ihevcd_utils.c
new file mode 100644
index 0000000..36399a7
--- /dev/null
+++ b/decoder/ihevcd_utils.c
@@ -0,0 +1,1318 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_utils.c
+*
+* @brief
+* Contains miscellaneous utility functions such as init() etc
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_defs.h"
+#include "ihevc_error.h"
+#include "ihevc_structs.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+
+#include "ihevc_common_tables.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_cabac_tables.h"
+
+#include "ihevcd_defs.h"
+
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+#include "ihevcd_error.h"
+#include "ihevcd_nal.h"
+#include "ihevcd_bitstream.h"
+#include "ihevcd_utils.h"
+#include "ihevcd_trace.h"
+#include "ihevcd_process_slice.h"
+#include "ihevcd_job_queue.h"
+#ifdef GPU_BUILD
+#include "ihevcd_opencl_mc_interface.h"
+#endif
+#define MAX_DPB_PIC_BUF 6
+
+/* Function declarations */
+mv_buf_t* ihevcd_mv_mgr_get_poc(buf_mgr_t *ps_mv_buf_mgr, UWORD32 abs_poc);
+
+/**
+*******************************************************************************
+*
+* @brief
+* Used to get level index for a given level
+*
+* @par Description:
+* Converts from level_idc (which is multiplied by 30) to an index that can be
+* used as a lookup. Also used to ignore invalid levels like 2.2, 3.2 etc.
+*
+* @param[in] level
+* Level of the stream
+*
+* @returns Level index for a given level
+*
+* @remarks
+*
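+* Example (illustrative): IHEVC_LEVEL_31 corresponds to level_idc 93
+* (3.1 * 30) and maps to index 4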
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_lvl_idx(WORD32 level)
+{
+ WORD32 lvl_idx = 0;
+
+ if(level < IHEVC_LEVEL_20)
+ {
+ lvl_idx = 0;
+ }
+ else if(level >= IHEVC_LEVEL_20 && level < IHEVC_LEVEL_21)
+ {
+ lvl_idx = 1;
+ }
+ else if(level >= IHEVC_LEVEL_21 && level < IHEVC_LEVEL_30)
+ {
+ lvl_idx = 2;
+ }
+ else if(level >= IHEVC_LEVEL_30 && level < IHEVC_LEVEL_31)
+ {
+ lvl_idx = 3;
+ }
+ else if(level >= IHEVC_LEVEL_31 && level < IHEVC_LEVEL_40)
+ {
+ lvl_idx = 4;
+ }
+ else if(level >= IHEVC_LEVEL_40 && level < IHEVC_LEVEL_41)
+ {
+ lvl_idx = 5;
+ }
+ else if(level >= IHEVC_LEVEL_41 && level < IHEVC_LEVEL_50)
+ {
+ lvl_idx = 6;
+ }
+ else if(level >= IHEVC_LEVEL_50 && level < IHEVC_LEVEL_51)
+ {
+ lvl_idx = 7;
+ }
+ else if(level >= IHEVC_LEVEL_51 && level < IHEVC_LEVEL_52)
+ {
+ lvl_idx = 8;
+ }
+ else if(level >= IHEVC_LEVEL_52 && level < IHEVC_LEVEL_60)
+ {
+ lvl_idx = 9;
+ }
+ else if(level >= IHEVC_LEVEL_60 && level < IHEVC_LEVEL_61)
+ {
+ lvl_idx = 10;
+ }
+ else if(level >= IHEVC_LEVEL_61 && level < IHEVC_LEVEL_62)
+ {
+ lvl_idx = 11;
+ }
+ else if(level >= IHEVC_LEVEL_62)
+ {
+ lvl_idx = 12;
+ }
+
+ return (lvl_idx);
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Used to get DPB size for a given level and number of luma samples
+*
+* @par Description:
+* For a given width, height and level, max_dpb_size is computed as per
+* Annex A.4.1
+*
+* @param[in] level
+* Level of the stream
+*
+* @param[in] pic_size
+* Width * Height
+*
+* @returns Number of buffers in DPB
+*
+* @remarks
+*
+*
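+* Example (illustrative, assuming MaxLumaPs = 2228224 at level 4.0):
+* a 1920x1080 picture (2073600 samples) exceeds (3 * MaxLumaPs) >> 2,
+* so max_dpb_size evaluates to MAX_DPB_PIC_BUF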
+*******************************************************************************
+*/
+WORD32 ihevcd_get_dpb_size(WORD32 level, WORD32 pic_size)
+{
+
+ WORD32 max_luma_samples;
+
+ WORD32 max_dpb_size;
+ WORD32 lvl_idx = ihevcd_get_lvl_idx(level);
+ max_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
+
+
+
+ if(pic_size <= (max_luma_samples >> 2))
+ {
+ max_dpb_size = MIN(4 * MAX_DPB_PIC_BUF, 16);
+ }
+ else if(pic_size <= (max_luma_samples >> 1))
+ {
+ max_dpb_size = MIN(2 * MAX_DPB_PIC_BUF, 16);
+ }
+ else if(pic_size <= ((3 * max_luma_samples) >> 2))
+ {
+ max_dpb_size = MIN((4 * MAX_DPB_PIC_BUF) / 3, 16);
+ }
+ else
+ {
+ max_dpb_size = MAX_DPB_PIC_BUF;
+ }
+
+ return max_dpb_size;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Used to get reference picture buffer size for a given level
+* and padding used
+*
+* @par Description:
+* Used to get reference picture buffer size for a given level and padding used
+* Each picture is padded on all four sides
+*
+* @param[in] pic_size
+* Number of luma samples (Width * Height)
+*
+* @param[in] level
+* Level
+*
+* @param[in] horz_pad
+* Total padding used in horizontal direction
+*
+* @param[in] vert_pad
+* Total padding used in vertical direction
+*
+* @returns Total picture buffer size
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_total_pic_buf_size(WORD32 pic_size,
+ WORD32 level,
+ WORD32 horz_pad,
+ WORD32 vert_pad,
+ WORD32 num_ref_frames,
+ WORD32 num_reorder_frames)
+{
+ WORD32 size;
+ WORD32 num_luma_samples;
+ WORD32 lvl_idx;
+ WORD32 max_wd;
+ WORD32 max_dpb_size;
+ WORD32 num_samples;
+ WORD32 max_num_bufs;
+ WORD32 pad = MAX(horz_pad, vert_pad);
+
+
+ /* Get maximum number of buffers for the current picture size */
+ max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+
+
+ max_num_bufs = (2 * max_dpb_size + 1);
+ /* If num_ref_frames and num_reorder_frames are specified,
+ * use the minimum value
+ */
+ max_num_bufs = MIN(max_num_bufs, (num_ref_frames + num_reorder_frames + 1));
+
+ /* Get level index */
+ lvl_idx = ihevcd_get_lvl_idx(level);
+
+ /* Maximum number of luma samples in a picture at given level */
+ num_luma_samples = gai4_ihevc_max_luma_pic_size[lvl_idx];
+
+ /* Account for chroma */
+ num_samples = num_luma_samples * 3 / 2;
+
+ /* Maximum width of luma samples in a picture at given level */
+ max_wd = gai4_ihevc_max_wd_ht[lvl_idx];
+
+
+ /* Allocation is required for
+ * (Wd + horz_pad) * (Ht + vert_pad) * (2 * max_dpb_size + 1)
+ *
+ * Above expanded as
+ * ((Wd * Ht) + (horz_pad * vert_pad) + Wd * vert_pad + Ht * horz_pad) * (2 * max_dpb_size + 1)
+ * (Wd * Ht) * (2 * max_dpb_size + 1) + ((horz_pad * vert_pad) + Wd * vert_pad + Ht * horz_pad) * (2 * max_dpb_size + 1)
+ * Now max_dpb_size increases with smaller Wd and Ht, but Wd * Ht * max_dpb_size will still be less than or equal to max_wd * max_ht * max_dpb_size
+ *
+ * In the above equation (Wd * Ht) * (2 * max_dpb_size + 1) is accounted for by using num_samples * (2 * max_dpb_size + 1) below
+ *
+ * For the padded area use MAX(horz_pad, vert_pad) as pad
+ * ((pad * pad) + pad * (Wd + Ht)) * (2 * max_dpb_size + 1) has to be accounted for from the above for padding
+ *
+ * Since width and height can change, the worst case Wd + Ht occurs when one of the dimensions is at its
+ * maximum and the other at its minimum, so use max_wd and min_ht
+ */
+
+ /* Number of bytes in reference pictures */
+ size = num_samples * max_num_bufs;
+
+ /* Account for padding area */
+ size += ((pad * pad) + pad * (max_wd + max_wd)) * max_num_bufs;
+
+ return size;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Used to get MV bank size for a given number of luma samples
+*
+* @par Description:
+* For a given number of luma samples, the size of one MV bank is computed
+* Each MV bank includes pu_map and pu_t for all the min PUs (4x4) in a picture
+*
+* @param[in] num_luma_samples
+* Max number of luma pixels in the frame
+*
+* @returns Total MV Bank size
+*
+* @remarks
+*
+*
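+* Example (illustrative, assuming MIN_PU_SIZE = 4 and MIN_CTB_SIZE = 16):
+* for 1920x1088 luma samples, num_pu = 2088960 / 16 = 130560 and
+* num_ctb = 2088960 / 256 = 8160, so the bank holds 8161 WORD32 indices,
+* 130560 map bytes and 130560 pu_t entries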
+*******************************************************************************
+*/
+WORD32 ihevcd_get_pic_mv_bank_size(WORD32 num_luma_samples)
+{
+ WORD32 size;
+
+ WORD32 pic_size;
+
+ WORD32 mv_bank_size;
+ WORD32 num_pu;
+ WORD32 num_ctb;
+ pic_size = num_luma_samples;
+
+
+ num_pu = pic_size / (MIN_PU_SIZE * MIN_PU_SIZE);
+ num_ctb = pic_size / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+
+ mv_bank_size = 0;
+
+ /* Size for storing pu_t start index for each CTB */
+ /* One extra entry is needed to compute number of PUs in the last CTB */
+ mv_bank_size += (num_ctb + 1) * sizeof(WORD32);
+
+ /* Size for pu_map */
+ mv_bank_size += num_pu;
+
+ /* Size for storing pu_t for each PU */
+ mv_bank_size += num_pu * sizeof(pu_t);
+
+
+ size = mv_bank_size;
+ return size;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Used to get TU data size for a given number luma samples
+*
+* @par Description:
+* For a given number of luma samples, the TU data size is computed
+* Each TU data includes tu_map, tu_t and coeff data for all
+* the min TUs (4x4) in a given CTB
+*
+* @param[in] num_luma_samples
+* Number of luma samples for which TU data has to be allocated.
+*
+* @returns Total TU data size
+*
+* @remarks Assumption is num_luma_samples will be at least
+* 64 x 64 to handle CTB of size 64 x 64. Can be frame size as well
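+*
+* Example (illustrative, assuming MIN_TU_SIZE = 4): for one 64 x 64 CTB,
+* num_luma_tu = 4096 / 16 = 256 and num_chroma_tu = 128, i.e. 384 tu_t
+* entries plus per-TU subblock counts, scan indices and coeff data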
+*
+*******************************************************************************
+*/
+WORD32 ihevcd_get_tu_data_size(WORD32 num_luma_samples)
+{
+
+
+ WORD32 tu_data_size;
+ WORD32 num_ctb;
+ WORD32 num_luma_tu, num_chroma_tu, num_tu;
+ num_ctb = num_luma_samples / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+
+ num_luma_tu = num_luma_samples / (MIN_TU_SIZE * MIN_TU_SIZE);
+ num_chroma_tu = num_luma_tu >> 1;
+
+ num_tu = num_luma_tu + num_chroma_tu;
+ tu_data_size = 0;
+
+ /* Size for storing tu_t start index for each CTB */
+ /* One extra entry is needed to compute number of TUs in the last CTB */
+ tu_data_size += (num_ctb + 1) * sizeof(WORD32);
+
+ /* Size for storing tu map */
+ tu_data_size += num_luma_tu * sizeof(UWORD8);
+
+ /* Size for storing tu_t for each TU */
+ tu_data_size += num_tu * sizeof(tu_t);
+
+ /* Size for storing number of coded subblocks and scan_idx for each TU */
+ tu_data_size += num_tu * (sizeof(WORD8) + sizeof(WORD8));
+
+ /* Size for storing coeff data for each TU */
+ tu_data_size += num_tu * sizeof(tu_sblk_coeff_data_t);
+
+
+ return tu_data_size;
+}
+
+
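+/**
+ * Returns the number of CTBs to be processed together in one job.
+ * Currently always returns 1 (see the TODO inside).
+ */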
+WORD32 ihevcd_nctb_cnt(codec_t *ps_codec, sps_t *ps_sps)
+{
+ WORD32 nctb = 1;
+ UNUSED(ps_codec);
+ //TODO: Currently set to 1
+ /* If CTB size is less than 32 x 32 then nCTB is intended to be 4; currently forced to 1 (see TODO above) */
+ if(ps_sps->i1_log2_ctb_size < 5)
+ nctb = 1;
+
+ return nctb;
+}
+
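+/**
+ * Maps a CTB position (ctb_x, ctb_y) in picture coordinates to its position
+ * within the containing tile and the tile index, by scanning the tile column
+ * and row boundaries in the PPS. When tiles are not enabled, the whole
+ * picture is treated as a single tile.
+ */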
+IHEVCD_ERROR_T ihevcd_get_tile_pos(pps_t *ps_pps,
+ sps_t *ps_sps,
+ WORD32 ctb_x,
+ WORD32 ctb_y,
+ WORD32 *pi4_ctb_tile_x,
+ WORD32 *pi4_ctb_tile_y,
+ WORD32 *pi4_tile_idx)
+{
+
+ tile_t *ps_tile_tmp;
+ WORD32 i;
+ WORD32 tile_row, tile_col;
+
+ if(ctb_x < 0 || ctb_y < 0)
+ {
+ *pi4_ctb_tile_x = 0;
+ *pi4_ctb_tile_y = 0;
+ *pi4_tile_idx = 0;
+
+ return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ }
+
+ tile_row = 0;
+ tile_col = 0;
+ ps_tile_tmp = ps_pps->ps_tile;
+ if(0 == ps_pps->i1_tiles_enabled_flag)
+ {
+ *pi4_ctb_tile_x = ctb_x;
+ *pi4_ctb_tile_y = ctb_y;
+ *pi4_tile_idx = 0;
+ }
+ else
+ {
+ for(i = 0; i < ps_pps->i1_num_tile_columns; i++)
+ {
+ WORD16 next_tile_ctb_x;
+ ps_tile_tmp = ps_pps->ps_tile + i; //* ps_pps->i1_num_tile_rows;
+ if((ps_pps->i1_num_tile_columns - 1) == i)
+ {
+ next_tile_ctb_x = ps_sps->i2_pic_wd_in_ctb;
+ }
+ else
+ {
+ tile_t *ps_tile_next_tmp;
+ ps_tile_next_tmp = ps_pps->ps_tile + i + 1;
+ next_tile_ctb_x = ps_tile_next_tmp->u1_pos_x;
+ }
+ if((ctb_x >= ps_tile_tmp->u1_pos_x) && (ctb_x < next_tile_ctb_x))
+ {
+ tile_col = i;
+ break;
+ }
+ }
+ *pi4_ctb_tile_x = ctb_x - ps_tile_tmp->u1_pos_x;
+
+ for(i = 0; i < ps_pps->i1_num_tile_rows; i++)
+ {
+ WORD16 next_tile_ctb_y;
+ ps_tile_tmp = ps_pps->ps_tile + i * ps_pps->i1_num_tile_columns;
+ if((ps_pps->i1_num_tile_rows - 1) == i)
+ {
+ next_tile_ctb_y = ps_sps->i2_pic_ht_in_ctb;
+ }
+ else
+ {
+ tile_t *ps_tile_next_tmp;
+ ps_tile_next_tmp = ps_pps->ps_tile + ((i + 1) * ps_pps->i1_num_tile_columns);
+ next_tile_ctb_y = ps_tile_next_tmp->u1_pos_y;
+ }
+ if((ctb_y >= ps_tile_tmp->u1_pos_y) && (ctb_y < next_tile_ctb_y))
+ {
+ tile_row = i;
+ break;
+ }
+
+ }
+ *pi4_ctb_tile_y = ctb_y - ps_tile_tmp->u1_pos_y;
+ *pi4_tile_idx = tile_row * ps_pps->i1_num_tile_columns
+ + tile_col;
+ }
+ return (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Function to initialize ps_pic_buf structs and add pic buffers to
+* the buffer manager in case of non-shared mode
+*
+* @par Description:
+* Function to initialize ps_pic_buf structs and add pic buffers to
+* the buffer manager in case of non-shared mode
+* To be called once per stream or for every reset
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_pic_buf_mgr_add_bufs(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 i;
+ WORD32 max_dpb_size;
+ sps_t *ps_sps;
+ UWORD8 *pu1_buf;
+ pic_buf_t *ps_pic_buf;
+ WORD32 pic_buf_size_allocated;
+
+ WORD32 max_num_bufs;
+ WORD32 pic_size;
+ WORD32 level;
+
+
+ /* Get the current SPS from the parse context */
+ ps_sps = ps_codec->s_parse.ps_sps;
+
+ pic_size = ps_sps->i2_pic_width_in_luma_samples *
+ ps_sps->i2_pic_height_in_luma_samples;
+
+
+ /* Compute the number of picture buffers needed */
+ level = ps_codec->i4_init_level;
+ max_dpb_size = ihevcd_get_dpb_size(level, pic_size);
+ /* Allocate twice dpb size to handle worst case reorder without returning more
+ * than one output per call
+ */
+ max_dpb_size *= 2;
+ /* Allocate one extra picture to handle current frame
+ * In case of asynchronous parsing and processing, number of buffers should increase here
+ * based on when parsing and processing threads are synchronized
+ */
+ max_dpb_size++;
+
+ /* If num_ref_frames and num_reorder_frames are specified,
+ * use the minimum value
+ */
+ max_num_bufs = MIN(max_dpb_size, (ps_codec->i4_init_num_ref + ps_codec->i4_init_num_reorder + 1));
+
+
+ pu1_buf = (UWORD8 *)ps_codec->ps_pic_buf;
+
+ ps_pic_buf = (pic_buf_t *)ps_codec->ps_pic_buf;
+
+ pu1_buf += BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
+
+ /* In case of non-shared mode, add picture buffers to buffer manager
+ * In case of shared mode buffers are added in the run-time
+ */
+ if(0 == ps_codec->i4_share_disp_buf)
+ {
+ WORD32 buf_ret;
+ WORD32 luma_samples;
+ WORD32 chroma_samples;
+ pic_buf_size_allocated = ps_codec->i4_total_pic_buf_size -
+ BUF_MGR_MAX_CNT * sizeof(pic_buf_t);
+
+ luma_samples = (ps_codec->i4_strd) *
+ (ps_sps->i2_pic_height_in_luma_samples + PAD_HT);
+
+ chroma_samples = luma_samples / 2;
+
+ /* Try to add as many buffers as possible since memory is already allocated */
+ /* If the number of buffers that can be added is less than max_num_bufs
+ * return with an error.
+ */
+ for(i = 0; i < (2 * MAX_DPB_SIZE) + 1; i++)
+ {
+ pic_buf_size_allocated -= (luma_samples + chroma_samples);
+
+ if(pic_buf_size_allocated < 0)
+ {
+ if(i < max_num_bufs)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_INSUFFICIENT_MEM_PICBUF;
+ return IHEVCD_INSUFFICIENT_MEM_PICBUF;
+ }
+ break;
+ }
+
+ ps_pic_buf->pu1_luma = pu1_buf + ps_codec->i4_strd * PAD_TOP + PAD_LEFT;
+ pu1_buf += luma_samples;
+
+ ps_pic_buf->pu1_chroma = pu1_buf + ps_codec->i4_strd * (PAD_TOP / 2) + PAD_LEFT;
+ pu1_buf += chroma_samples;
+
+ buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, ps_pic_buf, i);
+
+ if(0 != buf_ret)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_BUF_MGR_ERROR;
+ return IHEVCD_BUF_MGR_ERROR;
+ }
+ ps_pic_buf++;
+ }
+ }
+
+ return ret;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Function to add buffers to MV Bank buffer manager
+*
+* @par Description:
+* Function to add buffers to MV Bank buffer manager
+* To be called once per stream or for every reset
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_mv_buf_mgr_add_bufs(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ WORD32 i;
+ WORD32 max_dpb_size;
+ WORD32 mv_bank_size_allocated;
+ WORD32 pic_mv_bank_size;
+ WORD32 level;
+ sps_t *ps_sps;
+ UWORD8 *pu1_buf;
+ mv_buf_t *ps_mv_buf;
+
+
+ /* Initialize MV Bank buffer manager */
+ ps_sps = ps_codec->s_parse.ps_sps;
+
+
+ /* Compute the number of MV Bank buffers needed */
+ level = ps_codec->i4_init_level;
+ max_dpb_size = ihevcd_get_dpb_size(level,
+ ps_sps->i2_pic_width_in_luma_samples *
+ ps_sps->i2_pic_height_in_luma_samples);
+
+ /* Allocate one extra MV Bank to handle current frame
+ * In case of asynchronous parsing and processing, number of buffers should increase here
+ * based on when parsing and processing threads are synchronized
+ */
+ max_dpb_size++;
+
+ pu1_buf = (UWORD8 *)ps_codec->pv_mv_bank_buf_base;
+
+ ps_mv_buf = (mv_buf_t *)pu1_buf;
+ pu1_buf += BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+ ps_codec->ps_mv_buf = ps_mv_buf;
+ mv_bank_size_allocated = ps_codec->i4_total_mv_bank_size - BUF_MGR_MAX_CNT * sizeof(mv_buf_t);
+
+ /* Compute MV bank size per picture */
+ pic_mv_bank_size = ihevcd_get_pic_mv_bank_size(ps_sps->i2_pic_width_in_luma_samples *
+ ps_sps->i2_pic_height_in_luma_samples);
+
+ for(i = 0; i < max_dpb_size; i++)
+ {
+ WORD32 buf_ret;
+ WORD32 num_pu;
+ WORD32 num_ctb;
+ WORD32 pic_size;
+ pic_size = ALIGN64(ps_sps->i2_pic_width_in_luma_samples) *
+ ALIGN64(ps_sps->i2_pic_height_in_luma_samples);
+
+
+ num_pu = pic_size / (MIN_PU_SIZE * MIN_PU_SIZE);
+ num_ctb = pic_size / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+
+
+ mv_bank_size_allocated -= pic_mv_bank_size;
+
+ if(mv_bank_size_allocated < 0)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_INSUFFICIENT_MEM_MVBANK;
+ return IHEVCD_INSUFFICIENT_MEM_MVBANK;
+ }
+
+ ps_mv_buf->pu4_pic_pu_idx = (UWORD32 *)pu1_buf;
+ pu1_buf += (num_ctb + 1) * sizeof(WORD32);
+
+ ps_mv_buf->pu1_pic_pu_map = pu1_buf;
+ pu1_buf += num_pu;
+
+ ps_mv_buf->pu1_pic_slice_map = (UWORD16 *)pu1_buf;
+ pu1_buf += num_ctb * sizeof(UWORD16);
+
+ ps_mv_buf->ps_pic_pu = (pu_t *)pu1_buf;
+
+ buf_ret = ihevc_buf_mgr_add((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, ps_mv_buf, i);
+
+ if(0 != buf_ret)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_BUF_MGR_ERROR;
+ return IHEVCD_BUF_MGR_ERROR;
+ }
+ pu1_buf += pic_mv_bank_size;
+ ps_mv_buf++;
+
+ }
+ return ret;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Picture level initializations required during parsing
+*
+* @par Description:
+* Initialize picture level context variables during parsing. Initialize MV
+* bank buffer manager in the first init call
+*
+* @param[in] ps_codec
+* Pointer to codec context
+*
+* @returns Error from IHEVCD_ERROR_T
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IHEVCD_ERROR_T ihevcd_parse_pic_init(codec_t *ps_codec)
+{
+ IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
+ mv_buf_t *ps_mv_buf;
+ sps_t *ps_sps;
+ WORD32 num_min_cu;
+ WORD32 cur_pic_buf_id;
+ WORD32 cur_mv_bank_buf_id;
+ pic_buf_t *ps_cur_pic;
+ slice_header_t *ps_slice_hdr;
+ UWORD8 *pu1_cur_pic_luma, *pu1_cur_pic_chroma;
+ WORD32 i;
+
+ ps_codec->s_parse.i4_error_code = IHEVCD_SUCCESS;
+ ps_sps = ps_codec->s_parse.ps_sps;
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr_base + (ps_codec->s_parse.i4_cur_slice_idx & (MAX_SLICE_HDR_CNT - 1));
+#else
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+#endif
+ /* If parse_pic_init is called, then slice data is present in the input bitstream */
+ ps_codec->i4_pic_present = 1;
+
+ /* Memset picture level intra map and transquant bypass map to zero */
+#ifdef GPU_BUILD
+ ps_codec->s_parse.pu1_pic_intra_flag = ps_codec->apu1_pic_intra_flag[ps_codec->u4_parsing_view];
+ ps_codec->s_parse.pu1_pic_no_loop_filter_flag = ps_codec->apu1_pic_no_loop_filter_flag[ps_codec->u4_parsing_view];
+#endif
+ num_min_cu = ((ps_sps->i2_pic_height_in_luma_samples + 7) / 8) * ((ps_sps->i2_pic_width_in_luma_samples + 63) / 64);
+ memset(ps_codec->s_parse.pu1_pic_intra_flag, 0, num_min_cu);
+ memset(ps_codec->s_parse.pu1_pic_no_loop_filter_flag, 0, num_min_cu);
+
+
+
+ if(0 == ps_codec->s_parse.i4_first_pic_init)
+ {
+ ret = ihevcd_mv_buf_mgr_add_bufs(ps_codec);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ ret = ihevcd_pic_buf_mgr_add_bufs(ps_codec);
+ RETURN_IF((ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS), ret);
+
+ ps_codec->s_parse.i4_first_pic_init = 1;
+ }
+
+ /* Initialize all the slice headers' slice addresses to zero */
+ {
+ WORD32 slice_idx;
+ WORD32 slice_start_idx;
+
+ slice_start_idx = ps_codec->i4_slice_error ? 2 : 1;
+
+ for(slice_idx = slice_start_idx; slice_idx < MAX_SLICE_HDR_CNT; slice_idx++)
+ {
+#ifdef GPU_BUILD
+ slice_header_t *ps_slice_hdr_tmp = ps_codec->aps_slice_hdr_base[0] + slice_idx;
+ ps_slice_hdr_tmp->i2_ctb_x = -1;
+ ps_slice_hdr_tmp->i2_ctb_y = -1;
+ ps_slice_hdr_tmp = ps_codec->aps_slice_hdr_base[1] + slice_idx;
+ ps_slice_hdr_tmp->i2_ctb_x = -1;
+ ps_slice_hdr_tmp->i2_ctb_y = -1;
+#else
+ slice_header_t *ps_slice_hdr_tmp = ps_codec->ps_slice_hdr_base + slice_idx;
+ ps_slice_hdr_tmp->i2_ctb_x = -1;
+ ps_slice_hdr_tmp->i2_ctb_y = -1;
+#endif
+
+ }
+ }
+
+ /* Get free MV Bank to hold current picture's motion vector data */
+ {
+ ps_mv_buf = (mv_buf_t *)ihevc_buf_mgr_get_next_free((buf_mgr_t *)ps_codec->pv_mv_buf_mgr, &cur_mv_bank_buf_id);
+
+ /* If there are no free buffers then return with an error code.
+         * If the buffer is to be freed by another thread, change the
+         * following to call thread yield and wait for the buffer to be freed
+ */
+ if(NULL == ps_mv_buf)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_NO_FREE_MVBANK;
+ ps_codec->i4_error_code = IHEVCD_NO_FREE_MVBANK;
+ return IHEVCD_NO_FREE_MVBANK;
+ }
+
+ ps_codec->s_parse.ps_cur_mv_buf = ps_mv_buf;
+        /* Store the current absolute POC in ps_mv_buf, so that when a reference
+         * buffer is freed, the corresponding MV buffer can be found by looping
+         * through the ps_codec->ps_mv_buf array and getting a buffer id to free
+ */
+ ps_mv_buf->i4_abs_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+ }
+
+ /* Get free picture buffer to hold current picture recon data */
+ /* TODO: For asynchronous api the following initializations related to picture
+ * buffer should be moved to processing side
+ */
+ {
+
+ UWORD8 *pu1_buf;
+ ps_cur_pic = (pic_buf_t *)ihevc_buf_mgr_get_next_free((buf_mgr_t *)ps_codec->pv_pic_buf_mgr, &cur_pic_buf_id);
+
+ /* If there are no free buffers then return with an error code.
+         * TODO: If the buffer is to be freed by another thread, change the
+         * following to call thread yield and wait for the buffer to be freed
+ */
+ if(NULL == ps_cur_pic)
+ {
+ ps_codec->s_parse.i4_error_code = IHEVCD_NO_FREE_PICBUF;
+ ps_codec->i4_error_code = IHEVCD_NO_FREE_PICBUF;
+ return IHEVCD_NO_FREE_PICBUF;
+ }
+
+ /* Store input timestamp sent with input buffer */
+ ps_cur_pic->u4_ts = ps_codec->u4_ts;
+ ps_cur_pic->i4_abs_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+ ps_cur_pic->i4_poc_lsb = ps_slice_hdr->i4_pic_order_cnt_lsb;
+ pu1_buf = ps_cur_pic->pu1_luma;
+ pu1_cur_pic_luma = pu1_buf;
+
+ pu1_buf = ps_cur_pic->pu1_chroma;
+
+ pu1_cur_pic_chroma = pu1_buf;
+ }
+
+ if(0 == ps_codec->u4_pic_cnt)
+ {
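+        /* 128 is the mid level for 8-bit samples; initializing the first
+         * picture to neutral gray keeps concealment from uninitialized
+         * memory benign */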
+ memset(ps_cur_pic->pu1_luma, 128, (ps_sps->i2_pic_width_in_luma_samples + PAD_WD) * ps_sps->i2_pic_height_in_luma_samples);
+ memset(ps_cur_pic->pu1_chroma, 128, (ps_sps->i2_pic_width_in_luma_samples + PAD_WD) * ps_sps->i2_pic_height_in_luma_samples / 2);
+ }
+
+ /* Fill the remaining entries of the reference lists with the nearest POC
+ * This is done to handle cases where there is a corruption in the reference index */
+ {
+ pic_buf_t *ps_pic_buf_ref;
+ mv_buf_t *ps_mv_buf_ref;
+ WORD32 r_idx;
+ dpb_mgr_t *ps_dpb_mgr = (dpb_mgr_t *)ps_codec->pv_dpb_mgr;
+ buf_mgr_t *ps_mv_buf_mgr = (buf_mgr_t *)ps_codec->pv_mv_buf_mgr;
+
+ ps_pic_buf_ref = ihevc_dpb_mgr_get_ref_by_nearest_poc(ps_dpb_mgr, ps_slice_hdr->i4_abs_pic_order_cnt);
+ if(NULL == ps_pic_buf_ref)
+ {
+ ps_pic_buf_ref = ps_cur_pic;
+ ps_mv_buf_ref = ps_mv_buf;
+ }
+ else
+ {
+ ps_mv_buf_ref = ihevcd_mv_mgr_get_poc(ps_mv_buf_mgr, ps_pic_buf_ref->i4_abs_poc);
+ }
+
+ for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx++)
+ {
+ if(NULL == ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf)
+ {
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+ }
+
+ for(r_idx = ps_slice_hdr->i1_num_ref_idx_l0_active; r_idx < MAX_DPB_SIZE; r_idx++)
+ {
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list0[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+
+ for(r_idx = 0; r_idx < ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx++)
+ {
+ if(NULL == ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf)
+ {
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+ }
+
+ for(r_idx = ps_slice_hdr->i1_num_ref_idx_l1_active; r_idx < MAX_DPB_SIZE; r_idx++)
+ {
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_pic_buf = (void *)ps_pic_buf_ref;
+ ps_slice_hdr->as_ref_pic_list1[r_idx].pv_mv_buf = (void *)ps_mv_buf_ref;
+ }
+ }
+
+
+ /* Reset the jobq to start of the jobq buffer */
+ ihevcd_jobq_reset((jobq_t *)ps_codec->pv_proc_jobq);
+
+ ps_codec->s_parse.i4_pic_pu_idx = 0;
+ ps_codec->s_parse.i4_pic_tu_idx = 0;
+
+ ps_codec->s_parse.pu1_pic_pu_map = ps_mv_buf->pu1_pic_pu_map;
+ ps_codec->s_parse.ps_pic_pu = ps_mv_buf->ps_pic_pu;
+ ps_codec->s_parse.pu4_pic_pu_idx = ps_mv_buf->pu4_pic_pu_idx;
+ ps_codec->s_parse.pu1_slice_idx = (UWORD16 *)ps_mv_buf->pu1_pic_slice_map;
+#ifndef GPU_BUILD
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].pu1_slice_idx = (UWORD16 *)ps_mv_buf->pu1_pic_slice_map;
+ }
+#endif
+ ps_codec->s_parse.pu1_pu_map = ps_codec->s_parse.pu1_pic_pu_map;
+ ps_codec->s_parse.ps_pu = ps_codec->s_parse.ps_pic_pu;
+
+ {
+ UWORD8 *pu1_buf;
+ WORD32 ctb_luma_min_tu_cnt, ctb_chroma_min_tu_cnt, ctb_min_tu_cnt;
+ WORD32 pic_size;
+ WORD32 num_ctb;
+
+ pic_size = ps_sps->i2_pic_width_in_luma_samples *
+ ps_sps->i2_pic_height_in_luma_samples;
+
+ ctb_luma_min_tu_cnt = pic_size / (MIN_TU_SIZE * MIN_TU_SIZE);
+
+ ctb_chroma_min_tu_cnt = ctb_luma_min_tu_cnt >> 1;
+
+ ctb_min_tu_cnt = ctb_luma_min_tu_cnt + ctb_chroma_min_tu_cnt;
+
+ num_ctb = pic_size / (MIN_CTB_SIZE * MIN_CTB_SIZE);
+#ifdef GPU_BUILD
+ pu1_buf = (UWORD8 *)ps_codec->apv_tu_data[ps_codec->u4_parsing_view];
+#else
+ pu1_buf = (UWORD8 *)ps_codec->pv_tu_data;
+#endif
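+        /* Carve the TU data buffer into its sections: TU index per CTB
+         * ((num_ctb + 1) WORD32 entries), TU map (one byte per minimum TU,
+         * luma and chroma), the tu_t array and finally the coefficient data */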
+ ps_codec->s_parse.pu4_pic_tu_idx = (UWORD32 *)pu1_buf;
+ pu1_buf += (num_ctb + 1) * sizeof(WORD32);
+
+ ps_codec->s_parse.pu1_pic_tu_map = pu1_buf;
+ pu1_buf += ctb_min_tu_cnt;
+
+ ps_codec->s_parse.ps_pic_tu = (tu_t *)pu1_buf;
+ pu1_buf += ctb_min_tu_cnt * sizeof(tu_t);
+
+ ps_codec->s_parse.pv_pic_tu_coeff_data = pu1_buf;
+
+ ps_codec->s_parse.pu1_tu_map = ps_codec->s_parse.pu1_pic_tu_map;
+ ps_codec->s_parse.ps_tu = ps_codec->s_parse.ps_pic_tu;
+ ps_codec->s_parse.pv_tu_coeff_data = ps_codec->s_parse.pv_pic_tu_coeff_data;
+ }
+
+ ps_codec->s_parse.s_bs_ctxt.ps_pic_pu = ps_codec->s_parse.ps_pic_pu;
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx;
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx;
+
+
+ /* Set number of CTBs to be processed simultaneously */
+ ps_codec->i4_proc_nctb = ihevcd_nctb_cnt(ps_codec, ps_sps);
+
+    /* Memset parse map and process map at the start of the frame */
+    //TODO: In case of asynchronous API proc_map cannot be set to zero here
+ {
+ WORD32 num_ctb;
+
+ num_ctb = ps_sps->i4_pic_size_in_ctb;
+
+ memset(ps_codec->pu1_parse_map, 0, num_ctb);
+
+#ifndef GPU_BUILD
+ memset(ps_codec->pu1_proc_map, 0, num_ctb);
+#endif
+ }
+
+
+
+    /* Initialize disp buf id to -1; this will be updated at the end of the frame
+     * if there is a buffer to be displayed
+     */
+ ps_codec->i4_disp_buf_id = -1;
+ ps_codec->ps_disp_buf = NULL;
+
+ ps_codec->i4_disable_deblk_pic = 0;
+ ps_codec->i4_disable_sao_pic = 0;
+ ps_codec->i4_fullpel_inter_pred = 0;
+ ps_codec->i4_mv_frac_mask = 0x7FFFFFFF;
+
+ /* If degrade is enabled, set the degrade flags appropriately */
+ if(ps_codec->i4_degrade_type && ps_codec->i4_degrade_pics)
+ {
+ WORD32 degrade_pic;
+ ps_codec->i4_degrade_pic_cnt++;
+ degrade_pic = 0;
+
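+        /* i4_degrade_pics levels: 4 - degrade all pictures, 3 - all non-I
+         * pictures, 2 - non-I pictures except at the non-degrade interval,
+         * 1 - non-reference pictures only */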
+ /* If degrade is to be done in all frames, then do not check further */
+ switch(ps_codec->i4_degrade_pics)
+ {
+ case 4:
+ {
+ degrade_pic = 1;
+ break;
+ }
+ case 3:
+ {
+ if(ps_slice_hdr->i1_slice_type != ISLICE)
+ degrade_pic = 1;
+
+ break;
+ }
+ case 2:
+ {
+
+            /* If the pic count hits the non-degrade interval or it is an I slice, then do not degrade */
+ if((ps_slice_hdr->i1_slice_type != ISLICE) &&
+ (ps_codec->i4_degrade_pic_cnt != ps_codec->i4_nondegrade_interval))
+ degrade_pic = 1;
+
+ break;
+ }
+ case 1:
+ {
+ /* Check if the current picture is non-ref */
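+            /* Sub-layer non-reference slices (TRAIL_N, TSA_N, RADL_N, ...)
+             * have even nal_unit_type values below NAL_BLA_W_LP */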
+ if((ps_slice_hdr->i1_nal_unit_type < NAL_BLA_W_LP) &&
+ (ps_slice_hdr->i1_nal_unit_type % 2 == 0))
+ {
+ degrade_pic = 1;
+ }
+ break;
+ }
+
+
+ }
+ if(degrade_pic)
+ {
+ if(ps_codec->i4_degrade_type & 0x1)
+ ps_codec->i4_disable_sao_pic = 1;
+
+ if(ps_codec->i4_degrade_type & 0x2)
+ ps_codec->i4_disable_deblk_pic = 1;
+
+ /* MC degrading is done only for non-ref pictures */
+ if((ps_slice_hdr->i1_nal_unit_type < NAL_BLA_W_LP) &&
+ (ps_slice_hdr->i1_nal_unit_type % 2 == 0))
+ {
+ if(ps_codec->i4_degrade_type & 0x4)
+ ps_codec->i4_mv_frac_mask = 0;
+
+ if(ps_codec->i4_degrade_type & 0x8)
+ ps_codec->i4_mv_frac_mask = 0;
+ }
+ }
+ else
+ ps_codec->i4_degrade_pic_cnt = 0;
+ }
+
+
+ {
+ WORD32 i;
+#ifdef GPU_BUILD
+ gpu_ctxt_t *ps_gpu = &ps_codec->s_gpu_ctxt;
+ ps_gpu->i4_curr_grain_ctb_cnt = 0;
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs = ps_codec->apu4_pic_vert_bs[ps_codec->u4_parsing_view];
+ ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs = ps_codec->apu4_pic_horz_bs[ps_codec->u4_parsing_view];
+ ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp = ps_codec->apu1_pic_qp[ps_codec->u4_parsing_view];
+ ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb = ps_codec->apu1_pic_qp_const_in_ctb[ps_codec->u4_parsing_view];
+
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp;
+ ps_codec->s_parse.s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb;
+
+ ps_codec->s_parse.s_deblk_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->apu1_pic_no_loop_filter_flag[ps_codec->u4_parsing_view];
+ ps_codec->s_parse.s_sao_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->apu1_pic_no_loop_filter_flag[ps_codec->u4_parsing_view];
+
+ ps_codec->s_parse.s_deblk_ctxt.ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+
+ ps_codec->s_parse.ps_pic_sao = (sao_t *)ps_codec->aps_pic_sao[ps_codec->u4_parsing_view];
+ ps_codec->s_parse.s_sao_ctxt.ps_pic_sao = (sao_t *)ps_codec->aps_pic_sao[ps_codec->u4_parsing_view];
+#endif
+ for(i = 0; i < MAX_PROCESS_THREADS; i++)
+ {
+ ps_codec->as_process[i].pu4_pic_pu_idx = ps_codec->s_parse.pu4_pic_pu_idx;
+ ps_codec->as_process[i].ps_pic_pu = ps_codec->s_parse.ps_pic_pu;
+ ps_codec->as_process[i].pu1_pic_pu_map = ps_codec->s_parse.pu1_pic_pu_map;
+ ps_codec->as_process[i].pu4_pic_tu_idx = ps_codec->s_parse.pu4_pic_tu_idx;
+ ps_codec->as_process[i].ps_pic_tu = ps_codec->s_parse.ps_pic_tu;
+ ps_codec->as_process[i].pu1_pic_tu_map = ps_codec->s_parse.pu1_pic_tu_map;
+ ps_codec->as_process[i].pv_pic_tu_coeff_data = ps_codec->s_parse.pv_pic_tu_coeff_data;
+ ps_codec->as_process[i].i4_cur_mv_bank_buf_id = cur_mv_bank_buf_id;
+ ps_codec->as_process[i].s_sao_ctxt.pu1_slice_idx = ps_codec->as_process[i].pu1_slice_idx;
+ ps_codec->as_process[i].s_sao_ctxt.pu1_tile_idx = ps_codec->as_process[i].pu1_tile_idx;
+
+ /* TODO: For asynchronous api the following initializations related to picture
+ * buffer should be moved to processing side
+ */
+ ps_codec->as_process[i].pu1_cur_pic_luma = pu1_cur_pic_luma;
+ ps_codec->as_process[i].pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+ ps_codec->as_process[i].ps_cur_pic = ps_cur_pic;
+ ps_codec->as_process[i].i4_cur_pic_buf_id = cur_pic_buf_id;
+
+ ps_codec->as_process[i].ps_out_buffer = ps_codec->ps_out_buffer;
+ if(1 < ps_codec->i4_num_cores)
+ {
+ ps_codec->as_process[i].i4_check_parse_status = 1;
+ ps_codec->as_process[i].i4_check_proc_status = 1;
+ }
+ else
+ {
+ ps_codec->as_process[i].i4_check_parse_status = 0;
+ ps_codec->as_process[i].i4_check_proc_status = 0;
+ }
+ ps_codec->as_process[i].pu1_pic_intra_flag = ps_codec->s_parse.pu1_pic_intra_flag;
+ ps_codec->as_process[i].pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+ ps_codec->as_process[i].i4_init_done = 0;
+
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_tu_idx = ps_codec->as_process[i].pu4_pic_tu_idx;
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_pu_idx = ps_codec->as_process[i].pu4_pic_pu_idx;
+ ps_codec->as_process[i].s_bs_ctxt.ps_pic_pu = ps_codec->as_process[i].ps_pic_pu;
+#ifdef GPU_BUILD
+ ps_codec->as_process[i].u4_gpu_inter_flag = ps_codec->u4_gpu_enabled;
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs;
+ ps_codec->as_process[i].s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs;
+ ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp;
+ ps_codec->as_process[i].s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb;
+
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_vert_bs;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = (UWORD32 *)ps_codec->s_parse.s_bs_ctxt.pu4_pic_horz_bs;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp;
+ ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp_const_in_ctb = (UWORD8 *)ps_codec->s_parse.s_bs_ctxt.pu1_pic_qp_const_in_ctb;
+ ps_codec->as_process[i].pu1_proc_map = ps_codec->apu1_proc_map[ps_codec->u4_parsing_view];
+
+ ps_codec->as_process[i].pu1_slice_idx = (UWORD16 *)ps_mv_buf->pu1_pic_slice_map;
+
+#else
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_codec->as_process[i].pu1_proc_map = ps_codec->pu1_proc_map;
+#endif
+#endif
+ ps_codec->as_process[i].s_deblk_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+ ps_codec->as_process[i].s_deblk_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+ ps_codec->as_process[i].s_deblk_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_codec->as_process[i].s_deblk_ctxt.ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+#endif
+ ps_codec->as_process[i].s_sao_ctxt.pu1_pic_no_loop_filter_flag = ps_codec->s_parse.pu1_pic_no_loop_filter_flag;
+ ps_codec->as_process[i].s_sao_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+ ps_codec->as_process[i].s_sao_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ ps_codec->as_process[i].s_sao_ctxt.ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+ ps_codec->as_process[i].ps_slice_hdr_base = ps_codec->s_parse.ps_slice_hdr_base;
+
+ ps_codec->as_process[i].s_sao_ctxt.ps_pic_sao = ps_codec->s_parse.ps_pic_sao;
+#endif
+ if(i < (ps_codec->i4_num_cores - 1))
+ {
+ ithread_create(ps_codec->apv_process_thread_handle[i], NULL,
+ (void *)ihevcd_process_thread,
+ (void *)&ps_codec->as_process[i]);
+ ps_codec->ai4_process_thread_created[i] = 1;
+ }
+ else
+ {
+ ps_codec->ai4_process_thread_created[i] = 0;
+ }
+
+ }
+#ifdef GPU_BUILD
+ memset(ps_codec->apu1_proc_map[ps_codec->u4_parsing_view], 0, ps_sps->i4_pic_size_in_ctb);
+#else
+#ifdef GPU_BUILD
+ //TODO GPU : Later define it for ARM only version as well
+ // and remove from above.
+ memset(ps_codec->pu1_proc_map, 0, ps_sps->i4_pic_size_in_ctb);
+#endif
+#endif
+ ps_codec->s_parse.s_deblk_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+ ps_codec->s_parse.s_deblk_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+
+ ps_codec->s_parse.s_sao_ctxt.pu1_cur_pic_luma = pu1_cur_pic_luma;
+ ps_codec->s_parse.s_sao_ctxt.pu1_cur_pic_chroma = pu1_cur_pic_chroma;
+ }
+    /* Since any input bitstream buffer that contains slice data will be sent to
+     * output (even in case of error), this buffer is added to the display queue and
+     * the next buffer in the display queue will be returned as the display buffer.
+     * Note: If format conversion (or frame copy) is used and is scheduled in a
+     * different thread, then it has to check that the processing for the current row
+     * is complete before it copies/converts a given row. In case of low delay or of
+     * B pictures, the current frame being decoded has to be returned, which requires
+     * a status check to ensure that the current row is reconstructed before copying.
+     */
+ /* Add current picture to display manager */
+#ifndef GPU_BUILD
+ {
+ WORD32 abs_poc;
+ slice_header_t *ps_slice_hdr;
+ ps_slice_hdr = ps_codec->s_parse.ps_slice_hdr;
+ abs_poc = ps_slice_hdr->i4_abs_pic_order_cnt;
+ ihevc_disp_mgr_add((disp_mgr_t *)ps_codec->pv_disp_buf_mgr,
+ ps_codec->as_process[0].i4_cur_pic_buf_id,
+ abs_poc,
+ ps_codec->as_process[0].ps_cur_pic);
+ }
+#endif
+ ps_codec->ps_disp_buf = NULL;
+    /* Get the picture to be displayed if the number of pictures decoded exceeds the max allowed reorder */
+    /* Since the current picture will be decoded, the check is for >= instead of > */
+#ifdef GPU_BUILD
+ //TODO OPENCL delay this by one frame
+ //TODO GPU : Should it be just +1
+ if(((WORD32)(ps_codec->u4_pic_cnt - ps_codec->u4_disp_cnt) >= (ps_sps->ai1_sps_max_num_reorder_pics[ps_sps->i1_sps_max_sub_layers - 1]+2)) ||
+#else
+ if(((WORD32)(ps_codec->u4_pic_cnt - ps_codec->u4_disp_cnt) >= ps_sps->ai1_sps_max_num_reorder_pics[ps_sps->i1_sps_max_sub_layers - 1]) ||
+#endif
+ ((WORD32)(ps_codec->u4_pic_cnt - ps_codec->u4_disp_cnt) >= ps_codec->i4_init_num_reorder))
+
+ {
+ ps_codec->ps_disp_buf = (pic_buf_t *)ihevc_disp_mgr_get((disp_mgr_t *)ps_codec->pv_disp_buf_mgr, &ps_codec->i4_disp_buf_id);
+ ps_codec->u4_disp_cnt++;
+ }
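+    /* Example: with sps_max_num_reorder_pics = 2, the check above starts
+     * returning display pictures once two more pictures have been decoded
+     * than displayed */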
+
+ ps_codec->s_fmt_conv.i4_cur_row = 0;
+ /* Set number of rows to be processed at a time */
+ ps_codec->s_fmt_conv.i4_num_rows = 4;
+
+ if(ps_codec->u4_enable_fmt_conv_ahead && (ps_codec->i4_num_cores > 1))
+ {
+ process_ctxt_t *ps_proc;
+
+ /* i4_num_cores - 1 contexts are currently being used by other threads */
+ ps_proc = &ps_codec->as_process[ps_codec->i4_num_cores - 1];
+
+        /* If the frames being decoded and displayed are different, schedule format conversion jobs;
+         * this will keep the proc threads busy and let the parse thread decode a few CTBs ahead.
+         * If the frames being decoded and displayed are the same, then format conversion is scheduled later.
+ */
+ if((ps_codec->ps_disp_buf) && (ps_codec->i4_disp_buf_id != ps_proc->i4_cur_pic_buf_id) &&
+ ((0 == ps_codec->i4_share_disp_buf) || (IV_YUV_420P == ps_codec->e_chroma_fmt)))
+ {
+
+ for(i = 0; i < ps_sps->i2_pic_ht_in_ctb; i++)
+ {
+ proc_job_t s_job;
+ IHEVCD_ERROR_T ret;
+ s_job.i4_cmd = CMD_FMTCONV;
+ s_job.i2_ctb_cnt = 0;
+ s_job.i2_ctb_x = 0;
+ s_job.i2_ctb_y = i;
+ s_job.i2_slice_idx = 0;
+ s_job.i4_tu_coeff_data_ofst = 0;
+ ret = ihevcd_jobq_queue((jobq_t *)ps_codec->s_parse.pv_proc_jobq,
+ &s_job, sizeof(proc_job_t), 1);
+ if(ret != (IHEVCD_ERROR_T)IHEVCD_SUCCESS)
+ return ret;
+ }
+ }
+ }
+
+#ifdef GPU_BUILD
+ /* Pic init for Opencl device */
+ ihevcd_gpu_mc_pic_init(ps_codec);
+#endif
+
+ return ret;
+}
+
+
diff --git a/decoder/ihevcd_utils.h b/decoder/ihevcd_utils.h
new file mode 100644
index 0000000..c2cbcc4
--- /dev/null
+++ b/decoder/ihevcd_utils.h
@@ -0,0 +1,60 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_utils.h
+*
+* @brief
+* Contains miscellaneous utility functions such as init() etc
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IHEVCD_UTILS_H_
+#define _IHEVCD_UTILS_H_
+
+WORD32 ihevcd_get_lvl_idx(WORD32 level);
+WORD32 ihevcd_get_dpb_size(WORD32 level, WORD32 pic_size);
+WORD32 ihevcd_get_pic_mv_bank_size(WORD32 num_luma_samples);
+WORD32 ihevcd_get_tu_data_size(WORD32 num_luma_samples);
+WORD32 ihevcd_nctb_cnt(codec_t *ps_codec, sps_t *ps_sps);
+WORD32 ihevcd_get_max_luma_samples(WORD32 level);
+IHEVCD_ERROR_T ihevcd_get_tile_pos(pps_t *ps_pps,
+ sps_t *ps_sps,
+ WORD32 ctb_x,
+ WORD32 ctb_y,
+ WORD32 *pi4_ctb_tile_x,
+ WORD32 *pi4_ctb_tile_y,
+ WORD32 *pi4_tile_idx);
+IHEVCD_ERROR_T ihevcd_parse_pic_init(codec_t *ps_codec);
+WORD32 ihevcd_get_total_pic_buf_size(WORD32 pic_size,
+ WORD32 level,
+ WORD32 horz_pad,
+ WORD32 vert_pad,
+ WORD32 num_ref_frames,
+ WORD32 num_reorder_frames);
+#endif /* _IHEVCD_UTILS_H_ */
diff --git a/decoder/ihevcd_version.c b/decoder/ihevcd_version.c
new file mode 100644
index 0000000..a47c6fc
--- /dev/null
+++ b/decoder/ihevcd_version.c
@@ -0,0 +1,131 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_version.c
+*
+* @brief
+* Contains version info for HEVC decoder
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* - ihevcd_get_version()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+/**
+ * Name of the codec
+ */
+#define CODEC_NAME "HEVCDEC"
+/**
+ * Codec release type, production or evaluation
+ */
+#define CODEC_RELEASE_TYPE "production"
+/**
+ * Version string. The first two digits signify the major version and the last
+ * two the minor version. Increment the major version for an API change or a
+ * major feature update
+ */
+#define CODEC_RELEASE_VER "04.01"
+/**
+ * Vendor name
+ */
+#define CODEC_VENDOR "ITTIAM"
+
+/**
+*******************************************************************************
+* Concatenates various strings to form a version string
+*******************************************************************************
+*/
+#define VERSION(version_string, codec_name, codec_release_type, codec_release_ver, codec_vendor) \
+ strcpy(version_string,"@(#)Id:"); \
+ strcat(version_string,codec_name); \
+ strcat(version_string,"_"); \
+ strcat(version_string,codec_release_type); \
+ strcat(version_string," Ver:"); \
+ strcat(version_string,codec_release_ver); \
+ strcat(version_string," Released by "); \
+ strcat(version_string,codec_vendor); \
+ strcat(version_string," Build: "); \
+ strcat(version_string,__DATE__); \
+ strcat(version_string," @ "); \
+ strcat(version_string,__TIME__);
+
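+/* For illustration: with the macros above, VERSION() builds a string of the
+ * form "@(#)Id:HEVCDEC_production Ver:04.01 Released by ITTIAM Build: <date>
+ * @ <time>", where <date> and <time> come from __DATE__ and __TIME__ */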
+
+/**
+*******************************************************************************
+*
+* @brief
+* Fills the version info in the given string
+*
+* @par Description:
+*
+*
+* @param[out] pc_version_string
+* Pointer to hold version info
+*
+* @param[in] u4_version_buffer_size
+* Size of the buffer passed
+*
+* @returns Status
+*
+* @remarks
+*
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T ihevcd_get_version(CHAR *pc_version_string,
+ UWORD32 u4_version_buffer_size)
+{
+ CHAR ac_version_tmp[512];
+ VERSION(ac_version_tmp, CODEC_NAME, CODEC_RELEASE_TYPE, CODEC_RELEASE_VER, CODEC_VENDOR);
+
+ if(u4_version_buffer_size >= (strlen(ac_version_tmp) + 1))
+ {
+ memcpy(pc_version_string, ac_version_tmp, (strlen(ac_version_tmp) + 1));
+ return IV_SUCCESS;
+ }
+ else
+ {
+ return IV_FAIL;
+ }
+
+}
+
+
diff --git a/decoder/mips/ihevcd_function_selector.c b/decoder/mips/ihevcd_function_selector.c
new file mode 100644
index 0000000..da734d7
--- /dev/null
+++ b/decoder/mips/ihevcd_function_selector.c
@@ -0,0 +1,85 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector.c
+*
+* @brief
+* Contains functions to initialize function pointers used in hevc
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_mips_generic(codec_t *ps_codec);
+void ihevcd_init_function_ptr_mips_32(codec_t *ps_codec);
+
+void ihevcd_init_function_ptr(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+ switch(ps_codec->e_processor_arch)
+ {
+#if ENABLE_MIPS32_SIMD
+ case ARCH_MIPS_32:
+ ihevcd_init_function_ptr_mips_32(ps_codec);
+ break;
+#endif
+ case ARCH_MIPS_GENERIC:
+ default:
+ ihevcd_init_function_ptr_mips_generic(ps_codec);
+ break;
+ }
+}
+
+void ihevcd_init_arch(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+ ps_codec->e_processor_arch = ARCH_MIPS_32;
+}
diff --git a/decoder/mips/ihevcd_function_selector_mips_generic.c b/decoder/mips/ihevcd_function_selector_mips_generic.c
new file mode 100644
index 0000000..88c56f4
--- /dev/null
+++ b/decoder/mips/ihevcd_function_selector_mips_generic.c
@@ -0,0 +1,160 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_mips_generic.c
+*
+* @brief
+* Contains functions to initialize the generic (non-SIMD) function pointers used in hevc
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_mips_generic(codec_t *ps_codec)
+{
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma;
+}
diff --git a/decoder/x86/ihevcd_fmt_conv_ssse3_intr.c b/decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
new file mode 100644
index 0000000..f963e66
--- /dev/null
+++ b/decoder/x86/ihevcd_fmt_conv_ssse3_intr.c
@@ -0,0 +1,270 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_fmt_conv_ssse3_intr.c
+*
+* @brief
+* Platform specific intrinsic implementation of certain functions
+*
+* @author
+* Ittiam
+* @par List of Functions:
+* - ihevcd_fmt_conv_420sp_to_420p_ssse3
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include "string.h"
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+#include <immintrin.h>
+
+
+void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src,
+ UWORD8 *pu1_uv_src,
+ UWORD8 *pu1_y_dst,
+ UWORD8 *pu1_u_dst,
+ UWORD8 *pu1_v_dst,
+ WORD32 wd,
+ WORD32 ht,
+ WORD32 src_y_strd,
+ WORD32 src_uv_strd,
+ WORD32 dst_y_strd,
+ WORD32 dst_uv_strd,
+ WORD32 is_u_first,
+ WORD32 disable_luma_copy)
+{
+ UWORD8 *pu1_src, *pu1_dst;
+ UWORD8 *pu1_u_src, *pu1_v_src;
+ WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows;
+ WORD32 i, j;
+
+ cols = 0;
+ pu1_u_src = (UWORD8 *)pu1_uv_src;
+ pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
+ if(0 == disable_luma_copy)
+ {
+ /* copy luma */
+ pu1_src = (UWORD8 *)pu1_y_src;
+ pu1_dst = (UWORD8 *)pu1_y_dst;
+
+ num_rows = ht;
+ num_cols = wd;
+
+ src_strd = src_y_strd;
+ dst_strd = dst_y_strd;
+ for(i = 0; i < num_rows; i++)
+ {
+ memcpy(pu1_dst, pu1_src, num_cols);
+ pu1_dst += dst_strd;
+ pu1_src += src_strd;
+ }
+ }
+
+ /* de-interleave U and V and copy to destination */
+ if(!is_u_first)
+ {
+ UWORD8 *temp = pu1_u_dst;
+ pu1_u_dst = pu1_v_dst;
+ pu1_v_dst = temp;
+
+ pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
+ pu1_v_src = (UWORD8 *)pu1_uv_src;
+ }
+
+ {
+ __m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b;
+ __m128i temp0_8x16b, temp1_8x16b, alt_first_mask;
+
+ UWORD8 FIRST_ALT_SHUFFLE[16] = {
+ 0x00, 0x02, 0x04, 0x06,
+ 0x08, 0x0A, 0x0C, 0x0E,
+ 0x01, 0x03, 0x05, 0x07,
+ 0x09, 0x0B, 0x0D, 0x0F };
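+        /* The shuffle gathers the even (U) bytes into the low 8 bytes and the
+         * odd (V) bytes into the high 8 bytes of each 16-byte vector;
+         * _mm_unpacklo_epi64/_mm_unpackhi_epi64 then merge two such vectors
+         * into 16 U bytes and 16 V bytes respectively */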
+
+ PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0)
+
+ num_rows = ht >> 1;
+ num_cols = wd >> 1;
+
+ src_strd = src_uv_strd;
+ dst_strd = dst_uv_strd;
+
+ alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]);
+
+ if(num_cols > 15)
+ {
+ cols = num_cols >> 4;
+
+ for(i = 0; i < (num_rows >> 2); i++)
+ {
+ UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
+
+ PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0)
+ PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0)
+
+ pu1_uv_src_temp = pu1_uv_src;
+ pu1_u_dst_temp = pu1_u_dst;
+ pu1_v_dst_temp = pu1_v_dst;
+
+ for(j = 0; j < cols; j++)
+ {
+
+ /**** Row 0 ***/
+ src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
+ src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
+
+ temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+ temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+ src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+ src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
+
+ /**** Row 1 ***/
+ src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd)));
+ src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16));
+
+ temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+ temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+ src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+ src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b);
+
+ /**** Row 2 ***/
+ src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd)));
+ src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16));
+
+ temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+ temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+ src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+ src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b);
+
+ /**** Row 3 ***/
+ src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd)));
+ src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16));
+
+ temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+ temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+ src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+ src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b);
+
+ pu1_u_dst_temp += 16;
+ pu1_v_dst_temp += 16;
+ pu1_uv_src_temp += 32;
+ }
+
+ pu1_u_dst += 4 * dst_strd;
+ pu1_v_dst += 4 * dst_strd;
+ pu1_uv_src += 4 * src_strd;
+ //pu1_v_src += src_strd;
+ }
+ rows = num_rows & 0x3;
+ if(rows)
+ {
+ for(i = 0; i < rows; i++)
+ {
+ UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
+
+ pu1_uv_src_temp = pu1_uv_src;
+ pu1_u_dst_temp = pu1_u_dst;
+ pu1_v_dst_temp = pu1_v_dst;
+
+ for(j = 0; j < cols; j++)
+ {
+
+ src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
+ src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
+
+ temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
+ temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
+
+ src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
+ src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
+
+ _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
+ _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
+
+ pu1_u_dst_temp += 16;
+ pu1_v_dst_temp += 16;
+ pu1_uv_src_temp += 32;
+ }
+
+ pu1_u_dst += dst_strd;
+ pu1_v_dst += dst_strd;
+ pu1_uv_src += src_strd;
+ }
+ }
+ pu1_u_dst -= (num_rows * dst_strd);
+ pu1_v_dst -= (num_rows * dst_strd);
+ num_cols &= 0x0F;
+ }
+ if(num_cols)
+ {
+ pu1_u_dst += (cols << 4);
+ pu1_v_dst += (cols << 4);
+ pu1_u_src += 2 * (cols << 4);
+ pu1_v_src += 2 * (cols << 4);
+ for(i = 0; i < num_rows; i++)
+ {
+ for(j = 0; j < num_cols; j++)
+ {
+ pu1_u_dst[j] = pu1_u_src[j * 2];
+ pu1_v_dst[j] = pu1_v_src[j * 2];
+ }
+
+ pu1_u_dst += dst_strd;
+ pu1_v_dst += dst_strd;
+ pu1_u_src += src_strd;
+ pu1_v_src += src_strd;
+ }
+ }
+ }
+ return;
+}
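+
+/* Usage sketch (illustrative only; pu1_y, pu1_uv, pu1_out_* and the strides
+ * are assumptions for the example, not decoder state): converting one NV12
+ * (420SP) frame of wd x ht pixels to I420, with U first and luma copy enabled:
+ *
+ *   ihevcd_fmt_conv_420sp_to_420p_ssse3(pu1_y, pu1_uv,
+ *                                       pu1_out_y, pu1_out_u, pu1_out_v,
+ *                                       wd, ht, y_strd, uv_strd,
+ *                                       wd, wd >> 1, 1, 0);
+ */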
diff --git a/decoder/x86/ihevcd_function_selector.c b/decoder/x86/ihevcd_function_selector.c
new file mode 100644
index 0000000..b058a62
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector.c
@@ -0,0 +1,105 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector.c
+*
+* @brief
+* Contains functions to initialize function pointers used in hevc
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+ switch(ps_codec->e_processor_arch)
+ {
+ case ARCH_X86_GENERIC:
+ ihevcd_init_function_ptr_generic(pv_codec);
+ break;
+ case ARCH_X86_SSSE3:
+ ihevcd_init_function_ptr_ssse3(pv_codec);
+ break;
+ case ARCH_X86_SSE42:
+ ihevcd_init_function_ptr_sse42(pv_codec);
+ break;
+ case ARCH_X86_AVX2:
+#ifndef DISABLE_AVX2
+ ihevcd_init_function_ptr_avx2(pv_codec);
+#else
+ ihevcd_init_function_ptr_sse42(pv_codec);
+#endif
+ break;
+ default:
+ ihevcd_init_function_ptr_ssse3(pv_codec);
+ break;
+ }
+}
+
+void ihevcd_init_arch(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+
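+    /* DEFAULT_ARCH, when defined by the build system, selects the baseline x86
+     * variant at compile time; SSSE3 is the fallback in all other cases */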
+#ifdef DEFAULT_ARCH
+#if DEFAULT_ARCH == D_ARCH_X86_GENERIC
+ ps_codec->e_processor_arch = ARCH_X86_GENERIC;
+#elif DEFAULT_ARCH == D_ARCH_X86_SSE42
+ ps_codec->e_processor_arch = ARCH_X86_SSE42;
+#elif DEFAULT_ARCH == D_ARCH_X86_AVX2
+ ps_codec->e_processor_arch = ARCH_X86_AVX2;
+#else
+ ps_codec->e_processor_arch = ARCH_X86_SSSE3;
+#endif
+#else
+ ps_codec->e_processor_arch = ARCH_X86_SSSE3;
+#endif
+}
diff --git a/decoder/x86/ihevcd_function_selector_generic.c b/decoder/x86/ihevcd_function_selector_generic.c
new file mode 100644
index 0000000..f8b53ad
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector_generic.c
@@ -0,0 +1,162 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_generic.c
+*
+* @brief
+* Contains functions to initialize function pointers used in hevc
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_generic(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma;
+}
diff --git a/decoder/x86/ihevcd_function_selector_sse42.c b/decoder/x86/ihevcd_function_selector_sse42.c
new file mode 100644
index 0000000..fe46cc4
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector_sse42.c
@@ -0,0 +1,162 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_sse42.c
+*
+* @brief
+* Contains functions to initialize the function pointers used in the HEVC decoder with their SSE4.2 variants
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* - ihevcd_init_function_ptr_sse42()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_sse42(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+
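+ /* Kernels without an SSE4.2 variant reuse the SSSE3 or plain C versions */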
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy_sse42;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out_sse42;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out_sse42;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9_sse42;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver_sse42;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1_sse42;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4_sse42;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8_sse42;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16_ssse3;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32_sse42;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi_sse42;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default_sse42;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni_sse42;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi_sse42;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default_ssse3;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni_sse42;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma_ssse3;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p_ssse3;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma_sse42;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma_sse42;
+}
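+
+/* A minimal sketch of how a top-level init routine might choose among the
+ * selector variants at runtime. __builtin_cpu_supports() is GCC/Clang
+ * specific, and ihevcd_init_function_ptr_generic is a hypothetical name for
+ * the plain-C table set up earlier in this patch:
+ *
+ * void ihevcd_init_function_ptr(void *pv_codec)
+ * {
+ *     if(__builtin_cpu_supports("sse4.2"))
+ *         ihevcd_init_function_ptr_sse42(pv_codec);
+ *     else if(__builtin_cpu_supports("ssse3"))
+ *         ihevcd_init_function_ptr_ssse3(pv_codec);
+ *     else
+ *         ihevcd_init_function_ptr_generic(pv_codec);
+ * }
+ */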
diff --git a/decoder/x86/ihevcd_function_selector_ssse3.c b/decoder/x86/ihevcd_function_selector_ssse3.c
new file mode 100644
index 0000000..fdb471a
--- /dev/null
+++ b/decoder/x86/ihevcd_function_selector_ssse3.c
@@ -0,0 +1,162 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_function_selector_ssse3.c
+*
+* @brief
+* Contains functions to initialize the function pointers used in the HEVC decoder with their SSSE3 variants
+*
+* @author
+* Naveen
+*
+* @par List of Functions:
+* - ihevcd_init_function_ptr_ssse3()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ihevc_typedefs.h"
+#include "iv.h"
+#include "ivd.h"
+#include "ihevc_defs.h"
+#include "ihevc_debug.h"
+#include "ihevc_structs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevc_cabac_tables.h"
+#include "ihevc_disp_mgr.h"
+#include "ihevc_buf_mgr.h"
+#include "ihevc_dpb_mgr.h"
+#include "ihevc_error.h"
+
+#include "ihevcd_defs.h"
+#include "ihevcd_function_selector.h"
+#include "ihevcd_structs.h"
+
+void ihevcd_init_function_ptr_ssse3(void *pv_codec)
+{
+ codec_t *ps_codec = (codec_t *)pv_codec;
+
+ ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr = &ihevc_deblk_chroma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr = &ihevc_deblk_chroma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr = &ihevc_deblk_luma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr = &ihevc_deblk_luma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_fptr = &ihevc_inter_pred_chroma_copy_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_copy_w16out_fptr = &ihevc_inter_pred_chroma_copy_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_fptr = &ihevc_inter_pred_chroma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_horz_w16out_fptr = &ihevc_inter_pred_chroma_horz_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_fptr = &ihevc_inter_pred_chroma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_fptr = &ihevc_inter_pred_chroma_vert_w16inp_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_chroma_vert_w16out_fptr = &ihevc_inter_pred_chroma_vert_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_fptr = &ihevc_inter_pred_luma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_fptr = &ihevc_inter_pred_luma_vert_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16out_fptr = &ihevc_inter_pred_luma_vert_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_fptr = &ihevc_inter_pred_luma_vert_w16inp_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_fptr = &ihevc_inter_pred_luma_copy_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_copy_w16out_fptr = &ihevc_inter_pred_luma_copy_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_horz_w16out_fptr = &ihevc_inter_pred_luma_horz_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_inter_pred_luma_vert_w16inp_w16out_fptr = &ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr = &ihevc_intra_pred_chroma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr = &ihevc_intra_pred_luma_ref_substitution;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr = &ihevc_intra_pred_luma_ref_subst_all_avlble;
+ ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr = &ihevc_intra_pred_ref_filtering_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_dc_fptr = &ihevc_intra_pred_chroma_dc_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_horz_fptr = &ihevc_intra_pred_chroma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode2_fptr = &ihevc_intra_pred_chroma_mode2_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_18_34_fptr = &ihevc_intra_pred_chroma_mode_18_34_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_27_to_33_fptr = &ihevc_intra_pred_chroma_mode_27_to_33_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_3_to_9_fptr = &ihevc_intra_pred_chroma_mode_3_to_9_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_planar_fptr = &ihevc_intra_pred_chroma_planar_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_ver_fptr = &ihevc_intra_pred_chroma_ver_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_11_to_17_fptr = &ihevc_intra_pred_chroma_mode_11_to_17_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_chroma_mode_19_to_25_fptr = &ihevc_intra_pred_chroma_mode_19_to_25_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_11_to_17_fptr = &ihevc_intra_pred_luma_mode_11_to_17_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_19_to_25_fptr = &ihevc_intra_pred_luma_mode_19_to_25_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_dc_fptr = &ihevc_intra_pred_luma_dc_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_horz_fptr = &ihevc_intra_pred_luma_horz_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode2_fptr = &ihevc_intra_pred_luma_mode2_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_18_34_fptr = &ihevc_intra_pred_luma_mode_18_34_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_27_to_33_fptr = &ihevc_intra_pred_luma_mode_27_to_33_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_mode_3_to_9_fptr = &ihevc_intra_pred_luma_mode_3_to_9_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_planar_fptr = &ihevc_intra_pred_luma_planar_ssse3;
+ ps_codec->s_func_selector.ihevc_intra_pred_luma_ver_fptr = &ihevc_intra_pred_luma_ver_ssse3;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_ttype1_fptr = &ihevc_itrans_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_itrans_4x4_fptr = &ihevc_itrans_4x4;
+ ps_codec->s_func_selector.ihevc_itrans_8x8_fptr = &ihevc_itrans_8x8;
+ ps_codec->s_func_selector.ihevc_itrans_16x16_fptr = &ihevc_itrans_16x16;
+ ps_codec->s_func_selector.ihevc_itrans_32x32_fptr = &ihevc_itrans_32x32;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_ttype1_fptr = &ihevc_itrans_recon_4x4_ttype1_ssse3;
+ ps_codec->s_func_selector.ihevc_itrans_recon_4x4_fptr = &ihevc_itrans_recon_4x4_ssse3;
+ ps_codec->s_func_selector.ihevc_itrans_recon_8x8_fptr = &ihevc_itrans_recon_8x8_ssse3;
+ ps_codec->s_func_selector.ihevc_itrans_recon_16x16_fptr = &ihevc_itrans_recon_16x16_ssse3;
+ ps_codec->s_func_selector.ihevc_itrans_recon_32x32_fptr = &ihevc_itrans_recon_32x32_ssse3;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_4x4_fptr = &ihevc_chroma_itrans_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_8x8_fptr = &ihevc_chroma_itrans_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_itrans_recon_16x16_fptr = &ihevc_chroma_itrans_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_4x4_ttype1_fptr = &ihevc_recon_4x4_ttype1;
+ ps_codec->s_func_selector.ihevc_recon_4x4_fptr = &ihevc_recon_4x4;
+ ps_codec->s_func_selector.ihevc_recon_8x8_fptr = &ihevc_recon_8x8;
+ ps_codec->s_func_selector.ihevc_recon_16x16_fptr = &ihevc_recon_16x16;
+ ps_codec->s_func_selector.ihevc_recon_32x32_fptr = &ihevc_recon_32x32;
+ ps_codec->s_func_selector.ihevc_chroma_recon_4x4_fptr = &ihevc_chroma_recon_4x4;
+ ps_codec->s_func_selector.ihevc_chroma_recon_8x8_fptr = &ihevc_chroma_recon_8x8;
+ ps_codec->s_func_selector.ihevc_chroma_recon_16x16_fptr = &ihevc_chroma_recon_16x16;
+ ps_codec->s_func_selector.ihevc_memcpy_mul_8_fptr = &ihevc_memcpy_mul_8;
+ ps_codec->s_func_selector.ihevc_memcpy_fptr = &ihevc_memcpy;
+ ps_codec->s_func_selector.ihevc_memset_mul_8_fptr = &ihevc_memset_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_fptr = &ihevc_memset;
+ ps_codec->s_func_selector.ihevc_memset_16bit_mul_8_fptr = &ihevc_memset_16bit_mul_8;
+ ps_codec->s_func_selector.ihevc_memset_16bit_fptr = &ihevc_memset_16bit;
+ ps_codec->s_func_selector.ihevc_pad_left_luma_fptr = &ihevc_pad_left_luma;
+ ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr = &ihevc_pad_left_chroma;
+ ps_codec->s_func_selector.ihevc_pad_right_luma_fptr = &ihevc_pad_right_luma;
+ ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr = &ihevc_pad_right_chroma;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_fptr = &ihevc_weighted_pred_bi_ssse3;
+ ps_codec->s_func_selector.ihevc_weighted_pred_bi_default_fptr = &ihevc_weighted_pred_bi_default_ssse3;
+ ps_codec->s_func_selector.ihevc_weighted_pred_uni_fptr = &ihevc_weighted_pred_uni_ssse3;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_fptr = &ihevc_weighted_pred_chroma_bi_ssse3;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_bi_default_fptr = &ihevc_weighted_pred_chroma_bi_default_ssse3;
+ ps_codec->s_func_selector.ihevc_weighted_pred_chroma_uni_fptr = &ihevc_weighted_pred_chroma_uni_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_luma_fptr = &ihevc_sao_band_offset_luma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_band_offset_chroma_fptr = &ihevc_sao_band_offset_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_fptr = &ihevc_sao_edge_offset_class0_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class0_chroma_fptr = &ihevc_sao_edge_offset_class0_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_fptr = &ihevc_sao_edge_offset_class1_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class1_chroma_fptr = &ihevc_sao_edge_offset_class1_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_fptr = &ihevc_sao_edge_offset_class2_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class2_chroma_fptr = &ihevc_sao_edge_offset_class2_chroma_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_fptr = &ihevc_sao_edge_offset_class3_ssse3;
+ ps_codec->s_func_selector.ihevc_sao_edge_offset_class3_chroma_fptr = &ihevc_sao_edge_offset_class3_chroma_ssse3;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgba8888_fptr = &ihevcd_fmt_conv_420sp_to_rgba8888;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_rgb565_fptr = &ihevcd_fmt_conv_420sp_to_rgb565;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420sp_fptr = &ihevcd_fmt_conv_420sp_to_420sp;
+ ps_codec->s_func_selector.ihevcd_fmt_conv_420sp_to_420p_fptr = &ihevcd_fmt_conv_420sp_to_420p;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_luma_fptr = &ihevcd_itrans_recon_dc_luma_ssse3;
+ ps_codec->s_func_selector.ihevcd_itrans_recon_dc_chroma_fptr = &ihevcd_itrans_recon_dc_chroma_ssse3;
+}
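+
+/* Once a table is populated, the decoder invokes every kernel through the
+ * selector, so the hot paths stay ISA-agnostic. Illustrative call site
+ * (argument names are placeholders; the signature follows the C prototype
+ * of ihevc_pad_left_luma used elsewhere in this patch):
+ *
+ * ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(pu1_luma, src_strd,
+ *                                                    ht, pad_size);
+ */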
diff --git a/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c b/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
new file mode 100644
index 0000000..55fa21b
--- /dev/null
+++ b/decoder/x86/ihevcd_it_rec_dc_sse42_intr.c
@@ -0,0 +1,401 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_it_rec_dc_sse42_intr.c
+*
+* @brief
+* Platform-specific SSE4.2 intrinsic implementations of the inverse transform DC reconstruction functions
+*
+* @author
+* Ittiam
+* @par List of Functions:
+* - ihevcd_itrans_recon_dc_luma_sse42
+* - ihevcd_itrans_recon_dc_chroma_sse42
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+
+#include <immintrin.h>
+
+
+void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+ WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_8;
+ __m128i m_temp_reg_9;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20, zero_8x16b;
+ __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ /* Replicate the 16-bit DC value across the 128-bit register */
+ m_temp_reg_20 = _mm_set1_epi16(dc_value);
+ zero_8x16b = _mm_setzero_si128();
+
+ if(trans_size == 4)
+ {
+ WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+
+ m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+ m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ else
+ {
+ WORD32 i, j;
+
+ for(i = 1; i <= trans_size; i += 4)
+ {
+ for(j = 1; j <= trans_size; j += 8)
+ {
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
+
+ m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+ m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+ m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+ m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+ m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+ m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+
+ pu1_pred += 8;
+ pu1_dst += 8;
+ }
+ pu1_pred += 4 * pred_strd - trans_size;
+ pu1_dst += 4 * dst_strd - trans_size;
+ }
+ }
+
+
+}
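+
+/* Reference sketch of what the SIMD path above computes: scale the lone DC
+ * coefficient through both inverse-transform shift stages (exactly as done
+ * with IT_SHIFT_STAGE_1/2 and CLIP_S16 above), then add the resulting
+ * constant to every prediction sample with unsigned 8-bit saturation.
+ * Illustrative only; CLIP_U8 is assumed to clamp to [0, 255]:
+ *
+ * for(row = 0; row < trans_size; row++)
+ *     for(col = 0; col < trans_size; col++)
+ *         pu1_dst[row * dst_strd + col] =
+ *             CLIP_U8(pu1_pred[row * pred_strd + col] + dc_value);
+ */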
+
+void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+ WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_8;
+ __m128i m_temp_reg_9;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20, zero_8x16b;
+ __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+ WORD32 shuffle_mask_4x4 = 0x06040200;
+ WORD32 unchanged_mask_4x4 = 0x07050301;
+ LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
+ LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ /* Replicate the 16-bit DC value across the 128-bit register */
+ m_temp_reg_20 = _mm_set1_epi16(dc_value);
+ zero_8x16b = _mm_setzero_si128();
+
+ if(trans_size == 4)
+ {
+ __m128i chroma_shuffle_mask_16x8b;
+ __m128i chroma_unchanged_mask_16x8b;
+ chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
+ chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
+
+ /*Load the prediction data*/
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ m_temp_reg_10 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+ m_temp_reg_11 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+ m_temp_reg_12 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+ m_temp_reg_13 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+ m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
+
+ m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+ /* Load the existing recon data so the interleaved 'V' samples are preserved while 'U' is reconstructed, and vice versa */
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
+
+ m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+ m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+ m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+ m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+
+ m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+ m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
+
+ /*Store the result in the destination*/
+ _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+ }
+ else
+ {
+ WORD32 i, j;
+ __m128i chroma_shuffle_mask_16x8b;
+ __m128i chroma_unchanged_mask_16x8b;
+ chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
+ chroma_unchanged_mask_16x8b =
+ _mm_loadl_epi64((__m128i *)(&unchanged_mask));
+
+ for(i = 0; i < trans_size; i += 4)
+ {
+ for(j = 0; j < trans_size; j += 8)
+ {
+
+ m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
+ m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ /*Retain only one chroma component*/
+ m_temp_reg_4 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+ m_temp_reg_5 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+ m_temp_reg_6 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+ m_temp_reg_7 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
+
+ m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+ m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+ m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+
+ /* Load the existing recon data so the interleaved 'V' samples are preserved while 'U' is reconstructed, and vice versa */
+ m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
+ m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
+ m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
+ m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
+
+ m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+ m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+ m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+ m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+ m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+ m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
+ m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
+ m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
+
+ m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
+ m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
+ m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
+
+ /*Store the result in the destination*/
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_8);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+ m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+ m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+ m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+
+ pu1_pred += 16;
+ pu1_dst += 16;
+ }
+
+ pu1_pred += 4 * pred_strd - 2 * trans_size;
+ pu1_dst += 4 * dst_strd - 2 * trans_size;
+ }
+ }
+
+
+}
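+
+/* Byte-level view of the chroma interleave handled above. The chroma plane
+ * is semi-planar (U and V alternating), and the function is presumably
+ * called with pu1_pred/pu1_dst pointing at the first sample of the
+ * component being reconstructed:
+ *
+ *   src bytes          : C0 X0 C1 X1 C2 X2 C3 X3 ...  (C = this component)
+ *   0x0E0C0A0806040200 : gathers even bytes -> C0 C1 C2 C3 (updated)
+ *   0x0F0D0B0907050301 : gathers odd bytes  -> X0 X1 X2 X3 (left untouched)
+ *
+ * After the DC value is added to the C samples, _mm_unpacklo_epi8
+ * re-interleaves them with the untouched X samples before the store.
+ */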
diff --git a/decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c b/decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
new file mode 100644
index 0000000..2857a07
--- /dev/null
+++ b/decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c
@@ -0,0 +1,401 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/**
+*******************************************************************************
+* @file
+* ihevcd_it_rec_dc_ssse3_intr.c
+*
+* @brief
+* Platform-specific SSSE3 intrinsic implementations of the inverse transform DC reconstruction functions
+*
+* @author
+* Ittiam
+* @par List of Functions:
+* - ihevcd_itrans_recon_dc_luma_ssse3
+* - ihevcd_itrans_recon_dc_chroma_ssse3
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include "ihevc_typedefs.h"
+#include "ihevc_defs.h"
+#include "ihevc_macros.h"
+#include "ihevc_platform_macros.h"
+#include "ihevcd_function_selector.h"
+
+#include <immintrin.h>
+
+void ihevcd_itrans_recon_dc_luma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+ WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_8;
+ __m128i m_temp_reg_9;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20, zero_8x16b;
+ __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ /* Replicate the 16-bit DC value across the 128-bit register */
+ m_temp_reg_20 = _mm_set1_epi16(dc_value);
+ zero_8x16b = _mm_setzero_si128();
+
+ if(trans_size == 4)
+ {
+ WORD32 *pi4_dst = (WORD32 *)pu1_dst;
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
+ m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+
+ m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+ m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
+ m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
+ m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
+ pu1_dst += dst_strd;
+ pi4_dst = (WORD32 *)(pu1_dst);
+
+ *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
+ }
+ else
+ {
+ WORD32 i, j;
+
+ for(i = 1; i <= trans_size; i += 4)
+ {
+ for(j = 1; j <= trans_size; j += 8)
+ {
+
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
+
+ m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+ m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+ m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+ m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+ m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+ m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+
+ pu1_pred += 8;
+ pu1_dst += 8;
+ }
+ pu1_pred += 4 * pred_strd - trans_size;
+ pu1_dst += 4 * dst_strd - trans_size;
+ }
+ }
+
+
+}
+
+void ihevcd_itrans_recon_dc_chroma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
+ WORD32 log2_trans_size, WORD16 i2_coeff_value)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_8;
+ __m128i m_temp_reg_9;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_20, zero_8x16b;
+ __m128i *pi4_dst = (__m128i *)pu1_dst;
+
+ WORD32 add, shift;
+ WORD32 dc_value, quant_out;
+ WORD32 trans_size;
+
+ WORD32 shuffle_mask_4x4 = 0x06040200;
+ WORD32 unchanged_mask_4x4 = 0x07050301;
+ LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
+ LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
+
+ trans_size = (1 << log2_trans_size);
+
+ quant_out = i2_coeff_value;
+
+ shift = IT_SHIFT_STAGE_1;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
+ shift = IT_SHIFT_STAGE_2;
+ add = 1 << (shift - 1);
+ dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
+
+ /* Replicate the 16-bit DC value across the 128-bit register */
+ m_temp_reg_20 = _mm_set1_epi16(dc_value);
+ zero_8x16b = _mm_setzero_si128();
+
+ if(trans_size == 4)
+ {
+ __m128i chroma_shuffle_mask_16x8b;
+ __m128i chroma_unchanged_mask_16x8b;
+ chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
+ chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
+
+ /*Load the prediction data*/
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ m_temp_reg_10 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+ m_temp_reg_11 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+ m_temp_reg_12 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+ m_temp_reg_13 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+ m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
+
+ m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+
+ /* Load the existing recon data so the interleaved 'V' samples are preserved while 'U' is reconstructed, and vice versa */
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
+
+ m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+ m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+ m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+ m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+
+ m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
+ m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
+ m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
+
+ /*Store the result in the destination*/
+ _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+ pu1_dst += dst_strd;
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+ }
+ else
+ {
+ WORD32 i, j;
+ __m128i chroma_shuffle_mask_16x8b;
+ __m128i chroma_unchanged_mask_16x8b;
+ chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
+ chroma_unchanged_mask_16x8b =
+ _mm_loadl_epi64((__m128i *)(&unchanged_mask));
+
+ for(i = 0; i < trans_size; i += 4)
+ {
+ for(j = 0; j < trans_size; j += 8)
+ {
+
+ m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
+ m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
+ m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
+ m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
+
+ /*Retain only one chroma component*/
+ m_temp_reg_4 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
+ m_temp_reg_5 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
+ m_temp_reg_6 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
+ m_temp_reg_7 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
+
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
+
+ m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
+ m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
+ m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
+ m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
+
+
+ /* Load the existing recon data so the interleaved 'V' samples are preserved while 'U' is reconstructed, and vice versa */
+ m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
+ m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
+ m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
+ m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
+
+ m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
+ m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
+ m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
+ m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
+
+ m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
+ m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
+ m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
+ m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
+
+ m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
+ m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
+ m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
+
+ /*Store the result in the destination*/
+ pi4_dst = (__m128i *)(pu1_dst);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_12);
+ m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_8);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_13);
+ m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_9);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_14);
+ m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_10);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
+
+ _mm_storel_epi64(pi4_dst, m_temp_reg_15);
+ m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
+
+ pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
+ _mm_storel_epi64(pi4_dst, m_temp_reg_11);
+
+ pu1_pred += 16;
+ pu1_dst += 16;
+ }
+
+ pu1_pred += 4 * pred_strd - 2 * trans_size;
+ pu1_dst += 4 * dst_strd - 2 * trans_size;
+ }
+ }
+
+
+}
diff --git a/test/Android.mk b/test/Android.mk
new file mode 100644
index 0000000..7807003
--- /dev/null
+++ b/test/Android.mk
@@ -0,0 +1,5 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+# decoder
+include $(LOCAL_PATH)/decoder.mk
diff --git a/test/decoder.mk b/test/decoder.mk
new file mode 100644
index 0000000..ef560b3
--- /dev/null
+++ b/test/decoder.mk
@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := hevcdec
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_CFLAGS := -DPROFILE_ENABLE -DARM -fPIC
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/../decoder $(LOCAL_PATH)/../common $(LOCAL_PATH)/
+LOCAL_SRC_FILES := decoder/main.c
+LOCAL_STATIC_LIBRARIES := libhevcdec
+
+include $(BUILD_EXECUTABLE)
diff --git a/test/decoder/main.c b/test/decoder/main.c
new file mode 100644
index 0000000..a4bf626
--- /dev/null
+++ b/test/decoder/main.c
@@ -0,0 +1,3169 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+/*****************************************************************************/
+/* */
+/* File Name : main.c */
+/* */
+/* Description : Contains an application that demonstrates use of HEVC*/
+/* decoder API */
+/* */
+/* List of Functions : */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 Harish Initial Version */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#ifdef X86_MINGW
+#include <signal.h>
+#endif
+
+#ifndef IOS
+#include <malloc.h>
+#endif
+#ifdef IOS_DISPLAY
+#include "cast_types.h"
+#else
+#include "ihevc_typedefs.h"
+#endif
+
+#include "iv.h"
+#include "ivd.h"
+#include "ihevcd_cxa.h"
+#include "ithread.h"
+
+
+#define MD5_DISABLE
+#ifdef X86_MSVC
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+#define ALIGN8(x) ((((x) + 7) >> 3) << 3)
+#define NUM_DISPLAY_BUFFERS 4
+#define DEFAULT_FPS 30
+
+
+#define ENABLE_DEGRADE 0
+#define MAX_DISP_BUFFERS 64
+#define EXTRA_DISP_BUFFERS 0
+#define STRLENGTH 1000
+
+//#define TEST_FLUSH
+#define FLUSH_FRM_CNT 100
+
+
+#ifdef IOS
+#define PATHLENMAX 500
+char filename_with_path[PATHLENMAX];
+#endif
+
+#ifdef PROFILE_ENABLE
+#ifdef X86_MSVC
+typedef LARGE_INTEGER TIMER;
+#else
+//#ifdef X86_MINGW
+typedef struct timeval TIMER;
+//#endif
+#endif
+#else
+typedef WORD32 TIMER;
+#endif
+
+#ifdef PROFILE_ENABLE
+#ifdef X86_MSVC
+#define GETTIME(timer) QueryPerformanceCounter(timer);
+#else
+//#ifdef X86_MINGW
+#define GETTIME(timer) gettimeofday(timer,NULL);
+//#endif
+#endif
+
+#ifdef X86_MSVC
+#define ELAPSEDTIME(s_start_timer,s_end_timer, s_elapsed_time, frequency) \
+ { \
+ TIMER s_temp_time; \
+ s_temp_time.LowPart = s_end_timer.LowPart - s_start_timer.LowPart ; \
+ s_elapsed_time = (UWORD32) ( ((DOUBLE)s_temp_time.LowPart / (DOUBLE)frequency.LowPart ) * 1000000); \
+ }
+#else
+//#ifdef X86_MINGW
+#define ELAPSEDTIME(s_start_timer,s_end_timer, s_elapsed_time, frequency) \
+ s_elapsed_time = ((s_end_timer.tv_sec - s_start_timer.tv_sec) * 1000000) + (s_end_timer.tv_usec - s_start_timer.tv_usec);
+//#endif
+#endif
+
+#else
+#define GETTIME(timer)
+#define ELAPSEDTIME(s_start_timer,s_end_timer, s_elapsed_time, frequency)
+#endif
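+
+/* Illustrative usage of the profiling macros above (assuming PROFILE_ENABLE
+ * is defined; 'frequency' is only consumed on X86_MSVC, where it must first
+ * be filled via QueryPerformanceFrequency):
+ *
+ *     TIMER s_start, s_end;
+ *     UWORD32 u4_elapsed_us;
+ *     GETTIME(&s_start);
+ *     ... section being profiled ...
+ *     GETTIME(&s_end);
+ *     ELAPSEDTIME(s_start, s_end, u4_elapsed_us, frequency);
+ *     // u4_elapsed_us now holds the elapsed wall time in microseconds
+ */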
+
+
+/* Function declarations */
+#ifndef MD5_DISABLE
+void calc_md5_cksum(UWORD8 *pu1_inbuf, UWORD32 u4_stride, UWORD32 u4_width, UWORD32 u4_height, UWORD8 *pu1_cksum_p);
+#else
+#define calc_md5_cksum(a, b, c, d, e)
+#endif
+#ifdef SDL_DISPLAY
+void* sdl_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void sdl_alloc_disp_buffers(void *);
+void sdl_display(void *, WORD32);
+void sdl_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void sdl_disp_deinit(void *);
+void sdl_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T sdl_get_color_fmt(void);
+UWORD32 sdl_get_stride(void);
+#endif
+
+#ifdef INTEL_CE5300
+void* gdl_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void gdl_alloc_disp_buffers(void *);
+void gdl_display(void *, WORD32);
+void gdl_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void gdl_disp_deinit(void *);
+void gdl_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T gdl_get_color_fmt(void);
+UWORD32 gdl_get_stride(void);
+#endif
+
+#ifdef FBDEV_DISPLAY
+void* fbd_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void fbd_alloc_disp_buffers(void *);
+void fbd_display(void *, WORD32);
+void fbd_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void fbd_disp_deinit(void *);
+void fbd_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T fbd_get_color_fmt(void);
+UWORD32 fbd_get_stride(void);
+#endif
+
+#ifdef IOS_DISPLAY
+void* ios_disp_init(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+void ios_alloc_disp_buffers(void *);
+void ios_display(void *, WORD32);
+void ios_set_disp_buffers(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+void ios_disp_deinit(void *);
+void ios_disp_usleep(UWORD32);
+IV_COLOR_FORMAT_T ios_get_color_fmt(void);
+UWORD32 ios_get_stride(void);
+#endif
+
+typedef struct
+{
+ UWORD32 u4_piclen_flag;
+ UWORD32 u4_file_save_flag;
+ UWORD32 u4_chksum_save_flag;
+ UWORD32 u4_max_frm_ts;
+ IV_COLOR_FORMAT_T e_output_chroma_format;
+ IVD_ARCH_T e_arch;
+ IVD_SOC_T e_soc;
+ UWORD32 dump_q_rd_idx;
+ UWORD32 dump_q_wr_idx;
+ WORD32 disp_q_wr_idx;
+ WORD32 disp_q_rd_idx;
+
+ void *cocodec_obj;
+ UWORD32 share_disp_buf;
+ UWORD32 num_disp_buf;
+ UWORD32 b_pic_present;
+ WORD32 i4_degrade_type;
+ WORD32 i4_degrade_pics;
+ UWORD32 u4_num_cores;
+#ifdef GPU_BUILD
+ UWORD32 u4_gpu_enable_diable;
+#endif
+ UWORD32 disp_delay;
+ WORD32 trace_enable;
+ CHAR ac_trace_fname[STRLENGTH];
+ CHAR ac_piclen_fname[STRLENGTH];
+ CHAR ac_ip_fname[STRLENGTH];
+ CHAR ac_op_fname[STRLENGTH];
+ CHAR ac_op_chksum_fname[STRLENGTH];
+ ivd_out_bufdesc_t s_disp_buffers[MAX_DISP_BUFFERS];
+ iv_yuv_buf_t s_disp_frm_queue[MAX_DISP_BUFFERS];
+ UWORD32 s_disp_frm_id_queue[MAX_DISP_BUFFERS];
+ UWORD32 loopback;
+ UWORD32 display;
+ UWORD32 full_screen;
+ UWORD32 fps;
+ UWORD32 max_wd;
+ UWORD32 max_ht;
+ UWORD32 max_level;
+
+ UWORD32 u4_strd;
+
+ /* For signalling to display thread */
+ UWORD32 u4_pic_wd;
+ UWORD32 u4_pic_ht;
+
+    /* For IOS display */
+ WORD32 i4_screen_wd;
+ WORD32 i4_screen_ht;
+
+ //UWORD32 u4_output_present;
+ WORD32 quit;
+ WORD32 paused;
+
+
+ void *pv_disp_ctx;
+ void *display_thread_handle;
+ WORD32 display_thread_created;
+ volatile WORD32 display_init_done;
+ volatile WORD32 display_deinit_flag;
+
+ void* (*disp_init)(UWORD32, UWORD32, WORD32, WORD32, WORD32, WORD32, WORD32, WORD32 *, WORD32 *);
+ void (*alloc_disp_buffers)(void *);
+ void (*display_buffer)(void *, WORD32);
+ void (*set_disp_buffers)(void *, WORD32, UWORD8 **, UWORD8 **, UWORD8 **);
+ void (*disp_deinit)(void *);
+ void (*disp_usleep)(UWORD32);
+ IV_COLOR_FORMAT_T(*get_color_fmt)(void);
+ UWORD32(*get_stride)(void);
+}vid_dec_ctx_t;
+
+
+
+typedef enum
+{
+ INVALID,
+ HELP,
+ VERSION,
+ INPUT_FILE,
+ OUTPUT,
+ CHKSUM,
+ SAVE_OUTPUT,
+ SAVE_CHKSUM,
+ CHROMA_FORMAT,
+ NUM_FRAMES,
+ NUM_CORES,
+
+ SHARE_DISPLAY_BUF,
+#ifdef GPU_BUILD
+ ENABLE_GPU,
+#endif
+ LOOPBACK,
+ DISPLAY,
+ FULLSCREEN,
+ FPS,
+ TRACE,
+ MAX_WD,
+ MAX_HT,
+ MAX_LEVEL,
+ CONFIG,
+
+ DEGRADE_TYPE,
+ DEGRADE_PICS,
+ ARCH,
+ SOC,
+ PICLEN,
+ PICLEN_FILE,
+}ARGUMENT_T;
+
+typedef struct
+{
+ CHAR argument_shortname[4];
+ CHAR argument_name[128];
+ ARGUMENT_T argument;
+ CHAR description[512];
+}argument_t;
+
+static const argument_t argument_mapping[] =
+{
+ { "-h", "--help", HELP,
+ "Print this help\n" },
+ { "-c", "--config", CONFIG,
+ "config file (Default: test.cfg)\n" },
+
+ { "-v", "--version", VERSION,
+ "Version information\n" },
+ { "-i", "--input", INPUT_FILE,
+ "Input file\n" },
+ { "-o", "--output", OUTPUT,
+ "Output file\n" },
+ { "--", "--piclen", PICLEN,
+ "Flag to signal if the decoder has to use a file containing number of bytes in each picture to be fed in each call\n" },
+ { "--", "--piclen_file", PICLEN_FILE,
+ "File containing number of bytes in each picture - each line containing one size\n" },
+ { "--", "--chksum", CHKSUM,
+ "Output MD5 Checksum file\n" },
+ { "-s", "--save_output", SAVE_OUTPUT,
+ "Save Output file\n" },
+ { "--", "--save_chksum", SAVE_CHKSUM,
+ "Save Check sum file\n" },
+ { "--", "--chroma_format", CHROMA_FORMAT,
+ "Output Chroma format Supported values YUV_420P, YUV_422ILE, RGB_565, YUV_420SP_UV, YUV_420SP_VU\n" },
+ { "-n", "--num_frames", NUM_FRAMES,
+ "Number of frames to be decoded\n" },
+ { "--", "--num_cores", NUM_CORES,
+ "Number of cores to be used\n" },
+ { "--", "--degrade_type", DEGRADE_TYPE,
+ "Degrade type : 0: No degrade 0th bit set : Disable SAO 1st bit set : Disable deblocking 2nd bit set : Faster inter prediction filters 3rd bit set : Fastest inter prediction filters\n" },
+ { "--", "--degrade_pics", DEGRADE_PICS,
+ "Degrade pics : 0 : No degrade 1 : Only on non-reference frames 2 : Do not degrade every 4th or key frames 3 : All non-key frames 4 : All frames" },
+ { "--", "--share_display_buf", SHARE_DISPLAY_BUF,
+ "Enable shared display buffer mode\n" },
+ { "--", "--loopback", LOOPBACK,
+ "Enable playback in a loop\n" },
+ { "--", "--display", DISPLAY,
+ "Enable display (uses SDL)\n" },
+ { "--", "--fullscreen", FULLSCREEN,
+ "Enable full screen (Only for GDL and SDL)\n" },
+ { "--", "--fps", FPS,
+ "FPS to be used for display \n" },
+#ifdef GPU_BUILD
+ { "--", "--enable_gpu", ENABLE_GPU,
+ "Enable shared display buffer mode\n" },
+#endif
+ { "-i", "--trace", TRACE,
+ "Trace file\n" },
+ { "--", "--max_wd", MAX_WD,
+ "Maximum width (Default: 2560) \n" },
+ { "--", "--max_ht", MAX_HT,
+ "Maximum height (Default: 1600)\n" },
+ { "--", "--max_level", MAX_LEVEL,
+ "Maximum Decoder Level (Default: 50)\n" },
+ { "--", "--arch", ARCH,
+ "Set Architecture. Supported values ARM_NONEON, ARM_A9Q, ARM_A7, ARM_A5, ARM_NEONINTR, X86_GENERIC, X86_SSSE3, X86_SSE4 \n" },
+ { "--", "--soc", SOC,
+ "Set SOC. Supported values GENERIC, HISI_37X \n" },
+};
+
+#define PEAK_WINDOW_SIZE 8
+#define MAX_FRAME_WIDTH 2560
+#define MAX_FRAME_HEIGHT 1600
+#define MAX_LEVEL_SUPPORTED 50
+#define MAX_REF_FRAMES 16
+#define MAX_REORDER_FRAMES 16
+#define DEFAULT_SHARE_DISPLAY_BUF 0
+#define STRIDE 0
+#define DEFAULT_NUM_CORES 1
+
+#define DUMP_SINGLE_BUF 0
+#define IV_ISFATALERROR(x) (((x) >> IVD_FATALERROR) & 0x1)
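+/* IV_ISFATALERROR extracts the fatal bit from a returned u4_error_code;
+   e.g. IV_ISFATALERROR(s_video_decode_op.u4_error_code) being non-zero
+   typically means decoding cannot continue without re-initialization */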
+
+#define ivd_cxa_api_function ihevcd_cxa_api_function
+
+#ifdef IOS
+char filename_trace[PATHLENMAX];
+#endif
+
+#if ANDROID_NDK
+/*****************************************************************************/
+/* */
+/* Function Name : raise */
+/* */
+/*  Description           : Needed as a workaround when the application is  */
+/*                          built with the Android NDK. This handler is     */
+/*                          invoked on a divide by zero error               */
+/* */
+/* Inputs : a */
+/* Globals : */
+/* Processing : None */
+/* */
+/* Outputs : */
+/* Returns : */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+int raise(int a)
+{
+ printf("Divide by zero\n");
+ return 0;
+}
+#endif
+
+#ifdef _WIN32
+/*****************************************************************************/
+/* */
+/*  Function Name         : ihevca_aligned_malloc / ihevca_aligned_free     */
+/* */
+/*  Description           : Returns aligned memory using _aligned_malloc    */
+/* */
+/* Inputs : alignment */
+/* size */
+/* Globals : */
+/* Processing : */
+/* */
+/* Outputs : */
+/* Returns : */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+void* ihevca_aligned_malloc(WORD32 alignment, WORD32 size)
+{
+ return (void *)_aligned_malloc(size, alignment);
+}
+
+void ihevca_aligned_free(void *pv_buf)
+{
+ _aligned_free(pv_buf);
+ return;
+}
+#endif
+
+#if IOS
+void* ihevca_aligned_malloc(WORD32 alignment, WORD32 size)
+{
+    /* malloc on iOS returns 16-byte aligned memory, so the alignment
+       argument is not used here */
+    return malloc(size);
+}
+
+void ihevca_aligned_free(void *pv_buf)
+{
+ free(pv_buf);
+ return;
+}
+#endif
+
+#if (!defined(IOS)) && (!defined(_WIN32))
+void* ihevca_aligned_malloc(WORD32 alignment, WORD32 size)
+{
+ return memalign(alignment, size);
+}
+
+void ihevca_aligned_free(void *pv_buf)
+{
+ free(pv_buf);
+ return;
+}
+#endif
+/*****************************************************************************/
+/* */
+/* Function Name : set_degrade */
+/* */
+/* Description : Control call to set degrade level */
+/* */
+/* */
+/* Inputs : codec_obj - Codec Handle */
+/*                          type - degrade level bitmask                    */
+/*                          0 : No degrade                                  */
+/*                          Bit 0 : Disable SAO                             */
+/*                          Bit 1 : Disable Deblock                         */
+/*                          Bit 2 : Faster MC for non-ref                   */
+/*                          Bit 3 : Fastest MC for non-ref                  */
+/*                          pics - Pictures that are degraded               */
+/* 0 : No degrade */
+/* 1 : Non-ref pictures */
+/* 2 : Pictures at given interval are not degraded */
+/* 3 : All non-key pictures */
+/* 4 : All pictures */
+/* Globals : */
+/* Processing : Calls degrade control to the codec */
+/* */
+/* Outputs : */
+/* Returns : Control call return status */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T set_degrade(void *codec_obj, UWORD32 type, WORD32 pics)
+{
+ ihevcd_cxa_ctl_degrade_ip_t s_ctl_ip;
+ ihevcd_cxa_ctl_degrade_op_t s_ctl_op;
+ void *pv_api_ip, *pv_api_op;
+ IV_API_CALL_STATUS_T e_dec_status;
+
+ s_ctl_ip.u4_size = sizeof(ihevcd_cxa_ctl_degrade_ip_t);
+ s_ctl_ip.i4_degrade_type = type;
+ s_ctl_ip.i4_nondegrade_interval = 4;
+ s_ctl_ip.i4_degrade_pics = pics;
+
+ s_ctl_op.u4_size = sizeof(ihevcd_cxa_ctl_degrade_op_t);
+
+ pv_api_ip = (void *)&s_ctl_ip;
+ pv_api_op = (void *)&s_ctl_op;
+
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_DEGRADE;
+
+ e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, pv_api_ip, pv_api_op);
+
+ if(IV_SUCCESS != e_dec_status)
+ {
+ printf("Error in setting degrade level \n");
+ }
+ return (e_dec_status);
+
+}
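+
+/* Illustrative call, using the bit definitions documented above: disable
+ * SAO (bit 0) and deblocking (bit 1) on non-reference pictures only:
+ *
+ *     set_degrade(codec_obj, 0x3, 1);
+ */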
+
+/*****************************************************************************/
+/* */
+/* Function Name : enable_skipb_frames */
+/* */
+/* Description : Control call to enable skipping of b frames */
+/* */
+/* */
+/* Inputs : codec_obj : Codec handle */
+/* Globals : */
+/* Processing : Calls enable skip B frames control */
+/* */
+/* Outputs : */
+/* Returns : Control call return status */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T enable_skipb_frames(void *codec_obj,
+ vid_dec_ctx_t *ps_app_ctx)
+{
+ ivd_ctl_set_config_ip_t s_ctl_ip;
+ ivd_ctl_set_config_op_t s_ctl_op;
+ IV_API_CALL_STATUS_T e_dec_status;
+
+ s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+ s_ctl_ip.e_frm_skip_mode = IVD_SKIP_B;
+
+ s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+ e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+
+ if(IV_SUCCESS != e_dec_status)
+ {
+ printf("Error in Enable SkipB frames \n");
+ }
+
+ return e_dec_status;
+}
+/*****************************************************************************/
+/* */
+/* Function Name : disable_skipb_frames */
+/* */
+/* Description : Control call to disable skipping of b frames */
+/* */
+/* */
+/* Inputs : codec_obj : Codec handle */
+/* Globals : */
+/* Processing : Calls disable B frame skip control */
+/* */
+/* Outputs : */
+/* Returns : Control call return status */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T disable_skipb_frames(void *codec_obj,
+ vid_dec_ctx_t *ps_app_ctx)
+{
+ ivd_ctl_set_config_ip_t s_ctl_ip;
+ ivd_ctl_set_config_op_t s_ctl_op;
+ IV_API_CALL_STATUS_T e_dec_status;
+
+ s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+ s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+
+ s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+ e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+
+ if(IV_SUCCESS != e_dec_status)
+ {
+ printf("Error in Disable SkipB frames\n");
+ }
+
+ return e_dec_status;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : enable_skippb_frames */
+/* */
+/* Description : Control call to enable skipping of P & B frames */
+/* */
+/* */
+/* Inputs : codec_obj : Codec handle */
+/* Globals : */
+/* Processing : Calls enable skip P and B frames control */
+/* */
+/* Outputs : */
+/* Returns : Control call return status */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T enable_skippb_frames(void *codec_obj,
+ vid_dec_ctx_t *ps_app_ctx)
+{
+ ivd_ctl_set_config_ip_t s_ctl_ip;
+ ivd_ctl_set_config_op_t s_ctl_op;
+ IV_API_CALL_STATUS_T e_dec_status;
+
+ s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+ s_ctl_ip.e_frm_skip_mode = IVD_SKIP_PB;
+
+ s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+ e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+ if(IV_SUCCESS != e_dec_status)
+ {
+ printf("Error in Enable SkipPB frames\n");
+ }
+
+ return e_dec_status;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : disable_skippb_frames */
+/* */
+/* Description : Control call to disable skipping of P and B frames */
+/* */
+/* */
+/* Inputs : codec_obj : Codec handle */
+/* Globals : */
+/* Processing : Calls disable P and B frame skip control */
+/* */
+/* Outputs : */
+/* Returns : Control call return status */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T disable_skippb_frames(void *codec_obj,
+ vid_dec_ctx_t *ps_app_ctx)
+{
+ ivd_ctl_set_config_ip_t s_ctl_ip;
+ ivd_ctl_set_config_op_t s_ctl_op;
+ IV_API_CALL_STATUS_T e_dec_status;
+
+ s_ctl_ip.u4_disp_wd = ps_app_ctx->u4_strd;
+ s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+
+ s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+ e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+ if(IV_SUCCESS != e_dec_status)
+ {
+ printf("Error in Disable SkipPB frames\n");
+ }
+
+ return e_dec_status;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : release_disp_frame */
+/* */
+/* Description : Calls release display control - Used to signal to the */
+/* decoder that this particular buffer has been displayed */
+/* and that the codec is now free to write to this buffer */
+/* */
+/* */
+/* Inputs : codec_obj : Codec Handle */
+/* buf_id : Buffer Id of the buffer to be released */
+/* This id would have been returned earlier by */
+/* the codec */
+/* Globals : */
+/* Processing : Calls Release Display call */
+/* */
+/* Outputs : */
+/* Returns : Status of release display call */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T release_disp_frame(void *codec_obj, UWORD32 buf_id)
+{
+ ivd_rel_display_frame_ip_t s_video_rel_disp_ip;
+ ivd_rel_display_frame_op_t s_video_rel_disp_op;
+ IV_API_CALL_STATUS_T e_dec_status;
+
+ s_video_rel_disp_ip.e_cmd = IVD_CMD_REL_DISPLAY_FRAME;
+ s_video_rel_disp_ip.u4_size = sizeof(ivd_rel_display_frame_ip_t);
+ s_video_rel_disp_op.u4_size = sizeof(ivd_rel_display_frame_op_t);
+ s_video_rel_disp_ip.u4_disp_buf_id = buf_id;
+
+ e_dec_status = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_rel_disp_ip,
+ (void *)&s_video_rel_disp_op);
+ if(IV_SUCCESS != e_dec_status)
+ {
+ printf("Error in Release Disp frame\n");
+ }
+
+
+ return (e_dec_status);
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : get_version */
+/* */
+/* Description : Control call to get codec version */
+/* */
+/* */
+/* Inputs : codec_obj : Codec handle */
+/* Globals : */
+/*  Processing            : Calls the get version control command           */
+/* */
+/* Outputs : */
+/* Returns : Control call return status */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+IV_API_CALL_STATUS_T get_version(void *codec_obj)
+{
+ ivd_ctl_getversioninfo_ip_t s_ctl_dec_ip;
+ ivd_ctl_getversioninfo_op_t s_ctl_dec_op;
+ UWORD8 au1_buf[512];
+ IV_API_CALL_STATUS_T status;
+ s_ctl_dec_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_dec_ip.e_sub_cmd = IVD_CMD_CTL_GETVERSION;
+ s_ctl_dec_ip.u4_size = sizeof(ivd_ctl_getversioninfo_ip_t);
+ s_ctl_dec_op.u4_size = sizeof(ivd_ctl_getversioninfo_op_t);
+ s_ctl_dec_ip.pv_version_buffer = au1_buf;
+ s_ctl_dec_ip.u4_version_buffer_size = sizeof(au1_buf);
+
+ status = ivd_cxa_api_function((iv_obj_t *)codec_obj,
+ (void *)&(s_ctl_dec_ip),
+ (void *)&(s_ctl_dec_op));
+
+ if(status != IV_SUCCESS)
+ {
+ printf("Error in Getting Version number e_dec_status = %d u4_error_code = %x\n",
+ status, s_ctl_dec_op.u4_error_code);
+ }
+ else
+ {
+ printf("Ittiam Decoder Version number: %s\n",
+ (char *)s_ctl_dec_ip.pv_version_buffer);
+ }
+ return status;
+}
+/*****************************************************************************/
+/* */
+/* Function Name : codec_exit */
+/* */
+/* Description : handles unrecoverable errors */
+/* Inputs : Error message */
+/* Globals : None */
+/* Processing : Prints error message to console and exits. */
+/*  Outputs               : Error message to the console                    */
+/* Returns : None */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 07 06 2006 Sankar Creation */
+/* */
+/*****************************************************************************/
+void codec_exit(CHAR *pc_err_message)
+{
+ printf("%s\n", pc_err_message);
+ exit(-1);
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : dump_output */
+/* */
+/* Description : Used to dump output YUV */
+/* Inputs : App context, disp output desc, File pointer */
+/* Globals : None */
+/* Processing : Dumps to a file */
+/* Returns : None */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 07 06 2006 Sankar Creation */
+/* */
+/*****************************************************************************/
+void dump_output(vid_dec_ctx_t *ps_app_ctx,
+ iv_yuv_buf_t *ps_disp_frm_buf,
+ UWORD32 u4_disp_frm_id,
+ FILE *ps_op_file,
+ FILE *ps_op_chksum_file,
+ WORD32 i4_op_frm_ts,
+ UWORD32 file_save,
+ UWORD32 chksum_save)
+
+{
+
+ UWORD32 i;
+ iv_yuv_buf_t s_dump_disp_frm_buf;
+ UWORD32 u4_disp_id;
+
+ memset(&s_dump_disp_frm_buf, 0, sizeof(iv_yuv_buf_t));
+
+ if(ps_app_ctx->share_disp_buf)
+ {
+ if(ps_app_ctx->dump_q_wr_idx == MAX_DISP_BUFFERS
+ )
+ ps_app_ctx->dump_q_wr_idx = 0;
+
+ if(ps_app_ctx->dump_q_rd_idx == MAX_DISP_BUFFERS
+ )
+ ps_app_ctx->dump_q_rd_idx = 0;
+
+ ps_app_ctx->s_disp_frm_queue[ps_app_ctx->dump_q_wr_idx] =
+ *ps_disp_frm_buf;
+ ps_app_ctx->s_disp_frm_id_queue[ps_app_ctx->dump_q_wr_idx] =
+ u4_disp_frm_id;
+ ps_app_ctx->dump_q_wr_idx++;
+
+ if((WORD32)i4_op_frm_ts >= (WORD32)(ps_app_ctx->disp_delay - 1))
+ {
+ s_dump_disp_frm_buf =
+ ps_app_ctx->s_disp_frm_queue[ps_app_ctx->dump_q_rd_idx];
+ u4_disp_id =
+ ps_app_ctx->s_disp_frm_id_queue[ps_app_ctx->dump_q_rd_idx];
+ ps_app_ctx->dump_q_rd_idx++;
+ }
+ else
+ {
+ return;
+ }
+ }
+ else
+ {
+ s_dump_disp_frm_buf = *ps_disp_frm_buf;
+ u4_disp_id = u4_disp_frm_id;
+ }
+
+ release_disp_frame(ps_app_ctx->cocodec_obj, u4_disp_id);
+
+ if(0 == file_save && 0 == chksum_save)
+ return;
+
+ if(NULL == s_dump_disp_frm_buf.pv_y_buf)
+ return;
+
+ if(ps_app_ctx->e_output_chroma_format == IV_YUV_420P)
+ {
+#if DUMP_SINGLE_BUF
+ {
+            UWORD8 *buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf - 24 - (s_dump_disp_frm_buf.u4_y_strd * 40);
+
+ UWORD32 size = s_dump_disp_frm_buf.u4_y_strd * ((s_dump_disp_frm_buf.u4_y_ht + 80) + (s_dump_disp_frm_buf.u4_u_ht + 40));
+ fwrite(buf, 1, size, ps_op_file);
+
+ }
+#else
+ if(0 != file_save)
+ {
+ UWORD8 *buf;
+
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_wd, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_y_strd;
+ }
+
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_u_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_u_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_u_wd, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_u_strd;
+ }
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_v_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_v_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_v_wd, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_v_strd;
+ }
+
+ }
+
+ if(0 != chksum_save)
+ {
+ UWORD8 au1_y_chksum[16];
+ UWORD8 au1_u_chksum[16];
+ UWORD8 au1_v_chksum[16];
+ calc_md5_cksum((UWORD8 *)s_dump_disp_frm_buf.pv_y_buf,
+ s_dump_disp_frm_buf.u4_y_strd,
+ s_dump_disp_frm_buf.u4_y_wd,
+ s_dump_disp_frm_buf.u4_y_ht,
+ au1_y_chksum);
+ calc_md5_cksum((UWORD8 *)s_dump_disp_frm_buf.pv_u_buf,
+ s_dump_disp_frm_buf.u4_u_strd,
+ s_dump_disp_frm_buf.u4_u_wd,
+ s_dump_disp_frm_buf.u4_u_ht,
+ au1_u_chksum);
+ calc_md5_cksum((UWORD8 *)s_dump_disp_frm_buf.pv_v_buf,
+ s_dump_disp_frm_buf.u4_v_strd,
+ s_dump_disp_frm_buf.u4_v_wd,
+ s_dump_disp_frm_buf.u4_v_ht,
+ au1_v_chksum);
+
+ fwrite(au1_y_chksum, sizeof(UWORD8), 16, ps_op_chksum_file);
+ fwrite(au1_u_chksum, sizeof(UWORD8), 16, ps_op_chksum_file);
+ fwrite(au1_v_chksum, sizeof(UWORD8), 16, ps_op_chksum_file);
+ }
+#endif
+ }
+ else if((ps_app_ctx->e_output_chroma_format == IV_YUV_420SP_UV)
+ || (ps_app_ctx->e_output_chroma_format == IV_YUV_420SP_VU))
+ {
+#if DUMP_SINGLE_BUF
+ {
+
+            UWORD8 *buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf - 24 - (s_dump_disp_frm_buf.u4_y_strd * 40);
+
+ UWORD32 size = s_dump_disp_frm_buf.u4_y_strd * ((s_dump_disp_frm_buf.u4_y_ht + 80) + (s_dump_disp_frm_buf.u4_u_ht + 40));
+ fwrite(buf, 1, size, ps_op_file);
+ }
+#else
+ {
+ UWORD8 *buf;
+
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_wd, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_y_strd;
+ }
+
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_u_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_u_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_u_wd, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_u_strd;
+ }
+ }
+#endif
+ }
+ else if(ps_app_ctx->e_output_chroma_format == IV_RGBA_8888)
+ {
+ UWORD8 *buf;
+
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_wd * 4, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_y_strd * 4;
+ }
+ }
+ else
+ {
+ UWORD8 *buf;
+
+ buf = (UWORD8 *)s_dump_disp_frm_buf.pv_y_buf;
+ for(i = 0; i < s_dump_disp_frm_buf.u4_y_ht; i++)
+ {
+ fwrite(buf, 1, s_dump_disp_frm_buf.u4_y_strd * 2, ps_op_file);
+ buf += s_dump_disp_frm_buf.u4_y_strd * 2;
+ }
+ }
+
+ fflush(ps_op_file);
+ fflush(ps_op_chksum_file);
+
+}
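+
+/* Note on dump_output: planes are written row by row because the decoder's
+ * stride (e.g. u4_y_strd) may exceed the displayed width (u4_y_wd); writing
+ * only u4_y_wd bytes per row strips the padding between rows. */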
+
+
+/*****************************************************************************/
+/* */
+/* Function Name : print_usage */
+/* */
+/* Description : Prints argument format */
+/* */
+/* */
+/* Inputs : */
+/* Globals : */
+/* Processing : Prints argument format */
+/* */
+/* Outputs : */
+/* Returns : */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+void print_usage(void)
+{
+ WORD32 i = 0;
+ WORD32 num_entries = sizeof(argument_mapping) / sizeof(argument_t);
+ printf("\nUsage:\n");
+ while(i < num_entries)
+ {
+ printf("%-32s\t %s", argument_mapping[i].argument_name,
+ argument_mapping[i].description);
+ i++;
+ }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : get_argument */
+/* */
+/* Description : Gets argument for a given string */
+/* */
+/* */
+/* Inputs : name */
+/* Globals : */
+/* Processing : Searches the given string in the array and returns */
+/* appropriate argument ID */
+/* */
+/* Outputs : Argument ID */
+/* Returns : Argument ID */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+ARGUMENT_T get_argument(CHAR *name)
+{
+ WORD32 i = 0;
+ WORD32 num_entries = sizeof(argument_mapping) / sizeof(argument_t);
+ while(i < num_entries)
+ {
+ if((0 == strcmp(argument_mapping[i].argument_name, name)) ||
+ ((0 == strcmp(argument_mapping[i].argument_shortname, name)) &&
+ (0 != strcmp(argument_mapping[i].argument_shortname, "--"))))
+ {
+ return argument_mapping[i].argument;
+ }
+ i++;
+ }
+ return INVALID;
+}
+
+/*****************************************************************************/
+/* */
+/*  Function Name : parse_argument                                          */
+/*                                                                           */
+/*  Description   : Parses one (argument, value) pair from the command      */
+/*                  line or the configuration file                          */
+/*                                                                           */
+/*  Inputs        : ps_app_ctx, argument, value                             */
+/*  Globals       :                                                         */
+/*  Processing    : Maps the argument string to an ID and stores the        */
+/*                  corresponding value in the application context          */
+/*                                                                           */
+/*  Outputs       : Updated application context                             */
+/*  Returns       : None                                                    */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+void parse_argument(vid_dec_ctx_t *ps_app_ctx, CHAR *argument, CHAR *value)
+{
+ ARGUMENT_T arg;
+
+ arg = get_argument(argument);
+ switch(arg)
+ {
+ case HELP:
+ print_usage();
+ exit(-1);
+ case VERSION:
+ break;
+ case INPUT_FILE:
+ sscanf(value, "%s", ps_app_ctx->ac_ip_fname);
+ //input_passed = 1;
+ break;
+
+ case OUTPUT:
+ sscanf(value, "%s", ps_app_ctx->ac_op_fname);
+ break;
+
+ case CHKSUM:
+ sscanf(value, "%s", ps_app_ctx->ac_op_chksum_fname);
+ break;
+
+ case SAVE_OUTPUT:
+ sscanf(value, "%d", &ps_app_ctx->u4_file_save_flag);
+ break;
+
+ case SAVE_CHKSUM:
+ sscanf(value, "%d", &ps_app_ctx->u4_chksum_save_flag);
+ break;
+
+ case CHROMA_FORMAT:
+ if((strcmp(value, "YUV_420P")) == 0)
+ ps_app_ctx->e_output_chroma_format = IV_YUV_420P;
+ else if((strcmp(value, "YUV_422ILE")) == 0)
+ ps_app_ctx->e_output_chroma_format = IV_YUV_422ILE;
+ else if((strcmp(value, "RGB_565")) == 0)
+ ps_app_ctx->e_output_chroma_format = IV_RGB_565;
+ else if((strcmp(value, "RGBA_8888")) == 0)
+ ps_app_ctx->e_output_chroma_format = IV_RGBA_8888;
+ else if((strcmp(value, "YUV_420SP_UV")) == 0)
+ ps_app_ctx->e_output_chroma_format = IV_YUV_420SP_UV;
+ else if((strcmp(value, "YUV_420SP_VU")) == 0)
+ ps_app_ctx->e_output_chroma_format = IV_YUV_420SP_VU;
+ else
+ {
+ printf("\nInvalid colour format setting it to IV_YUV_420P\n");
+ ps_app_ctx->e_output_chroma_format = IV_YUV_420P;
+ }
+
+ break;
+ case NUM_FRAMES:
+ sscanf(value, "%d", &ps_app_ctx->u4_max_frm_ts);
+ break;
+
+ case NUM_CORES:
+ sscanf(value, "%d", &ps_app_ctx->u4_num_cores);
+ break;
+ case DEGRADE_PICS:
+ sscanf(value, "%d", &ps_app_ctx->i4_degrade_pics);
+ break;
+ case DEGRADE_TYPE:
+ sscanf(value, "%d", &ps_app_ctx->i4_degrade_type);
+ break;
+ case SHARE_DISPLAY_BUF:
+ sscanf(value, "%d", &ps_app_ctx->share_disp_buf);
+ break;
+ case LOOPBACK:
+ sscanf(value, "%d", &ps_app_ctx->loopback);
+ break;
+ case DISPLAY:
+#if defined(SDL_DISPLAY) || defined(FBDEV_DISPLAY) || defined(INTEL_CE5300) || defined(IOS_DISPLAY)
+ sscanf(value, "%d", &ps_app_ctx->display);
+#else
+ ps_app_ctx->display = 0;
+#endif
+ break;
+ case FULLSCREEN:
+ sscanf(value, "%d", &ps_app_ctx->full_screen);
+ break;
+ case FPS:
+ sscanf(value, "%d", &ps_app_ctx->fps);
+ if(ps_app_ctx->fps <= 0)
+ ps_app_ctx->fps = DEFAULT_FPS;
+ break;
+ case MAX_WD:
+ sscanf(value, "%d", &ps_app_ctx->max_wd);
+ break;
+ case MAX_HT:
+ sscanf(value, "%d", &ps_app_ctx->max_ht);
+ break;
+ case MAX_LEVEL:
+ sscanf(value, "%d", &ps_app_ctx->max_level);
+ break;
+ case ARCH:
+ if((strcmp(value, "ARM_NONEON")) == 0)
+ ps_app_ctx->e_arch = ARCH_ARM_NONEON;
+ else if((strcmp(value, "ARM_A9Q")) == 0)
+ ps_app_ctx->e_arch = ARCH_ARM_A9Q;
+ else if((strcmp(value, "ARM_A7")) == 0)
+ ps_app_ctx->e_arch = ARCH_ARM_A7;
+ else if((strcmp(value, "ARM_A5")) == 0)
+ ps_app_ctx->e_arch = ARCH_ARM_A5;
+ else if((strcmp(value, "ARM_NEONINTR")) == 0)
+ ps_app_ctx->e_arch = ARCH_ARM_NEONINTR;
+ else if((strcmp(value, "X86_GENERIC")) == 0)
+ ps_app_ctx->e_arch = ARCH_X86_GENERIC;
+ else if((strcmp(value, "X86_SSSE3")) == 0)
+ ps_app_ctx->e_arch = ARCH_X86_SSSE3;
+ else if((strcmp(value, "X86_SSE42")) == 0)
+ ps_app_ctx->e_arch = ARCH_X86_SSE42;
+ else if((strcmp(value, "X86_AVX2")) == 0)
+ ps_app_ctx->e_arch = ARCH_X86_AVX2;
+ else if((strcmp(value, "MIPS_GENERIC")) == 0)
+ ps_app_ctx->e_arch = ARCH_MIPS_GENERIC;
+ else if((strcmp(value, "MIPS_32")) == 0)
+ ps_app_ctx->e_arch = ARCH_MIPS_32;
+ else
+ {
+ printf("\nInvalid Arch. Setting it to ARM_A9Q\n");
+ ps_app_ctx->e_arch = ARCH_ARM_A9Q;
+ }
+
+ break;
+ case SOC:
+ if((strcmp(value, "GENERIC")) == 0)
+ ps_app_ctx->e_soc = SOC_GENERIC;
+ else if((strcmp(value, "HISI_37X")) == 0)
+ ps_app_ctx->e_soc = SOC_HISI_37X;
+ else
+ {
+ ps_app_ctx->e_soc = atoi(value);
+/*
+ printf("\nInvalid SOC. Setting it to GENERIC\n");
+ ps_app_ctx->e_soc = SOC_GENERIC;
+*/
+ }
+ break;
+ case PICLEN:
+ sscanf(value, "%d", &ps_app_ctx->u4_piclen_flag);
+ break;
+
+ case PICLEN_FILE:
+ sscanf(value, "%s", ps_app_ctx->ac_piclen_fname);
+ break;
+
+#ifdef GPU_BUILD
+ case ENABLE_GPU:
+ sscanf(value, "%d", &ps_app_ctx->u4_gpu_enable_diable);
+ break;
+#endif
+ case INVALID:
+ default:
+ printf("Ignoring argument : %s\n", argument);
+ break;
+ }
+}
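+
+/* Command-line usage mirrors the argument table above; arguments are
+ * consumed in (name, value) pairs, e.g. (illustrative):
+ *
+ *     ./hevcdec --input input.bin --output out.yuv --num_frames 100
+ *
+ * or a configuration file can be given instead: ./hevcdec -c my.cfg */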
+
+/*****************************************************************************/
+/* */
+/* Function Name : read_cfg_file */
+/* */
+/* Description : Reads arguments from a configuration file */
+/* */
+/* */
+/* Inputs : ps_app_ctx : Application context */
+/* fp_cfg_file : Configuration file handle */
+/* Globals : */
+/* Processing : Parses the arguments and fills in the application context*/
+/* */
+/* Outputs : Arguments parsed */
+/* Returns : None */
+/* */
+/* Issues : */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* */
+/*****************************************************************************/
+
+void read_cfg_file(vid_dec_ctx_t *ps_app_ctx, FILE *fp_cfg_file)
+{
+
+ CHAR line[STRLENGTH];
+ CHAR description[STRLENGTH];
+ CHAR value[STRLENGTH];
+ CHAR argument[STRLENGTH];
+ void *ret;
+ while(0 == feof(fp_cfg_file))
+ {
+ line[0] = '\0';
+ ret = fgets(line, STRLENGTH, fp_cfg_file);
+ if(NULL == ret)
+ break;
+ argument[0] = '\0';
+ /* Reading Input File Name */
+ sscanf(line, "%s %s %s", argument, value, description);
+ if(argument[0] == '\0')
+ continue;
+
+ parse_argument(ps_app_ctx, argument, value);
+ }
+
+
+}
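+
+/* Each non-empty line of the configuration file is parsed as
+ * "<argument> <value> [description]". An illustrative test.cfg:
+ *
+ *     --input         input.bin
+ *     --output        out.yuv
+ *     --chroma_format YUV_420P
+ *     --num_frames    100
+ *     --save_output   1
+ */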
+
+/*!
+**************************************************************************
+* \if Function name : dispq_producer_dequeue \endif
+*
+* \brief
+* This function gets a free buffer index where display data can be written
+* This is a blocking call and can be exited by setting quit to true in
+* the application context
+*
+* \param[in] ps_app_ctx : Pointer to application context
+*
+* \return
+* returns Next free buffer index for producer
+*
+* \author
+* Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_producer_dequeue(vid_dec_ctx_t *ps_app_ctx)
+{
+ WORD32 idx;
+
+ /* If there is no free buffer wait */
+
+ while(((ps_app_ctx->disp_q_wr_idx + 1) % NUM_DISPLAY_BUFFERS) == ps_app_ctx->disp_q_rd_idx)
+ {
+
+ ithread_msleep(1);
+
+ if(ps_app_ctx->quit)
+ return (-1);
+ }
+
+ idx = ps_app_ctx->disp_q_wr_idx;
+ return (idx);
+}
+
+/*!
+**************************************************************************
+* \if Function name : dispq_producer_queue \endif
+*
+* \brief
+*  This function queues a filled buffer so that it can be displayed
+*
+* \param[in] ps_app_ctx : Pointer to application context
+*
+* \return
+*  returns 0
+*
+* \author
+* Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_producer_queue(vid_dec_ctx_t *ps_app_ctx)
+{
+ ps_app_ctx->disp_q_wr_idx++;
+ if(ps_app_ctx->disp_q_wr_idx == NUM_DISPLAY_BUFFERS)
+ ps_app_ctx->disp_q_wr_idx = 0;
+
+ return (0);
+}
+/*!
+**************************************************************************
+* \if Function name : dispq_consumer_dequeue \endif
+*
+* \brief
+*  This function gets the index of the next filled buffer to be displayed.
+*  This is a blocking call and can be exited by setting quit to true in
+*  the application context
+*
+* \param[in] ps_app_ctx : Pointer to application context
+*
+* \return
+*  returns Next filled buffer index for the consumer
+*
+* \author
+* Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_consumer_dequeue(vid_dec_ctx_t *ps_app_ctx)
+{
+ WORD32 idx;
+
+ /* If there is no free buffer wait */
+
+ while(ps_app_ctx->disp_q_wr_idx == ps_app_ctx->disp_q_rd_idx)
+ {
+
+ ithread_msleep(1);
+
+ if(ps_app_ctx->quit)
+ return (-1);
+ }
+
+ idx = ps_app_ctx->disp_q_rd_idx;
+ return (idx);
+}
+
+/*!
+**************************************************************************
+* \if Function name : dispq_consumer_queue \endif
+*
+* \brief
+*  This function releases a displayed buffer back to the queue
+*
+* \param[in] ps_app_ctx : Pointer to application context
+*
+* \return
+*  returns 0
+*
+* \author
+* Ittiam
+*
+**************************************************************************
+*/
+WORD32 dispq_consumer_queue(vid_dec_ctx_t *ps_app_ctx)
+{
+ ps_app_ctx->disp_q_rd_idx++;
+ if(ps_app_ctx->disp_q_rd_idx == NUM_DISPLAY_BUFFERS)
+ ps_app_ctx->disp_q_rd_idx = 0;
+
+ return (0);
+}
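+
+/* Together the four dispq_* functions form a single-producer single-consumer
+ * ring over NUM_DISPLAY_BUFFERS slots: the producer blocks while
+ * (wr_idx + 1) % NUM_DISPLAY_BUFFERS == rd_idx (full; one slot is kept empty
+ * to distinguish full from empty) and the consumer blocks while
+ * wr_idx == rd_idx (empty). Both exit early when quit is set. */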
+
+/*****************************************************************************/
+/* */
+/* Function Name : display_thread */
+/* */
+/* Description : Thread to display the frame */
+/* */
+/* */
+/* Inputs : pv_ctx : Application context */
+/* */
+/* Globals : */
+/* Processing : Wait for a buffer to get produced by decoder and display */
+/* that frame */
+/* */
+/* Outputs : */
+/* Returns : None */
+/* */
+/*  Issues                : Pause followed by quit can cause a deadlock     */
+/*                          If the decoder lags initially and then speeds   */
+/*                          up, display will also run at a faster rate till */
+/*                          it reaches equilibrium wrt the initial time     */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 05 2013 100578 Initial Version */
+/* */
+/*****************************************************************************/
+
+WORD32 display_thread(void *pv_ctx)
+{
+ vid_dec_ctx_t *ps_app_ctx = (vid_dec_ctx_t *)pv_ctx;
+
+
+ UWORD32 frm_duration; /* in us */
+ UWORD32 current_time;
+ UWORD32 expected_time;
+ TIMER s_end_timer;
+ TIMER s_first_frame_time;
+ UWORD32 first_frame_displayed;
+
+#ifdef X86_MINGW
+ UWORD32 frequency = 0;
+#endif
+#ifdef X86_MSVC
+ TIMER frequency;
+#endif
+
+#ifdef X86_MSVC
+ QueryPerformanceFrequency(&frequency);
+#endif
+ first_frame_displayed = 0;
+ expected_time = 0;
+ frm_duration = 1000000 / ps_app_ctx->fps;
+
+ /* Init display and allocate display buffers */
+ ps_app_ctx->pv_disp_ctx = (void *)ps_app_ctx->disp_init(ps_app_ctx->u4_pic_wd,
+ ps_app_ctx->u4_pic_ht,
+ ps_app_ctx->i4_screen_wd,
+ ps_app_ctx->i4_screen_ht,
+ ps_app_ctx->max_wd,
+ ps_app_ctx->max_ht,
+ ps_app_ctx->full_screen,
+ &ps_app_ctx->quit,
+ &ps_app_ctx->paused);
+ ps_app_ctx->alloc_disp_buffers(ps_app_ctx->pv_disp_ctx);
+
+ ps_app_ctx->display_init_done = 1;
+
+ while(1)
+ {
+ WORD32 rd_idx;
+
+ rd_idx = dispq_consumer_dequeue(ps_app_ctx);
+ if(ps_app_ctx->quit)
+ break;
+
+ ps_app_ctx->display_buffer(ps_app_ctx->pv_disp_ctx, rd_idx);
+
+ if(0 == first_frame_displayed)
+ {
+ GETTIME(&s_first_frame_time);
+ first_frame_displayed = 1;
+ }
+
+ /*********************************************************************/
+ /* Sleep based on the expected time of arrival of current buffer and */
+ /* the Current frame */
+ /*********************************************************************/
+
+ GETTIME(&s_end_timer);
+ ELAPSEDTIME(s_first_frame_time, s_end_timer, current_time, frequency);
+
+ /* time in micro second */
+ expected_time += frm_duration;
+
+ //printf("current_time %d expected_time %d diff %d \n", current_time, expected_time, (expected_time - current_time));
+ /* sleep for the diff. in time */
+ if(current_time < expected_time)
+ ps_app_ctx->disp_usleep((expected_time - current_time));
+ else
+ expected_time += (current_time - expected_time);
+
+ dispq_consumer_queue(ps_app_ctx);
+
+ }
+
+
+ while(0 == ps_app_ctx->display_deinit_flag)
+ {
+ ps_app_ctx->disp_usleep(1000);
+ }
+ ps_app_ctx->disp_deinit(ps_app_ctx->pv_disp_ctx);
+
+ /* destroy the display thread */
+ ithread_exit(ps_app_ctx->display_thread_handle);
+
+ return 0;
+}
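+
+/* Pacing example: at the default 30 fps, frm_duration = 1000000 / 30
+ * = 33333 us. Each displayed frame advances expected_time by frm_duration;
+ * if the frame arrives early the thread sleeps for the difference, and if
+ * it arrives late expected_time is moved up to the current time so later
+ * frames are not rushed to make up the accumulated lag. */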
+
+void flush_output(iv_obj_t *codec_obj,
+ vid_dec_ctx_t *ps_app_ctx,
+ ivd_out_bufdesc_t *ps_out_buf,
+ UWORD8 *pu1_bs_buf,
+ UWORD32 *pu4_op_frm_ts,
+ FILE *ps_op_file,
+ FILE *ps_op_chksum_file,
+ UWORD32 u4_ip_frm_ts,
+ UWORD32 u4_bytes_remaining)
+{
+ WORD32 ret;
+
+ do
+ {
+
+ ivd_ctl_flush_ip_t s_ctl_ip;
+ ivd_ctl_flush_op_t s_ctl_op;
+
+ if(*pu4_op_frm_ts >= (ps_app_ctx->u4_max_frm_ts + ps_app_ctx->disp_delay))
+ break;
+
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_FLUSH;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_flush_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_flush_op_t);
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+
+ if(ret != IV_SUCCESS)
+ {
+ printf("Error in Setting the decoder in flush mode\n");
+ }
+
+ if(IV_SUCCESS == ret)
+ {
+ ivd_video_decode_ip_t s_video_decode_ip;
+ ivd_video_decode_op_t s_video_decode_op;
+
+ s_video_decode_ip.e_cmd = IVD_CMD_VIDEO_DECODE;
+ s_video_decode_ip.u4_ts = u4_ip_frm_ts;
+ s_video_decode_ip.pv_stream_buffer = pu1_bs_buf;
+ s_video_decode_ip.u4_num_Bytes = u4_bytes_remaining;
+ s_video_decode_ip.u4_size = sizeof(ivd_video_decode_ip_t);
+ s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[0] =
+ ps_out_buf->u4_min_out_buf_size[0];
+ s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[1] =
+ ps_out_buf->u4_min_out_buf_size[1];
+ s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[2] =
+ ps_out_buf->u4_min_out_buf_size[2];
+
+ s_video_decode_ip.s_out_buffer.pu1_bufs[0] =
+ ps_out_buf->pu1_bufs[0];
+ s_video_decode_ip.s_out_buffer.pu1_bufs[1] =
+ ps_out_buf->pu1_bufs[1];
+ s_video_decode_ip.s_out_buffer.pu1_bufs[2] =
+ ps_out_buf->pu1_bufs[2];
+ s_video_decode_ip.s_out_buffer.u4_num_bufs =
+ ps_out_buf->u4_num_bufs;
+
+ s_video_decode_op.u4_size = sizeof(ivd_video_decode_op_t);
+
+ /*****************************************************************************/
+ /* API Call: Video Decode */
+ /*****************************************************************************/
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_decode_ip,
+ (void *)&s_video_decode_op);
+
+ if(1 == s_video_decode_op.u4_output_present)
+ {
+ dump_output(ps_app_ctx, &(s_video_decode_op.s_disp_frm_buf),
+ s_video_decode_op.u4_disp_buf_id, ps_op_file,
+ ps_op_chksum_file,
+ *pu4_op_frm_ts, ps_app_ctx->u4_file_save_flag,
+ ps_app_ctx->u4_chksum_save_flag);
+
+ (*pu4_op_frm_ts)++;
+ }
+ }
+ }while(IV_SUCCESS == ret);
+
+}
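+
+/* flush_output keeps issuing decode calls with the codec in flush mode until
+ * a call fails or the frame-count limit is reached, i.e. until every picture
+ * held for reordering / display delay has been drained, dumping each output
+ * frame as it is returned. */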
+
+#ifdef X86_MINGW
+void sigsegv_handler()
+{
+ printf("Segmentation fault, Exiting.. \n");
+ exit(-1);
+}
+#endif
+
+UWORD32 default_get_stride(void)
+{
+ return 0;
+}
+
+
+IV_COLOR_FORMAT_T default_get_color_fmt(void)
+{
+ return IV_YUV_420P;
+}
+/*****************************************************************************/
+/* */
+/* Function Name : main */
+/* */
+/* Description : Application to demonstrate codec API */
+/* */
+/* */
+/* Inputs : argc - Number of arguments */
+/* argv[] - Arguments */
+/* Globals : */
+/* Processing : Shows how to use create, process, control and delete */
+/* */
+/* Outputs : Codec output in a file */
+/* Returns : */
+/* */
+/*  Issues                : Assumes PROFILE_ENABLE to be defined for the    */
+/*                          multithreaded decode-display path to work       */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 100189 Initial Version */
+/* 09 05 2013 100578 Multithread decode-display */
+/*****************************************************************************/
+#ifdef IOS
+int hevcdec_main(char *homedir, char *documentdir, int screen_wd, int screen_ht)
+#else
+int main(WORD32 argc, CHAR *argv[])
+#endif
+{
+ CHAR ac_cfg_fname[STRLENGTH];
+ FILE *fp_cfg_file = NULL;
+ FILE *ps_piclen_file = NULL;
+ FILE *ps_ip_file = NULL;
+ FILE *ps_op_file = NULL;
+ FILE *ps_op_chksum_file = NULL;
+ WORD32 ret;
+ CHAR ac_error_str[STRLENGTH];
+ vid_dec_ctx_t s_app_ctx;
+ UWORD8 *pu1_bs_buf;
+
+ ivd_out_bufdesc_t *ps_out_buf;
+ UWORD32 u4_num_bytes_dec = 0;
+ UWORD32 file_pos = 0;
+ IV_API_CALL_STATUS_T e_dec_status;
+ UWORD32 u4_ip_frm_ts = 0, u4_op_frm_ts = 0;
+
+ WORD32 u4_bytes_remaining = 0;
+ void *pv_mem_rec_location;
+ UWORD32 u4_num_mem_recs;
+ UWORD32 i;
+ UWORD32 u4_ip_buf_len;
+ UWORD32 frm_cnt = 0;
+ WORD32 total_bytes_comsumed;
+
+#ifdef PROFILE_ENABLE
+ UWORD32 u4_tot_cycles = 0;
+ UWORD32 u4_tot_fmt_cycles = 0;
+ UWORD32 peak_window[PEAK_WINDOW_SIZE];
+ UWORD32 peak_window_idx = 0;
+ UWORD32 peak_avg_max = 0;
+#ifdef INTEL_CE5300
+ UWORD32 time_consumed = 0;
+ UWORD32 bytes_consumed = 0;
+#endif
+#endif
+
+#ifdef X86_MINGW
+ UWORD32 frequency = 0;
+#endif
+#ifdef X86_MSVC
+ TIMER frequency;
+#endif
+ WORD32 width = 0, height = 0;
+ iv_obj_t *codec_obj;
+#if defined(GPU_BUILD) && !defined(X86)
+// int ioctl_init();
+// ioctl_init();
+#endif
+
+#ifdef X86_MINGW
+ //For getting printfs without any delay
+ setvbuf(stdout, NULL, _IONBF, 0);
+ setvbuf(stderr, NULL, _IONBF, 0);
+#endif
+#ifdef IOS
+ sprintf(filename_trace, "%s/iostrace.txt", homedir);
+ printf("\ntrace file name = %s", filename_trace);
+#endif
+
+#ifdef X86_MINGW
+ {
+ signal(SIGSEGV, sigsegv_handler);
+ }
+#endif
+
+
+#ifndef IOS
+ /* Usage */
+ if(argc < 2)
+ {
+ printf("Using test.cfg as configuration file \n");
+ strcpy(ac_cfg_fname, "test.cfg");
+ }
+ else if(argc == 2)
+ {
+ strcpy(ac_cfg_fname, argv[1]);
+ }
+
+#else
+ strcpy(ac_cfg_fname, "test.cfg");
+
+#endif
+
+
+ /***********************************************************************/
+ /* Initialize Application parameters */
+ /***********************************************************************/
+
+ strcpy(s_app_ctx.ac_ip_fname, "\0");
+ s_app_ctx.dump_q_wr_idx = 0;
+ s_app_ctx.dump_q_rd_idx = 0;
+ s_app_ctx.display_thread_created = 0;
+ s_app_ctx.disp_q_wr_idx = 0;
+ s_app_ctx.disp_q_rd_idx = 0;
+ s_app_ctx.disp_delay = 0;
+ s_app_ctx.loopback = 0;
+ s_app_ctx.display = 0;
+ s_app_ctx.full_screen = 0;
+ s_app_ctx.u4_piclen_flag = 0;
+ s_app_ctx.fps = DEFAULT_FPS;
+ file_pos = 0;
+ total_bytes_comsumed = 0;
+ u4_ip_frm_ts = 0;
+ u4_op_frm_ts = 0;
+#ifdef PROFILE_ENABLE
+    memset(peak_window, 0, sizeof(peak_window));
+#endif
+ s_app_ctx.share_disp_buf = DEFAULT_SHARE_DISPLAY_BUF;
+ s_app_ctx.u4_num_cores = DEFAULT_NUM_CORES;
+ s_app_ctx.i4_degrade_type = 0;
+ s_app_ctx.i4_degrade_pics = 0;
+ s_app_ctx.max_wd = 0;
+ s_app_ctx.max_ht = 0;
+ s_app_ctx.max_level = 0;
+ s_app_ctx.e_arch = ARCH_ARM_A9Q;
+ s_app_ctx.e_soc = SOC_GENERIC;
+
+ s_app_ctx.u4_strd = STRIDE;
+
+ s_app_ctx.display_thread_handle = malloc(ithread_get_handle_size());
+ s_app_ctx.quit = 0;
+ s_app_ctx.paused = 0;
+ //s_app_ctx.u4_output_present = 0;
+
+#ifdef GPU_BUILD
+ s_app_ctx.u4_gpu_enable_diable = 0;
+#endif
+ s_app_ctx.get_stride = &default_get_stride;
+
+ s_app_ctx.get_color_fmt = &default_get_color_fmt;
+
+ /* Set function pointers for display */
+#ifdef SDL_DISPLAY
+ s_app_ctx.disp_init = &sdl_disp_init;
+ s_app_ctx.alloc_disp_buffers = &sdl_alloc_disp_buffers;
+ s_app_ctx.display_buffer = &sdl_display;
+ s_app_ctx.set_disp_buffers = &sdl_set_disp_buffers;
+ s_app_ctx.disp_deinit = &sdl_disp_deinit;
+ s_app_ctx.disp_usleep = &sdl_disp_usleep;
+ s_app_ctx.get_color_fmt = &sdl_get_color_fmt;
+ s_app_ctx.get_stride = &sdl_get_stride;
+#endif
+
+#ifdef FBDEV_DISPLAY
+ s_app_ctx.disp_init = &fbd_disp_init;
+ s_app_ctx.alloc_disp_buffers = &fbd_alloc_disp_buffers;
+ s_app_ctx.display_buffer = &fbd_display;
+ s_app_ctx.set_disp_buffers = &fbd_set_disp_buffers;
+ s_app_ctx.disp_deinit = &fbd_disp_deinit;
+ s_app_ctx.disp_usleep = &fbd_disp_usleep;
+ s_app_ctx.get_color_fmt = &fbd_get_color_fmt;
+ s_app_ctx.get_stride = &fbd_get_stride;
+#endif
+
+#ifdef INTEL_CE5300
+ s_app_ctx.disp_init = &gdl_disp_init;
+ s_app_ctx.alloc_disp_buffers = &gdl_alloc_disp_buffers;
+ s_app_ctx.display_buffer = &gdl_display;
+ s_app_ctx.set_disp_buffers = &gdl_set_disp_buffers;
+ s_app_ctx.disp_deinit = &gdl_disp_deinit;
+ s_app_ctx.disp_usleep = &gdl_disp_usleep;
+ s_app_ctx.get_color_fmt = &gdl_get_color_fmt;
+ s_app_ctx.get_stride = &gdl_get_stride;
+#endif
+
+#ifdef IOS_DISPLAY
+ s_app_ctx.disp_init = &ios_disp_init;
+ s_app_ctx.alloc_disp_buffers = &ios_alloc_disp_buffers;
+ s_app_ctx.display_buffer = &ios_display;
+ s_app_ctx.set_disp_buffers = &ios_set_disp_buffers;
+ s_app_ctx.disp_deinit = &ios_disp_deinit;
+ s_app_ctx.disp_usleep = &ios_disp_usleep;
+ s_app_ctx.get_color_fmt = &ios_get_color_fmt;
+ s_app_ctx.get_stride = &ios_get_stride;
+#endif
+
+ s_app_ctx.display_deinit_flag = 0;
+ s_app_ctx.e_output_chroma_format = IV_YUV_420SP_UV;
+ /*************************************************************************/
+ /* Parse arguments */
+ /*************************************************************************/
+
+#ifndef IOS
+ /* Read command line arguments */
+ if(argc > 2)
+ {
+ for(i = 1; i < (UWORD32)argc; i += 2)
+ {
+ if(CONFIG == get_argument(argv[i]))
+ {
+ strcpy(ac_cfg_fname, argv[i + 1]);
+ if((fp_cfg_file = fopen(ac_cfg_fname, "r")) == NULL)
+ {
+ sprintf(ac_error_str, "Could not open Configuration file %s",
+ ac_cfg_fname);
+ codec_exit(ac_error_str);
+ }
+ read_cfg_file(&s_app_ctx, fp_cfg_file);
+ fclose(fp_cfg_file);
+ }
+ else
+ {
+ parse_argument(&s_app_ctx, argv[i], argv[i + 1]);
+ }
+ }
+ }
+ else
+ {
+ if((fp_cfg_file = fopen(ac_cfg_fname, "r")) == NULL)
+ {
+ sprintf(ac_error_str, "Could not open Configuration file %s",
+ ac_cfg_fname);
+ codec_exit(ac_error_str);
+ }
+ read_cfg_file(&s_app_ctx, fp_cfg_file);
+ fclose(fp_cfg_file);
+ }
+#else
+ sprintf(filename_with_path, "%s/%s", homedir, ac_cfg_fname);
+ if((fp_cfg_file = fopen(filename_with_path, "r")) == NULL)
+ {
+ sprintf(ac_error_str, "Could not open Configuration file %s",
+ ac_cfg_fname);
+ codec_exit(ac_error_str);
+
+ }
+ read_cfg_file(&s_app_ctx, fp_cfg_file);
+ fclose(fp_cfg_file);
+
+#endif
+#ifdef PRINT_PICSIZE
+ /* If the binary is used for only getting number of bytes in each picture, then disable the following features */
+ s_app_ctx.u4_piclen_flag = 0;
+ s_app_ctx.u4_file_save_flag = 0;
+ s_app_ctx.u4_chksum_save_flag = 0;
+ s_app_ctx.i4_degrade_pics = 0;
+ s_app_ctx.i4_degrade_type = 0;
+ s_app_ctx.loopback = 0;
+ s_app_ctx.share_disp_buf = 0;
+ s_app_ctx.display = 0;
+#endif
+
+ /* If display is enabled, then turn off shared mode and get color format that is supported by display */
+ if(1 == s_app_ctx.display)
+ {
+ s_app_ctx.share_disp_buf = 0;
+ s_app_ctx.e_output_chroma_format = s_app_ctx.get_color_fmt();
+ }
+ if(strcmp(s_app_ctx.ac_ip_fname, "\0") == 0)
+ {
+ printf("\nNo input file given for decoding\n");
+ exit(-1);
+ }
+
+
+ /***********************************************************************/
+ /* create the file object for input file */
+ /***********************************************************************/
+#ifdef IOS
+ sprintf(filename_with_path, "%s/%s", homedir, s_app_ctx.ac_ip_fname);
+ ps_ip_file = fopen(filename_with_path, "rb");
+#else
+ ps_ip_file = fopen(s_app_ctx.ac_ip_fname, "rb");
+#endif
+ if(NULL == ps_ip_file)
+ {
+ sprintf(ac_error_str, "Could not open input file %s",
+ s_app_ctx.ac_ip_fname);
+ codec_exit(ac_error_str);
+ }
+ /***********************************************************************/
+ /* create the file object for input file */
+ /***********************************************************************/
+ if(1 == s_app_ctx.u4_piclen_flag)
+ {
+#ifdef IOS
+ sprintf(filename_with_path, "%s/%s", homedir, s_app_ctx.ac_piclen_fname);
+ ps_piclen_file = fopen(filename_with_path, "rb");
+#else
+ ps_piclen_file = fopen(s_app_ctx.ac_piclen_fname, "rb");
+#endif
+ if(NULL == ps_piclen_file)
+ {
+ sprintf(ac_error_str, "Could not open piclen file %s",
+ s_app_ctx.ac_piclen_fname);
+ codec_exit(ac_error_str);
+ }
+ }
+
+ /***********************************************************************/
+ /* create the file object for output file */
+ /***********************************************************************/
+ if(1 == s_app_ctx.u4_file_save_flag)
+ {
+#ifdef IOS
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctx.ac_op_fname);
+ ps_op_file = fopen(filename_with_path, "wb");
+#else
+ ps_op_file = fopen(s_app_ctx.ac_op_fname, "wb");
+#endif
+
+ if(NULL == ps_op_file)
+ {
+ sprintf(ac_error_str, "Could not open output file %s",
+ s_app_ctx.ac_op_fname);
+ codec_exit(ac_error_str);
+ }
+ }
+
+ /***********************************************************************/
+ /* create the file object for check sum file */
+ /***********************************************************************/
+ if(1 == s_app_ctx.u4_chksum_save_flag)
+ {
+#if IOS
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctx.ac_op_chksum_fname);
+ ps_op_chksum_file = fopen(filename_with_path, "wb");
+#else
+ ps_op_chksum_file = fopen(s_app_ctx.ac_op_chksum_fname, "wb");
+#endif
+ if(NULL == ps_op_chksum_file)
+ {
+ sprintf(ac_error_str, "Could not open check sum file %s",
+ s_app_ctx.ac_op_chksum_fname);
+ codec_exit(ac_error_str);
+ }
+ }
+ /***********************************************************************/
+ /* Create decoder instance */
+ /***********************************************************************/
+ {
+
+ ps_out_buf = (ivd_out_bufdesc_t *)malloc(sizeof(ivd_out_bufdesc_t));
+
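+        /* Instance creation follows the iv memory-record handshake:
+         * 1) IV_CMD_GET_NUM_MEM_REC  - query how many memory records are
+         *                              needed
+         * 2) IV_CMD_FILL_NUM_MEM_REC - let the codec fill each record's size
+         *                              and alignment requirements
+         * 3) allocate every record with ihevca_aligned_malloc
+         * 4) IV_CMD_INIT             - pass the filled records back to create
+         *                              the decoder instance */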
+ {
+ iv_num_mem_rec_ip_t s_no_of_mem_rec_query_ip;
+ iv_num_mem_rec_op_t s_no_of_mem_rec_query_op;
+
+ s_no_of_mem_rec_query_ip.u4_size = sizeof(s_no_of_mem_rec_query_ip);
+ s_no_of_mem_rec_query_op.u4_size = sizeof(s_no_of_mem_rec_query_op);
+ s_no_of_mem_rec_query_ip.e_cmd = IV_CMD_GET_NUM_MEM_REC;
+
+ /*****************************************************************************/
+ /* API Call: Get Number of Mem Records */
+ /*****************************************************************************/
+ e_dec_status = ivd_cxa_api_function(
+ NULL, (void *)&s_no_of_mem_rec_query_ip,
+ (void *)&s_no_of_mem_rec_query_op);
+ if(IV_SUCCESS != e_dec_status)
+ {
+ sprintf(ac_error_str, "Error in get mem records");
+ codec_exit(ac_error_str);
+ }
+
+ u4_num_mem_recs = s_no_of_mem_rec_query_op.u4_num_mem_rec;
+ }
+
+ pv_mem_rec_location = malloc(u4_num_mem_recs * sizeof(iv_mem_rec_t));
+ if(pv_mem_rec_location == NULL)
+ {
+ sprintf(ac_error_str, "Allocation failure for mem_rec_location");
+ codec_exit(ac_error_str);
+
+ }
+
+ {
+ ihevcd_cxa_fill_mem_rec_ip_t s_fill_mem_rec_ip;
+ ihevcd_cxa_fill_mem_rec_op_t s_fill_mem_rec_op;
+ iv_mem_rec_t *ps_mem_rec;
+ UWORD32 total_size;
+
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.e_cmd =
+ IV_CMD_FILL_NUM_MEM_REC;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.pv_mem_rec_location =
+ (iv_mem_rec_t *)pv_mem_rec_location;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_wd =
+ (s_app_ctx.max_wd == 0) ? MAX_FRAME_WIDTH : s_app_ctx.max_wd;
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_max_frm_ht =
+ (s_app_ctx.max_ht == 0) ? MAX_FRAME_HEIGHT : s_app_ctx.max_ht;
+ s_fill_mem_rec_ip.i4_level = (s_app_ctx.max_level == 0) ? MAX_LEVEL_SUPPORTED : s_app_ctx.max_level;
+ s_fill_mem_rec_ip.u4_num_ref_frames = MAX_REF_FRAMES;
+ s_fill_mem_rec_ip.u4_num_reorder_frames = MAX_REORDER_FRAMES;
+ s_fill_mem_rec_ip.u4_share_disp_buf = s_app_ctx.share_disp_buf;
+ s_fill_mem_rec_ip.e_output_format =
+ (IV_COLOR_FORMAT_T)s_app_ctx.e_output_chroma_format;
+ s_fill_mem_rec_ip.u4_num_extra_disp_buf = EXTRA_DISP_BUFFERS;
+
+ s_fill_mem_rec_ip.s_ivd_fill_mem_rec_ip_t.u4_size =
+ sizeof(ihevcd_cxa_fill_mem_rec_ip_t);
+ s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_size =
+ sizeof(ihevcd_cxa_fill_mem_rec_op_t);
+
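+            /* Set u4_size on every record before the fill call so the      */
+            /* codec knows the iv_mem_rec_t layout in use.                  */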
+ ps_mem_rec = (iv_mem_rec_t *)pv_mem_rec_location;
+ for(i = 0; i < u4_num_mem_recs; i++)
+ ps_mem_rec[i].u4_size = sizeof(iv_mem_rec_t);
+
+ /*****************************************************************************/
+ /* API Call: Fill Mem Records */
+ /*****************************************************************************/
+
+ e_dec_status = ivd_cxa_api_function(NULL,
+ (void *)&s_fill_mem_rec_ip,
+ (void *)&s_fill_mem_rec_op);
+
+ u4_num_mem_recs =
+ s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_num_mem_rec_filled;
+
+ if(IV_SUCCESS != e_dec_status)
+ {
+ sprintf(ac_error_str, "Error in fill mem records: %x", s_fill_mem_rec_op.s_ivd_fill_mem_rec_op_t.u4_error_code);
+ codec_exit(ac_error_str);
+ }
+
+ ps_mem_rec = (iv_mem_rec_t *)pv_mem_rec_location;
+ total_size = 0;
+ for(i = 0; i < u4_num_mem_recs; i++)
+ {
+ ps_mem_rec->pv_base = ihevca_aligned_malloc(ps_mem_rec->u4_mem_alignment,
+ ps_mem_rec->u4_mem_size);
+ if(ps_mem_rec->pv_base == NULL)
+ {
+ sprintf(ac_error_str,
+ "\nAllocation failure for mem record id %d size %d\n",
+ i, ps_mem_rec->u4_mem_size);
+ codec_exit(ac_error_str);
+
+ }
+ total_size += ps_mem_rec->u4_mem_size;
+
+ ps_mem_rec++;
+ }
+ //printf("\nTotal memory for codec %d\n", total_size);
+ }
+ /*****************************************************************************/
+ /* API Call: Initialize the Decoder */
+ /*****************************************************************************/
+ {
+ ihevcd_cxa_init_ip_t s_init_ip;
+ ihevcd_cxa_init_op_t s_init_op;
+ void *fxns = &ivd_cxa_api_function;
+ iv_mem_rec_t *mem_tab;
+
+ mem_tab = (iv_mem_rec_t *)pv_mem_rec_location;
+ s_init_ip.s_ivd_init_ip_t.e_cmd = (IVD_API_COMMAND_TYPE_T)IV_CMD_INIT;
+ s_init_ip.s_ivd_init_ip_t.pv_mem_rec_location = mem_tab;
+ s_init_ip.s_ivd_init_ip_t.u4_frm_max_wd = (s_app_ctx.max_wd == 0) ? MAX_FRAME_WIDTH : s_app_ctx.max_wd;
+ s_init_ip.s_ivd_init_ip_t.u4_frm_max_ht = (s_app_ctx.max_ht == 0) ? MAX_FRAME_HEIGHT : s_app_ctx.max_ht;
+ s_init_ip.i4_level = (s_app_ctx.max_level == 0) ? MAX_LEVEL_SUPPORTED : s_app_ctx.max_level;
+ s_init_ip.u4_num_ref_frames = MAX_REF_FRAMES;
+ s_init_ip.u4_num_reorder_frames = MAX_REORDER_FRAMES;
+ s_init_ip.u4_share_disp_buf = s_app_ctx.share_disp_buf;
+ s_init_ip.u4_num_extra_disp_buf = EXTRA_DISP_BUFFERS;
+ s_init_ip.s_ivd_init_ip_t.u4_num_mem_rec = u4_num_mem_recs;
+ s_init_ip.s_ivd_init_ip_t.e_output_format =
+ (IV_COLOR_FORMAT_T)s_app_ctx.e_output_chroma_format;
+ s_init_ip.s_ivd_init_ip_t.u4_size = sizeof(ihevcd_cxa_init_ip_t);
+ s_init_op.s_ivd_init_op_t.u4_size = sizeof(ihevcd_cxa_init_op_t);
+
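+            /* The first memory record is reserved for the codec handle     */
+            /* itself, so the iv_obj_t is placed in mem_tab[0].             */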
+ codec_obj = (iv_obj_t *)mem_tab[0].pv_base;
+ codec_obj->pv_fxns = fxns;
+ codec_obj->u4_size = sizeof(iv_obj_t);
+
+ s_app_ctx.cocodec_obj = codec_obj;
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_init_ip,
+ (void *)&s_init_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "Error in Init %8x\n",
+ s_init_op.s_ivd_init_op_t.u4_error_code);
+ codec_exit(ac_error_str);
+ }
+
+ /*****************************************************************************/
+ /* Input and output buffer allocation */
+ /*****************************************************************************/
+ {
+
+ ivd_ctl_getbufinfo_ip_t s_ctl_ip;
+ ivd_ctl_getbufinfo_op_t s_ctl_op;
+
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_GETBUFINFO;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_getbufinfo_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_getbufinfo_op_t);
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "Error in Get Buf Info %x", s_ctl_op.u4_error_code);
+ codec_exit(ac_error_str);
+ }
+
+ /* Allocate input buffer */
+ u4_ip_buf_len = s_ctl_op.u4_min_in_buf_size[0];
+ pu1_bs_buf = (UWORD8 *)malloc(u4_ip_buf_len);
+
+ if(pu1_bs_buf == NULL)
+ {
+ sprintf(ac_error_str,
+ "\nAllocation failure for input buffer of size %d",
+ u4_ip_buf_len);
+ codec_exit(ac_error_str);
+ }
+ s_app_ctx.num_disp_buf = s_ctl_op.u4_num_disp_bufs;
+ /* Allocate output buffer only if display buffers are not shared */
+ /* Or if shared and output is 420P */
+ if((0 == s_app_ctx.share_disp_buf) || (IV_YUV_420P == s_app_ctx.e_output_chroma_format))
+ {
+ UWORD32 outlen;
+ ps_out_buf->u4_min_out_buf_size[0] =
+ s_ctl_op.u4_min_out_buf_size[0];
+ ps_out_buf->u4_min_out_buf_size[1] =
+ s_ctl_op.u4_min_out_buf_size[1];
+ ps_out_buf->u4_min_out_buf_size[2] =
+ s_ctl_op.u4_min_out_buf_size[2];
+
+ outlen = s_ctl_op.u4_min_out_buf_size[0];
+ if(s_ctl_op.u4_min_num_out_bufs > 1)
+ outlen += s_ctl_op.u4_min_out_buf_size[1];
+
+ if(s_ctl_op.u4_min_num_out_bufs > 2)
+ outlen += s_ctl_op.u4_min_out_buf_size[2];
+
+ ps_out_buf->pu1_bufs[0] = (UWORD8 *)malloc(outlen);
+ if(ps_out_buf->pu1_bufs[0] == NULL)
+ {
+ sprintf(ac_error_str,
+ "\nAllocation failure for output buffer of size %d",
+ outlen);
+ codec_exit(ac_error_str);
+ }
+
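+                    /* The chroma planes are carved out of the single        */
+                    /* contiguous allocation made for plane 0 above.         */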
+ if(s_ctl_op.u4_min_num_out_bufs > 1)
+ ps_out_buf->pu1_bufs[1] = ps_out_buf->pu1_bufs[0]
+ + (s_ctl_op.u4_min_out_buf_size[0]);
+
+ if(s_ctl_op.u4_min_num_out_bufs > 2)
+ ps_out_buf->pu1_bufs[2] = ps_out_buf->pu1_bufs[1]
+ + (s_ctl_op.u4_min_out_buf_size[1]);
+
+ ps_out_buf->u4_num_bufs = s_ctl_op.u4_min_num_out_bufs;
+ }
+
+ }
+ }
+
+ }
+
+
+ /*************************************************************************/
+ /* set num of cores */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_set_num_cores_ip_t s_ctl_set_cores_ip;
+ ihevcd_cxa_ctl_set_num_cores_op_t s_ctl_set_cores_op;
+
+ s_ctl_set_cores_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_set_cores_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+ s_ctl_set_cores_ip.u4_num_cores = s_app_ctx.u4_num_cores;
+ s_ctl_set_cores_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t);
+ s_ctl_set_cores_op.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_cores_ip,
+ (void *)&s_ctl_set_cores_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "\nError in setting number of cores");
+ codec_exit(ac_error_str);
+ }
+
+ }
+ /*************************************************************************/
+    /* set processor                                                         */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_set_processor_ip_t s_ctl_set_num_processor_ip;
+ ihevcd_cxa_ctl_set_processor_op_t s_ctl_set_num_processor_op;
+
+ s_ctl_set_num_processor_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_set_num_processor_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_PROCESSOR;
+ s_ctl_set_num_processor_ip.u4_arch = s_app_ctx.e_arch;
+ s_ctl_set_num_processor_ip.u4_soc = s_app_ctx.e_soc;
+ s_ctl_set_num_processor_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_ip_t);
+ s_ctl_set_num_processor_op.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_num_processor_ip,
+ (void *)&s_ctl_set_num_processor_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "\nError in setting Processor type");
+ codec_exit(ac_error_str);
+ }
+
+ }
+
+#ifdef GPU_BUILD
+ /*************************************************************************/
+    /* Enable/Disable GPU                                                    */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_gpu_enable_diable_ip_t s_ctl_gpu_cnl_ip;
+ ihevcd_cxa_ctl_gpu_enable_diable_op_t s_ctl_gpu_cnl_op;
+
+ s_ctl_gpu_cnl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_gpu_cnl_ip.e_sub_cmd = IHEVCD_CXA_CMD_CTL_GPU_ENABLE_DISABLE;
+ s_ctl_gpu_cnl_ip.u4_gpu_enable_diable = s_app_ctx.u4_gpu_enable_diable;
+ s_ctl_gpu_cnl_ip.u4_size =
+ sizeof(ihevcd_cxa_ctl_gpu_enable_diable_ip_t);
+ s_ctl_gpu_cnl_op.u4_size =
+ sizeof(ihevcd_cxa_ctl_gpu_enable_diable_op_t);
+
+ ret = ivd_cxa_api_function(codec_obj, (void *)&s_ctl_gpu_cnl_ip,
+ (void *)&s_ctl_gpu_cnl_op);
+ if(ret != IV_SUCCESS)
+ {
+            sprintf(ac_error_str, "\nError enabling/disabling GPU");
+ //codec_exit(ac_error_str);
+
+ }
+
+ }
+#endif
+
+ /*****************************************************************************/
+ /* Decode header to get width and height and buffer sizes */
+ /*****************************************************************************/
+ {
+
+ ivd_ctl_set_config_ip_t s_ctl_ip;
+ ivd_ctl_set_config_op_t s_ctl_op;
+
+ ivd_video_decode_ip_t s_video_decode_ip;
+ ivd_video_decode_op_t s_video_decode_op;
+
+ s_ctl_ip.u4_disp_wd = STRIDE;
+ if(1 == s_app_ctx.display)
+ s_ctl_ip.u4_disp_wd = s_app_ctx.get_stride();
+
+ s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+ s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ s_ctl_ip.e_vid_dec_mode = IVD_DECODE_HEADER;
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str,
+ "\nError in setting the codec in header decode mode");
+ codec_exit(ac_error_str);
+ }
+
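+        /* Feed bitstream chunks in IVD_DECODE_HEADER mode until the stream */
+        /* headers parse successfully (ret == IV_SUCCESS); only then are    */
+        /* pic_wd and pic_ht known for sizing the buffers below.            */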
+ do
+ {
+ WORD32 numbytes;
+ if(0 == s_app_ctx.u4_piclen_flag)
+ {
+ fseek(ps_ip_file, file_pos, SEEK_SET);
+ numbytes = u4_ip_buf_len;
+ }
+ else
+ {
+ WORD32 entries;
+ entries = fscanf(ps_piclen_file, "%d\n", &numbytes);
+ if(1 != entries)
+ numbytes = u4_ip_buf_len;
+ }
+
+ u4_bytes_remaining = fread(pu1_bs_buf, sizeof(UWORD8), numbytes,
+ ps_ip_file);
+
+ if(0 == u4_bytes_remaining)
+ {
+ sprintf(ac_error_str, "\nUnable to read from input file");
+ codec_exit(ac_error_str);
+ }
+
+ s_video_decode_ip.e_cmd = IVD_CMD_VIDEO_DECODE;
+ s_video_decode_ip.u4_ts = u4_ip_frm_ts;
+ s_video_decode_ip.pv_stream_buffer = pu1_bs_buf;
+ s_video_decode_ip.u4_num_Bytes = u4_bytes_remaining;
+ s_video_decode_ip.u4_size = sizeof(ivd_video_decode_ip_t);
+ s_video_decode_op.u4_size = sizeof(ivd_video_decode_op_t);
+
+ /*****************************************************************************/
+ /* API Call: Header Decode */
+ /*****************************************************************************/
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_decode_ip,
+ (void *)&s_video_decode_op);
+
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "\nError in header decode %x",
+ s_video_decode_op.u4_error_code);
+ // codec_exit(ac_error_str);
+ }
+
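+            /* Advance past the bytes the decoder actually consumed so the  */
+            /* next read starts at the first unparsed byte.                 */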
+ u4_num_bytes_dec = s_video_decode_op.u4_num_bytes_consumed;
+#ifndef PROFILE_ENABLE
+ printf("%d\n", s_video_decode_op.u4_num_bytes_consumed);
+#endif
+ file_pos += u4_num_bytes_dec;
+ total_bytes_comsumed += u4_num_bytes_dec;
+ }while(ret != IV_SUCCESS);
+
+ /* copy pic_wd and pic_ht to initialize buffers */
+ s_app_ctx.u4_pic_wd = s_video_decode_op.u4_pic_wd;
+ s_app_ctx.u4_pic_ht = s_video_decode_op.u4_pic_ht;
+
+#if IOS_DISPLAY
+ s_app_ctx.i4_screen_wd = screen_wd;
+ s_app_ctx.i4_screen_ht = screen_ht;
+#endif
+
+ /* Create display thread and wait for the display buffers to be initialized */
+ if(1 == s_app_ctx.display)
+ {
+ if(0 == s_app_ctx.display_thread_created)
+ {
+ s_app_ctx.display_init_done = 0;
+ ithread_create(s_app_ctx.display_thread_handle, NULL,
+ (void *)&display_thread, (void *)&s_app_ctx);
+ s_app_ctx.display_thread_created = 1;
+
+ while(1)
+ {
+ if(s_app_ctx.display_init_done)
+ break;
+
+ ithread_msleep(1);
+ }
+ }
+
+ s_app_ctx.u4_strd = s_app_ctx.get_stride();
+ }
+ }
+
+ /*************************************************************************/
+    /* Get actual number of output buffers required, which is dependent     */
+    /* on stream properties such as width, height, level etc.               */
+ /* This is needed mainly for shared display mode */
+ /*************************************************************************/
+ //if(1 == s_app_ctx.share_disp_buf)
+ {
+ ivd_ctl_getbufinfo_ip_t s_ctl_ip;
+ ivd_ctl_getbufinfo_op_t s_ctl_op;
+ WORD32 outlen = 0;
+
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_GETBUFINFO;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_getbufinfo_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_getbufinfo_op_t);
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "Error in Get Buf Info %x", s_ctl_op.u4_error_code);
+ codec_exit(ac_error_str);
+ }
+
+#ifdef APP_EXTRA_BUFS
+ s_app_ctx.disp_delay = EXTRA_DISP_BUFFERS;
+ s_ctl_op.u4_num_disp_bufs += EXTRA_DISP_BUFFERS;
+#endif
+
+ /*****************************************************************************/
+ /* API Call: Allocate display buffers for display buffer shared case */
+ /*****************************************************************************/
+
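+        /* Each display buffer is one contiguous allocation split into      */
+        /* planes; in shared mode the decoder renders directly into these   */
+        /* application-owned buffers, so they must stay valid for the       */
+        /* lifetime of the instance.                                        */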
+ for(i = 0; i < s_ctl_op.u4_num_disp_bufs; i++)
+ {
+
+ s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[0] =
+ s_ctl_op.u4_min_out_buf_size[0];
+ s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[1] =
+ s_ctl_op.u4_min_out_buf_size[1];
+ s_app_ctx.s_disp_buffers[i].u4_min_out_buf_size[2] =
+ s_ctl_op.u4_min_out_buf_size[2];
+
+ outlen = s_ctl_op.u4_min_out_buf_size[0];
+ if(s_ctl_op.u4_min_num_out_bufs > 1)
+ outlen += s_ctl_op.u4_min_out_buf_size[1];
+
+ if(s_ctl_op.u4_min_num_out_bufs > 2)
+ outlen += s_ctl_op.u4_min_out_buf_size[2];
+
+ s_app_ctx.s_disp_buffers[i].pu1_bufs[0] = (UWORD8 *)malloc(outlen);
+
+ if(s_app_ctx.s_disp_buffers[i].pu1_bufs[0] == NULL)
+ {
+ sprintf(ac_error_str,
+ "\nAllocation failure for output buffer of size %d",
+ outlen);
+ codec_exit(ac_error_str);
+ }
+
+ if(s_ctl_op.u4_min_num_out_bufs > 1)
+ s_app_ctx.s_disp_buffers[i].pu1_bufs[1] =
+ s_app_ctx.s_disp_buffers[i].pu1_bufs[0]
+ + (s_ctl_op.u4_min_out_buf_size[0]);
+
+ if(s_ctl_op.u4_min_num_out_bufs > 2)
+ s_app_ctx.s_disp_buffers[i].pu1_bufs[2] =
+ s_app_ctx.s_disp_buffers[i].pu1_bufs[1]
+ + (s_ctl_op.u4_min_out_buf_size[1]);
+
+ s_app_ctx.s_disp_buffers[i].u4_num_bufs =
+ s_ctl_op.u4_min_num_out_bufs;
+ }
+ s_app_ctx.num_disp_buf = s_ctl_op.u4_num_disp_bufs;
+
+ /*****************************************************************************/
+ /* API Call: Send the allocated display buffers to codec */
+ /*****************************************************************************/
+ {
+ ivd_set_display_frame_ip_t s_set_display_frame_ip;
+ ivd_set_display_frame_op_t s_set_display_frame_op;
+
+ s_set_display_frame_ip.e_cmd = IVD_CMD_SET_DISPLAY_FRAME;
+ s_set_display_frame_ip.u4_size = sizeof(ivd_set_display_frame_ip_t);
+ s_set_display_frame_op.u4_size = sizeof(ivd_set_display_frame_op_t);
+
+ s_set_display_frame_ip.num_disp_bufs = s_app_ctx.num_disp_buf;
+
+ memcpy(&(s_set_display_frame_ip.s_disp_buffer),
+ &(s_app_ctx.s_disp_buffers),
+ s_ctl_op.u4_num_disp_bufs * sizeof(ivd_out_bufdesc_t));
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj,
+ (void *)&s_set_display_frame_ip,
+ (void *)&s_set_display_frame_op);
+
+ if(IV_SUCCESS != ret)
+ {
+ sprintf(ac_error_str, "Error in Set display frame");
+ codec_exit(ac_error_str);
+ }
+
+ }
+
+ }
+
+ /*************************************************************************/
+ /* Get frame dimensions for display buffers such as x_offset,y_offset */
+ /* etc. This information might be needed to set display buffer */
+ /* offsets in case of shared display buffer mode */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_get_frame_dimensions_ip_t s_ctl_get_frame_dimensions_ip;
+ ihevcd_cxa_ctl_get_frame_dimensions_op_t s_ctl_get_frame_dimensions_op;
+
+ s_ctl_get_frame_dimensions_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_get_frame_dimensions_ip.e_sub_cmd =
+ (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_GET_BUFFER_DIMENSIONS;
+ s_ctl_get_frame_dimensions_ip.u4_size =
+ sizeof(ihevcd_cxa_ctl_get_frame_dimensions_ip_t);
+ s_ctl_get_frame_dimensions_op.u4_size =
+ sizeof(ihevcd_cxa_ctl_get_frame_dimensions_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_get_frame_dimensions_ip,
+ (void *)&s_ctl_get_frame_dimensions_op);
+ if(IV_SUCCESS != ret)
+ {
+ sprintf(ac_error_str, "Error in Get buffer Dimensions");
+ codec_exit(ac_error_str);
+ }
+
+/*
+ printf("Frame offsets due to padding\n");
+ printf("s_ctl_get_frame_dimensions_op.x_offset[0] %d s_ctl_get_frame_dimensions_op.y_offset[0] %d\n",
+ s_ctl_get_frame_dimensions_op.u4_x_offset[0],
+ s_ctl_get_frame_dimensions_op.u4_y_offset[0]);
+*/
+ }
+
+
+ /*************************************************************************/
+ /* Get VUI parameters */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_get_vui_params_ip_t s_ctl_get_vui_params_ip;
+ ihevcd_cxa_ctl_get_vui_params_op_t s_ctl_get_vui_params_op;
+
+ s_ctl_get_vui_params_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_get_vui_params_ip.e_sub_cmd =
+ (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_GET_VUI_PARAMS;
+ s_ctl_get_vui_params_ip.u4_size =
+ sizeof(ihevcd_cxa_ctl_get_vui_params_ip_t);
+ s_ctl_get_vui_params_op.u4_size =
+ sizeof(ihevcd_cxa_ctl_get_vui_params_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_get_vui_params_ip,
+ (void *)&s_ctl_get_vui_params_op);
+ if(IV_SUCCESS != ret)
+ {
+ sprintf(ac_error_str, "Error in Get VUI params");
+ //codec_exit(ac_error_str);
+ }
+
+ }
+
+
+ /*************************************************************************/
+ /* Set the decoder in frame decode mode. It was set in header decode */
+ /* mode earlier */
+ /*************************************************************************/
+ {
+
+ ivd_ctl_set_config_ip_t s_ctl_ip;
+ ivd_ctl_set_config_op_t s_ctl_op;
+
+ s_ctl_ip.u4_disp_wd = STRIDE;
+ if(1 == s_app_ctx.display)
+ s_ctl_ip.u4_disp_wd = s_app_ctx.get_stride();
+ s_ctl_ip.e_frm_skip_mode = IVD_SKIP_NONE;
+
+ s_ctl_ip.e_frm_out_mode = IVD_DISPLAY_FRAME_OUT;
+ s_ctl_ip.e_vid_dec_mode = IVD_DECODE_FRAME;
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_SETPARAMS;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_set_config_ip_t);
+
+ s_ctl_op.u4_size = sizeof(ivd_ctl_set_config_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip, (void *)&s_ctl_op);
+
+ if(IV_SUCCESS != ret)
+ {
+ sprintf(ac_error_str, "Error in Set Parameters");
+ //codec_exit(ac_error_str);
+ }
+
+ }
+ /*************************************************************************/
+ /* If required disable deblocking and sao at given level */
+ /*************************************************************************/
+ set_degrade(codec_obj, s_app_ctx.i4_degrade_type, s_app_ctx.i4_degrade_pics);
+#ifdef X86_MSVC
+ QueryPerformanceFrequency(&frequency);
+#endif
+#ifndef PRINT_PICSIZE
+ get_version(codec_obj);
+#endif
+ while(u4_op_frm_ts < (s_app_ctx.u4_max_frm_ts + s_app_ctx.disp_delay))
+ {
+
+#ifdef TEST_FLUSH
+ if(u4_ip_frm_ts == FLUSH_FRM_CNT)
+ {
+ ivd_ctl_flush_ip_t s_ctl_ip;
+ ivd_ctl_flush_op_t s_ctl_op;
+
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_FLUSH;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_flush_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_flush_op_t);
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+
+ if(ret != IV_SUCCESS)
+ {
+ printf("Error in Setting the decoder in flush mode\n");
+ }
+ file_pos = 0;
+
+ fseek(ps_ip_file, file_pos, SEEK_SET);
+
+ }
+#endif
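+        /* For the first num_disp_buf iterations, hand each display buffer  */
+        /* id back to the codec once so it has free buffers to decode into. */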
+ if(u4_ip_frm_ts < s_app_ctx.num_disp_buf)
+ {
+ release_disp_frame(codec_obj, u4_ip_frm_ts);
+ }
+
+
+ /*************************************************************************/
+ /* set num of cores */
+ /*************************************************************************/
+#ifdef DYNAMIC_NUMCORES
+ {
+
+ ihevcd_cxa_ctl_set_num_cores_ip_t s_ctl_set_cores_ip;
+ ihevcd_cxa_ctl_set_num_cores_op_t s_ctl_set_cores_op;
+
+ s_ctl_set_cores_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_set_cores_ip.e_sub_cmd = IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+ s_ctl_set_cores_ip.u4_num_cores = 1 + 3 * (u4_ip_frm_ts % 2);
+ s_ctl_set_cores_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t);
+ s_ctl_set_cores_op.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_cores_ip,
+ (void *)&s_ctl_set_cores_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "\nError in setting number of cores");
+ codec_exit(ac_error_str);
+ }
+
+ }
+#endif
+ /***********************************************************************/
+        /* Seek the file to the start of the current frame; this is the       */
+        /* equivalent of having a parser that reports frame boundaries        */
+ /***********************************************************************/
+ {
+ WORD32 numbytes;
+
+ if(0 == s_app_ctx.u4_piclen_flag)
+ {
+ fseek(ps_ip_file, file_pos, SEEK_SET);
+ numbytes = u4_ip_buf_len;
+ }
+ else
+ {
+ WORD32 entries;
+ entries = fscanf(ps_piclen_file, "%d\n", &numbytes);
+ if(1 != entries)
+ numbytes = u4_ip_buf_len;
+ }
+
+ u4_bytes_remaining = fread(pu1_bs_buf, sizeof(UWORD8),
+ numbytes, ps_ip_file);
+
+ if(u4_bytes_remaining == 0)
+ {
+ if(1 == s_app_ctx.loopback)
+ {
+ file_pos = 0;
+ if(0 == s_app_ctx.u4_piclen_flag)
+ {
+ fseek(ps_ip_file, file_pos, SEEK_SET);
+ numbytes = u4_ip_buf_len;
+ }
+ else
+ {
+ WORD32 entries;
+ entries = fscanf(ps_piclen_file, "%d\n", &numbytes);
+ if(1 != entries)
+ numbytes = u4_ip_buf_len;
+ }
+
+
+ u4_bytes_remaining = fread(pu1_bs_buf, sizeof(UWORD8),
+ numbytes, ps_ip_file);
+ }
+ else
+ break;
+ }
+ }
+
+ /*********************************************************************/
+        /* Following calls can be enabled at different times                 */
+ /*********************************************************************/
+#if ENABLE_DEGRADE
+ if(u4_op_frm_ts >= 10000)
+ disable_deblocking(codec_obj, 4);
+
+ if(u4_op_frm_ts == 30000)
+ enable_deblocking(codec_obj);
+
+ if(u4_op_frm_ts == 10000)
+ enable_skippb_frames(codec_obj);
+
+ if(u4_op_frm_ts == 60000)
+ disable_skippb_frames(codec_obj);
+
+ if(u4_op_frm_ts == 30000)
+ enable_skipb_frames(codec_obj);
+
+ if(u4_op_frm_ts == 60000)
+ disable_skipb_frames(codec_obj);
+#endif
+
+
+ {
+ ivd_video_decode_ip_t s_video_decode_ip;
+ ivd_video_decode_op_t s_video_decode_op;
+#ifdef PROFILE_ENABLE
+ UWORD32 s_elapsed_time;
+ TIMER s_start_timer;
+ TIMER s_end_timer;
+#endif
+
+
+ s_video_decode_ip.e_cmd = IVD_CMD_VIDEO_DECODE;
+ s_video_decode_ip.u4_ts = u4_ip_frm_ts;
+ s_video_decode_ip.pv_stream_buffer = pu1_bs_buf;
+ s_video_decode_ip.u4_num_Bytes = u4_bytes_remaining;
+ s_video_decode_ip.u4_size = sizeof(ivd_video_decode_ip_t);
+ s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[0] =
+ ps_out_buf->u4_min_out_buf_size[0];
+ s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[1] =
+ ps_out_buf->u4_min_out_buf_size[1];
+ s_video_decode_ip.s_out_buffer.u4_min_out_buf_size[2] =
+ ps_out_buf->u4_min_out_buf_size[2];
+
+ s_video_decode_ip.s_out_buffer.pu1_bufs[0] =
+ ps_out_buf->pu1_bufs[0];
+ s_video_decode_ip.s_out_buffer.pu1_bufs[1] =
+ ps_out_buf->pu1_bufs[1];
+ s_video_decode_ip.s_out_buffer.pu1_bufs[2] =
+ ps_out_buf->pu1_bufs[2];
+ s_video_decode_ip.s_out_buffer.u4_num_bufs =
+ ps_out_buf->u4_num_bufs;
+ s_video_decode_op.u4_size = sizeof(ivd_video_decode_op_t);
+
+ /* Get display buffer pointers */
+ if(1 == s_app_ctx.display)
+ {
+ WORD32 wr_idx;
+
+ wr_idx = dispq_producer_dequeue(&s_app_ctx);
+
+ if(s_app_ctx.quit)
+ break;
+
+ s_app_ctx.set_disp_buffers(s_app_ctx.pv_disp_ctx, wr_idx,
+ &s_video_decode_ip.s_out_buffer.pu1_bufs[0],
+ &s_video_decode_ip.s_out_buffer.pu1_bufs[1],
+ &s_video_decode_ip.s_out_buffer.pu1_bufs[2]);
+ }
+
+ /*****************************************************************************/
+ /* API Call: Video Decode */
+ /*****************************************************************************/
+
+ GETTIME(&s_start_timer);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_video_decode_ip,
+ (void *)&s_video_decode_op);
+
+
+ GETTIME(&s_end_timer);
+ ELAPSEDTIME(s_start_timer, s_end_timer, s_elapsed_time, frequency);
+#ifdef PROFILE_ENABLE
+ {
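+                /* Keep a sliding window of the last PEAK_WINDOW_SIZE frame  */
+                /* times; the maximum of the window average approximates the */
+                /* worst-case sustained decode time.                         */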
+ UWORD32 peak_avg, id;
+ u4_tot_cycles += s_elapsed_time;
+ peak_window[peak_window_idx++] = s_elapsed_time;
+ if(peak_window_idx == PEAK_WINDOW_SIZE)
+ peak_window_idx = 0;
+ peak_avg = 0;
+ for(id = 0; id < PEAK_WINDOW_SIZE; id++)
+ {
+ peak_avg += peak_window[id];
+ }
+ peak_avg /= PEAK_WINDOW_SIZE;
+ if(peak_avg > peak_avg_max)
+ peak_avg_max = peak_avg;
+ frm_cnt++;
+
+ printf("FrameNum: %4d TimeTaken(microsec): %6d AvgTime: %6d PeakAvgTimeMax: %6d Output: %2d NumBytes: %6d \n",
+ frm_cnt, s_elapsed_time, u4_tot_cycles / frm_cnt, peak_avg_max, s_video_decode_op.u4_output_present, s_video_decode_op.u4_num_bytes_consumed);
+
+ }
+#ifdef INTEL_CE5300
+ time_consumed += s_elapsed_time;
+ bytes_consumed += s_video_decode_op.u4_num_bytes_consumed;
+ if(!(frm_cnt % (s_app_ctx.fps)))
+ {
+ time_consumed = time_consumed / s_app_ctx.fps;
+ printf("Average decode time(micro sec) for the last second = %6d\n", time_consumed);
+ printf("Average bitrate(kb) for the last second = %6d\n", (bytes_consumed * 8) / 1024);
+ time_consumed = 0;
+ bytes_consumed = 0;
+
+ }
+#endif
+#else
+ printf("%d\n", s_video_decode_op.u4_num_bytes_consumed);
+#endif
+
+ if(ret != IV_SUCCESS)
+ {
+ printf("Error in video Frame decode : ret %x Error %x\n", ret,
+ s_video_decode_op.u4_error_code);
+ }
+
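+            /* A mid-stream resolution change is reported as IVD_RES_CHANGED */
+            /* in the error code: flush out the frames already decoded,      */
+            /* reset the instance, and re-issue the per-instance controls    */
+            /* (num cores, processor type) before decoding continues.        */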
+ if((IV_SUCCESS != ret) &&
+ ((s_video_decode_op.u4_error_code & 0xFF) == IVD_RES_CHANGED))
+ {
+ ivd_ctl_reset_ip_t s_ctl_ip;
+ ivd_ctl_reset_op_t s_ctl_op;
+
+ flush_output(codec_obj, &s_app_ctx, ps_out_buf,
+ pu1_bs_buf, &u4_op_frm_ts,
+ ps_op_file, ps_op_chksum_file,
+ u4_ip_frm_ts, u4_bytes_remaining);
+
+ s_ctl_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_ip.e_sub_cmd = IVD_CMD_CTL_RESET;
+ s_ctl_ip.u4_size = sizeof(ivd_ctl_reset_ip_t);
+ s_ctl_op.u4_size = sizeof(ivd_ctl_reset_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_ip,
+ (void *)&s_ctl_op);
+ if(IV_SUCCESS != ret)
+ {
+ sprintf(ac_error_str, "Error in Reset");
+ codec_exit(ac_error_str);
+ }
+ /*************************************************************************/
+ /* set num of cores */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_set_num_cores_ip_t s_ctl_set_cores_ip;
+ ihevcd_cxa_ctl_set_num_cores_op_t s_ctl_set_cores_op;
+
+ s_ctl_set_cores_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_set_cores_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_NUM_CORES;
+ s_ctl_set_cores_ip.u4_num_cores = s_app_ctx.u4_num_cores;
+ s_ctl_set_cores_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_ip_t);
+ s_ctl_set_cores_op.u4_size = sizeof(ihevcd_cxa_ctl_set_num_cores_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_cores_ip,
+ (void *)&s_ctl_set_cores_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "\nError in setting number of cores");
+ codec_exit(ac_error_str);
+ }
+
+ }
+ /*************************************************************************/
+ /* set processsor */
+ /*************************************************************************/
+ {
+
+ ihevcd_cxa_ctl_set_processor_ip_t s_ctl_set_num_processor_ip;
+ ihevcd_cxa_ctl_set_processor_op_t s_ctl_set_num_processor_op;
+
+ s_ctl_set_num_processor_ip.e_cmd = IVD_CMD_VIDEO_CTL;
+ s_ctl_set_num_processor_ip.e_sub_cmd = (IVD_CONTROL_API_COMMAND_TYPE_T)IHEVCD_CXA_CMD_CTL_SET_PROCESSOR;
+ s_ctl_set_num_processor_ip.u4_arch = s_app_ctx.e_arch;
+ s_ctl_set_num_processor_ip.u4_soc = s_app_ctx.e_soc;
+ s_ctl_set_num_processor_ip.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_ip_t);
+ s_ctl_set_num_processor_op.u4_size = sizeof(ihevcd_cxa_ctl_set_processor_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_ctl_set_num_processor_ip,
+ (void *)&s_ctl_set_num_processor_op);
+ if(ret != IV_SUCCESS)
+ {
+ sprintf(ac_error_str, "\nError in setting Processor type");
+ codec_exit(ac_error_str);
+ }
+
+ }
+ }
+
+
+ if((1 == s_app_ctx.display) &&
+ (1 == s_video_decode_op.u4_output_present))
+ {
+ dispq_producer_queue(&s_app_ctx);
+ }
+
+ if(IV_B_FRAME == s_video_decode_op.e_pic_type)
+ s_app_ctx.b_pic_present |= 1;
+
+ u4_num_bytes_dec = s_video_decode_op.u4_num_bytes_consumed;
+
+ file_pos += u4_num_bytes_dec;
+ total_bytes_comsumed += u4_num_bytes_dec;
+ u4_ip_frm_ts++;
+
+
+ if(1 == s_video_decode_op.u4_output_present)
+ {
+ width = s_video_decode_op.s_disp_frm_buf.u4_y_wd;
+ height = s_video_decode_op.s_disp_frm_buf.u4_y_ht;
+ dump_output(&s_app_ctx, &(s_video_decode_op.s_disp_frm_buf),
+ s_video_decode_op.u4_disp_buf_id, ps_op_file,
+ ps_op_chksum_file,
+ u4_op_frm_ts, s_app_ctx.u4_file_save_flag,
+ s_app_ctx.u4_chksum_save_flag);
+
+ u4_op_frm_ts++;
+ }
+ else
+ {
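+                    /* The IVD_FATALERROR bit flags unrecoverable errors;     */
+                    /* stop feeding input once it is set.                     */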
+ if((s_video_decode_op.u4_error_code >> IVD_FATALERROR) & 1)
+ {
+ printf("Fatal error\n");
+ break;
+ }
+ }
+
+ }
+ }
+
+ /***********************************************************************/
+ /* To get the last decoded frames, call process with NULL input */
+ /***********************************************************************/
+ flush_output(codec_obj, &s_app_ctx, ps_out_buf,
+ pu1_bs_buf, &u4_op_frm_ts,
+ ps_op_file, ps_op_chksum_file,
+ u4_ip_frm_ts, u4_bytes_remaining);
+
+ /* set disp_end flag */
+ s_app_ctx.quit = 1;
+
+
+#ifdef PROFILE_ENABLE
+ printf("Summary\n");
+ printf("Input filename : %s\n", s_app_ctx.ac_ip_fname);
+ printf("Output Width : %-4d\n", width);
+ printf("Output Height : %-4d\n", height);
+
+ if(frm_cnt)
+ {
+        double avg = (double)u4_tot_cycles / frm_cnt;
+        double bytes_avg = (double)total_bytes_comsumed / frm_cnt;
+ double bitrate = (bytes_avg * 8 * s_app_ctx.fps) / 1000000;
+ printf("Bitrate @ %2d fps(mbps) : %-6.2f\n", s_app_ctx.fps, bitrate);
+ printf("Average decode time(micro sec) : %-6d\n", (WORD32)avg);
+ printf("Avg Peak decode time(%2d frames) : %-6d\n", PEAK_WINDOW_SIZE, (WORD32)peak_avg_max);
+ avg = (u4_tot_cycles + u4_tot_fmt_cycles) * 1.0 / frm_cnt;
+
+ if(0 == s_app_ctx.share_disp_buf)
+ printf("FPS achieved (with format conv) : %-3.2f\n", 1000000 / avg);
+ else
+ printf("FPS achieved : %-3.2f\n", 1000000 / avg);
+ }
+#endif
+ /***********************************************************************/
+ /* Clear the decoder, close all the files, free all the memory */
+ /***********************************************************************/
+ if(1 == s_app_ctx.display)
+ {
+ s_app_ctx.display_deinit_flag = 1;
+ /* wait for display to finish */
+ if(s_app_ctx.display_thread_created)
+ {
+ ithread_join(s_app_ctx.display_thread_handle, NULL);
+ }
+ free(s_app_ctx.display_thread_handle);
+ }
+
+ {
+ iv_retrieve_mem_rec_ip_t s_retrieve_dec_ip;
+ iv_retrieve_mem_rec_op_t s_retrieve_dec_op;
+ s_retrieve_dec_ip.pv_mem_rec_location = (iv_mem_rec_t *)pv_mem_rec_location;
+
+ s_retrieve_dec_ip.e_cmd = IV_CMD_RETRIEVE_MEMREC;
+ s_retrieve_dec_ip.u4_size = sizeof(iv_retrieve_mem_rec_ip_t);
+ s_retrieve_dec_op.u4_size = sizeof(iv_retrieve_mem_rec_op_t);
+
+ ret = ivd_cxa_api_function((iv_obj_t *)codec_obj, (void *)&s_retrieve_dec_ip,
+ (void *)&s_retrieve_dec_op);
+
+ if(IV_SUCCESS != ret)
+ {
+ sprintf(ac_error_str, "Error in Retrieve Memrec");
+ codec_exit(ac_error_str);
+ }
+
+ {
+ iv_mem_rec_t *ps_mem_rec;
+ UWORD16 u2_i;
+
+ u4_num_mem_recs = s_retrieve_dec_op.u4_num_mem_rec_filled;
+
+ ps_mem_rec = s_retrieve_dec_ip.pv_mem_rec_location;
+
+ for(u2_i = 0; u2_i < u4_num_mem_recs; u2_i++)
+ {
+ ihevca_aligned_free(ps_mem_rec->pv_base);
+ ps_mem_rec++;
+ }
+ free(s_retrieve_dec_ip.pv_mem_rec_location);
+ }
+
+ }
+ /***********************************************************************/
+ /* Close all the files and free all the memory */
+ /***********************************************************************/
+ {
+ fclose(ps_ip_file);
+
+ if(1 == s_app_ctx.u4_file_save_flag)
+ {
+ fclose(ps_op_file);
+ }
+ if(1 == s_app_ctx.u4_chksum_save_flag)
+ {
+ fclose(ps_op_chksum_file);
+ }
+
+ }
+
+ if(0 == s_app_ctx.share_disp_buf)
+ {
+ free(ps_out_buf->pu1_bufs[0]);
+ }
+
+ for(i = 0; i < s_app_ctx.num_disp_buf; i++)
+ {
+ free(s_app_ctx.s_disp_buffers[i].pu1_bufs[0]);
+ }
+
+ free(ps_out_buf);
+ free(pu1_bs_buf);
+
+ return (0);
+}
diff --git a/test/decoder/test.cfg b/test/decoder/test.cfg
new file mode 100644
index 0000000..036261a
--- /dev/null
+++ b/test/decoder/test.cfg
@@ -0,0 +1,32 @@
+/******************************************************************************
+*
+* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at:
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************************/
+--input \\is0189\ARM\work\reference\HEVC\HM-10.0\bin\crew_720p_2mbps.265
+--save_output 0
+--num_frames -1
+--output E:\hevc_decoder\out.yuv
+--chroma_format YUV_420P
+--share_display_buf 0
+--max_wd 1920
+--max_ht 1080
+--max_level 41
+--num_cores 1
+--loopback 1
+--display 1
+--fps 30
+--arch X86_GENERIC
+--soc GENERIC